From de679b3bbe8b810b01929b1f27665ede5eb54f37 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 24 Mar 2016 10:39:14 -0700 Subject: [PATCH 0001/3220] mm: Export do_munmap The 0-day build bot reports the following build error, seen if SDCARD_FS is built as module. ERROR: "do_munmap" undefined! Fixes: 84a1b7d3d312 ("Included sdcardfs source code for kernel 3.0") Reported-by: Fengguang Wu Signed-off-by: Guenter Roeck --- mm/mmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/mmap.c b/mm/mmap.c index 2b1143008793..ee856266cc61 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2639,6 +2639,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) return 0; } +EXPORT_SYMBOL(do_munmap); int vm_munmap(unsigned long start, size_t len) { From a9c7ddcf0347675f8bc681830d21d27df2ed8cb1 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 25 Nov 2015 16:03:31 -0500 Subject: [PATCH 0002/3220] UPSTREAM: dm: don't save and restore bi_private Device mapper used the field bi_private to point to dm_target_io. However, since kernel 3.15, the bi_private field is unused, and so the targets do not need to save and restore this field. This patch removes code that saves and restores bi_private from dm-cache, dm-snapshot and dm-verity. Change-Id: Ic72905ccb6d58ff94eafaa47ba54b2688d92d3d1 Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer (cherry picked from commit fe3265b180d6282648f03bc6ac3958c733df01c2) --- drivers/md/dm-cache-target.c | 3 --- drivers/md/dm-snap.c | 6 +----- drivers/md/dm-verity.c | 5 +---- 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 2fd4c8296144..5780accffa30 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -118,14 +118,12 @@ static void iot_io_end(struct io_tracker *iot, sector_t len) */ struct dm_hook_info { bio_end_io_t *bi_end_io; - void *bi_private; }; static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, bio_end_io_t *bi_end_io, void *bi_private) { h->bi_end_io = bio->bi_end_io; - h->bi_private = bio->bi_private; bio->bi_end_io = bi_end_io; bio->bi_private = bi_private; @@ -134,7 +132,6 @@ static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) { bio->bi_end_io = h->bi_end_io; - bio->bi_private = h->bi_private; } /*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index c06b74e91cd6..f68d0ae5b198 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -207,7 +207,6 @@ struct dm_snap_pending_exception { */ struct bio *full_bio; bio_end_io_t *full_bio_end_io; - void *full_bio_private; }; /* @@ -1485,10 +1484,8 @@ out: snapshot_bios = bio_list_get(&pe->snapshot_bios); origin_bios = bio_list_get(&pe->origin_bios); full_bio = pe->full_bio; - if (full_bio) { + if (full_bio) full_bio->bi_end_io = pe->full_bio_end_io; - full_bio->bi_private = pe->full_bio_private; - } increment_pending_exceptions_done_count(); up_write(&s->lock); @@ -1605,7 +1602,6 @@ static void start_full_bio(struct dm_snap_pending_exception *pe, pe->full_bio = bio; pe->full_bio_end_io = bio->bi_end_io; - pe->full_bio_private = bio->bi_private; callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, copy_callback, pe); diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index ccf41886ebcf..9e8891507c1c 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -83,9 +83,8 @@ struct dm_verity 
{ struct dm_verity_io { struct dm_verity *v; - /* original values of bio->bi_end_io and bio->bi_private */ + /* original value of bio->bi_end_io */ bio_end_io_t *orig_bi_end_io; - void *orig_bi_private; sector_t block; unsigned n_blocks; @@ -453,7 +452,6 @@ static void verity_finish_io(struct dm_verity_io *io, int error) struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); bio->bi_end_io = io->orig_bi_end_io; - bio->bi_private = io->orig_bi_private; bio->bi_error = error; bio_endio(bio); @@ -566,7 +564,6 @@ static int verity_map(struct dm_target *ti, struct bio *bio) io = dm_per_bio_data(bio, ti->per_bio_data_size); io->v = v; io->orig_bi_end_io = bio->bi_end_io; - io->orig_bi_private = bio->bi_private; io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); io->n_blocks = bio->bi_iter.bi_size >> v->data_dev_block_bits; From ab14138e109c1d292c2eb28bdb8f701cf34a86a0 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 5 Nov 2015 02:02:31 +0000 Subject: [PATCH 0003/3220] UPSTREAM: dm verity: clean up duplicate hashing code Handle dm-verity salting in one place to simplify the code. Change-Id: If923a01dc63ae5123af13ba1b0863b73e33ddf46 Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Signed-off-by: Mike Snitzer (cherry picked from commit 6dbeda3469ced777bc3138ed5918f7ae79670b7b) --- drivers/md/dm-verity.c | 266 +++++++++++++++++++++++------------------ 1 file changed, 149 insertions(+), 117 deletions(-) diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 9e8891507c1c..24517055bd8e 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -172,6 +172,84 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, return block >> (level * v->hash_per_block_bits); } +/* + * Wrapper for crypto_shash_init, which handles verity salting. 
+ */ +static int verity_hash_init(struct dm_verity *v, struct shash_desc *desc) +{ + int r; + + desc->tfm = v->tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + r = crypto_shash_init(desc); + + if (unlikely(r < 0)) { + DMERR("crypto_shash_init failed: %d", r); + return r; + } + + if (likely(v->version >= 1)) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + + if (unlikely(r < 0)) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + } + + return 0; +} + +static int verity_hash_update(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len) +{ + int r = crypto_shash_update(desc, data, len); + + if (unlikely(r < 0)) + DMERR("crypto_shash_update failed: %d", r); + + return r; +} + +static int verity_hash_final(struct dm_verity *v, struct shash_desc *desc, + u8 *digest) +{ + int r; + + if (unlikely(!v->version)) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + } + + r = crypto_shash_final(desc, digest); + + if (unlikely(r < 0)) + DMERR("crypto_shash_final failed: %d", r); + + return r; +} + +static int verity_hash(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len, u8 *digest) +{ + int r; + + r = verity_hash_init(v, desc); + if (unlikely(r < 0)) + return r; + + r = verity_hash_update(v, desc, data, len); + if (unlikely(r < 0)) + return r; + + return verity_hash_final(v, desc, digest); +} + static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, sector_t *hash_block, unsigned *offset) { @@ -252,10 +330,10 @@ out: * If "skip_unverified" is false, unverified buffer is hashed and verified * against current value of io_want_digest(v, io). */ -static int verity_verify_level(struct dm_verity_io *io, sector_t block, - int level, bool skip_unverified) +static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, int level, bool skip_unverified, + u8 *want_digest) { - struct dm_verity *v = io->v; struct dm_buffer *buf; struct buffer_aux *aux; u8 *data; @@ -272,74 +350,71 @@ static int verity_verify_level(struct dm_verity_io *io, sector_t block, aux = dm_bufio_get_aux_data(buf); if (!aux->hash_verified) { - struct shash_desc *desc; - u8 *result; - if (skip_unverified) { r = 1; goto release_ret_r; } - desc = io_hash_desc(v, io); - desc->tfm = v->tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - r = crypto_shash_init(desc); - if (r < 0) { - DMERR("crypto_shash_init failed: %d", r); + r = verity_hash(v, io_hash_desc(v, io), + data, 1 << v->hash_dev_block_bits, + io_real_digest(v, io)); + if (unlikely(r < 0)) goto release_ret_r; - } - if (likely(v->version >= 1)) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - goto release_ret_r; - } - } - - r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - goto release_ret_r; - } - - if (!v->version) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - goto release_ret_r; - } - } - - result = io_real_digest(v, io); - r = crypto_shash_final(desc, result); - if (r < 0) { - DMERR("crypto_shash_final failed: %d", r); - goto release_ret_r; - } - if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { - if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA, - hash_block)) { - r = -EIO; - goto release_ret_r; - } - } else + if 
(likely(memcmp(io_real_digest(v, io), want_digest, + v->digest_size) == 0)) aux->hash_verified = 1; + else if (verity_handle_err(v, + DM_VERITY_BLOCK_TYPE_METADATA, + hash_block)) { + r = -EIO; + goto release_ret_r; + } } data += offset; - - memcpy(io_want_digest(v, io), data, v->digest_size); - - dm_bufio_release(buf); - return 0; + memcpy(want_digest, data, v->digest_size); + r = 0; release_ret_r: dm_bufio_release(buf); - return r; } +/* + * Find a hash for a given block, write it to digest and verify the integrity + * of the hash tree if necessary. + */ +static int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, u8 *digest) +{ + int i; + int r; + + if (likely(v->levels)) { + /* + * First, we try to get the requested hash for + * the current block. If the hash block itself is + * verified, zero is returned. If it isn't, this + * function returns 1 and we fall back to whole + * chain verification. + */ + r = verity_verify_level(v, io, block, 0, true, digest); + if (likely(r <= 0)) + return r; + } + + memcpy(digest, v->root_digest, v->digest_size); + + for (i = v->levels - 1; i >= 0; i--) { + r = verity_verify_level(v, io, block, i, false, digest); + if (unlikely(r)) + return r; + } + + return 0; +} + /* * Verify one "dm_verity_io" structure. */ @@ -349,54 +424,21 @@ static int verity_verify_io(struct dm_verity_io *io) struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); unsigned b; - int i; for (b = 0; b < io->n_blocks; b++) { - struct shash_desc *desc; - u8 *result; int r; unsigned todo; + struct shash_desc *desc = io_hash_desc(v, io); - if (likely(v->levels)) { - /* - * First, we try to get the requested hash for - * the current block. If the hash block itself is - * verified, zero is returned. If it isn't, this - * function returns 0 and we fall back to whole - * chain verification. 
- */ - int r = verity_verify_level(io, io->block + b, 0, true); - if (likely(!r)) - goto test_block_hash; - if (r < 0) - return r; - } - - memcpy(io_want_digest(v, io), v->root_digest, v->digest_size); - - for (i = v->levels - 1; i >= 0; i--) { - int r = verity_verify_level(io, io->block + b, i, false); - if (unlikely(r)) - return r; - } - -test_block_hash: - desc = io_hash_desc(v, io); - desc->tfm = v->tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - r = crypto_shash_init(desc); - if (r < 0) { - DMERR("crypto_shash_init failed: %d", r); + r = verity_hash_for_block(v, io, io->block + b, + io_want_digest(v, io)); + if (unlikely(r < 0)) + return r; + + r = verity_hash_init(v, desc); + if (unlikely(r < 0)) return r; - } - if (likely(v->version >= 1)) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - return r; - } - } todo = 1 << v->data_dev_block_bits; do { u8 *page; @@ -407,37 +449,27 @@ test_block_hash: len = bv.bv_len; if (likely(len >= todo)) len = todo; - r = crypto_shash_update(desc, page + bv.bv_offset, len); + r = verity_hash_update(v, desc, page + bv.bv_offset, + len); kunmap_atomic(page); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); + if (unlikely(r < 0)) return r; - } bio_advance_iter(bio, &io->iter, len); todo -= len; } while (todo); - if (!v->version) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - return r; - } - } - - result = io_real_digest(v, io); - r = crypto_shash_final(desc, result); - if (r < 0) { - DMERR("crypto_shash_final failed: %d", r); + r = verity_hash_final(v, desc, io_real_digest(v, io)); + if (unlikely(r < 0)) return r; - } - if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { - if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, - io->block + b)) - return -EIO; - } + + if (likely(memcmp(io_real_digest(v, io), + io_want_digest(v, io), v->digest_size) == 0)) + continue; + else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, + io->block + b)) + return -EIO; } return 0; From 7475611ceceb9eb0cfd31f44d2c820444e1276d0 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 5 Nov 2015 02:02:32 +0000 Subject: [PATCH 0004/3220] UPSTREAM: dm verity: separate function for parsing opt args Move optional argument parsing into a separate function to make it easier to add more of them without making verity_ctr even longer. 
Change-Id: I9cd9df41c3326824f8cca5764075501987e78a52 Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Signed-off-by: Mike Snitzer (cherry picked from commit 753c1fd02807cb43a1c5d01d75d454054d46bdad) --- drivers/md/dm-verity.c | 71 +++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 28 deletions(-) diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 24517055bd8e..b0a53c3b926d 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -34,6 +34,8 @@ #define DM_VERITY_OPT_LOGGING "ignore_corruption" #define DM_VERITY_OPT_RESTART "restart_on_corruption" +#define DM_VERITY_OPTS_MAX 1 + static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); @@ -721,6 +723,44 @@ static void verity_dtr(struct dm_target *ti) kfree(v); } +static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) +{ + int r; + unsigned argc; + struct dm_target *ti = v->ti; + const char *arg_name; + + static struct dm_arg _args[] = { + {0, DM_VERITY_OPTS_MAX, "Invalid number of feature args"}, + }; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return -EINVAL; + + if (!argc) + return 0; + + do { + arg_name = dm_shift_arg(as); + argc--; + + if (!strcasecmp(arg_name, DM_VERITY_OPT_LOGGING)) { + v->mode = DM_VERITY_MODE_LOGGING; + continue; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_RESTART)) { + v->mode = DM_VERITY_MODE_RESTART; + continue; + } + + ti->error = "Unrecognized verity feature request"; + return -EINVAL; + } while (argc && !r); + + return r; +} + /* * Target parameters: * The current format is version 1. @@ -739,18 +779,13 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) { struct dm_verity *v; struct dm_arg_set as; - const char *opt_string; - unsigned int num, opt_params; + unsigned int num; unsigned long long num_ll; int r; int i; sector_t hash_position; char dummy; - static struct dm_arg _args[] = { - {0, 1, "Invalid number of feature args"}, - }; - v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); if (!v) { ti->error = "Cannot allocate verity structure"; @@ -895,29 +930,9 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) as.argc = argc; as.argv = argv; - r = dm_read_arg_group(_args, &as, &opt_params, &ti->error); - if (r) + r = verity_parse_opt_args(&as, v); + if (r < 0) goto bad; - - while (opt_params) { - opt_params--; - opt_string = dm_shift_arg(&as); - if (!opt_string) { - ti->error = "Not enough feature arguments"; - r = -EINVAL; - goto bad; - } - - if (!strcasecmp(opt_string, DM_VERITY_OPT_LOGGING)) - v->mode = DM_VERITY_MODE_LOGGING; - else if (!strcasecmp(opt_string, DM_VERITY_OPT_RESTART)) - v->mode = DM_VERITY_MODE_RESTART; - else { - ti->error = "Invalid feature arguments"; - r = -EINVAL; - goto bad; - } - } } v->hash_per_block_bits = From 6558fe3cbe58107b887fb449faf7cf0295f65932 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 3 Dec 2015 15:36:00 -0500 Subject: [PATCH 0005/3220] UPSTREAM: dm verity: move dm-verity.c to dm-verity-target.c Prepare for extending dm-verity with an optional object. Follows the naming convention used by other DM targets (e.g. dm-cache and dm-era). 
Change-Id: If6d2f27b290adf14fa77f3745fdc13aaa417c8dc Signed-off-by: Sami Tolvanen Signed-off-by: Mike Snitzer (cherry picked from commit 03045cbafa2d663ad8d0a583ac219d202d824344) --- drivers/md/Makefile | 1 + drivers/md/{dm-verity.c => dm-verity-target.c} | 0 2 files changed, 1 insertion(+) rename drivers/md/{dm-verity.c => dm-verity-target.c} (100%) diff --git a/drivers/md/Makefile b/drivers/md/Makefile index f34979cd141a..94e9f6bb33d1 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -16,6 +16,7 @@ dm-cache-mq-y += dm-cache-policy-mq.o dm-cache-smq-y += dm-cache-policy-smq.o dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-era-y += dm-era-target.o +dm-verity-y += dm-verity-target.o md-mod-y += md.o bitmap.o raid456-y += raid5.o raid5-cache.o diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity-target.c similarity index 100% rename from drivers/md/dm-verity.c rename to drivers/md/dm-verity-target.c From 3ec912f8ef374d0805eaf496535aaf51b159e355 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 3 Dec 2015 16:01:51 -0500 Subject: [PATCH 0006/3220] UPSTREAM: dm verity: factor out structures and functions useful to separate object Prepare for an optional verity object to make use of existing dm-verity structures and functions. Change-Id: Ib14c3834bfed222b33e068908fb5f71a53e1187b Signed-off-by: Sami Tolvanen Signed-off-by: Mike Snitzer (cherry picked from commit ffa393807cd69656d5b6bc9d9622e205071cbab8) --- drivers/md/dm-verity-target.c | 116 +++++----------------------------- drivers/md/dm-verity.h | 112 ++++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 100 deletions(-) create mode 100644 drivers/md/dm-verity.h diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index b0a53c3b926d..7e200ba631fb 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -14,12 +14,10 @@ * access behavior. 
 */ -#include "dm-bufio.h" +#include "dm-verity.h" #include <linux/module.h> -#include <linux/device-mapper.h> #include <linux/reboot.h> -#include <crypto/hash.h> #define DM_MSG_PREFIX "verity" @@ -28,7 +26,6 @@ #define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 -#define DM_VERITY_MAX_LEVELS 63 #define DM_VERITY_MAX_CORRUPTED_ERRS 100 #define DM_VERITY_OPT_LOGGING "ignore_corruption" @@ -40,72 +37,6 @@ static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); -enum verity_mode { - DM_VERITY_MODE_EIO, - DM_VERITY_MODE_LOGGING, - DM_VERITY_MODE_RESTART -}; - -enum verity_block_type { - DM_VERITY_BLOCK_TYPE_DATA, - DM_VERITY_BLOCK_TYPE_METADATA -}; - -struct dm_verity { - struct dm_dev *data_dev; - struct dm_dev *hash_dev; - struct dm_target *ti; - struct dm_bufio_client *bufio; - char *alg_name; - struct crypto_shash *tfm; - u8 *root_digest; /* digest of the root block */ - u8 *salt; /* salt: its size is salt_size */ - unsigned salt_size; - sector_t data_start; /* data offset in 512-byte sectors */ - sector_t hash_start; /* hash start in blocks */ - sector_t data_blocks; /* the number of data blocks */ - sector_t hash_blocks; /* the number of hash blocks */ - unsigned char data_dev_block_bits; /* log2(data blocksize) */ - unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ - unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ - unsigned char levels; /* the number of tree levels */ - unsigned char version; - unsigned digest_size; /* digest size for the current hash algorithm */ - unsigned shash_descsize;/* the size of temporary space for crypto */ - int hash_failed; /* set to 1 if hash of any block failed */ - enum verity_mode mode; /* mode for handling verification errors */ - unsigned corrupted_errs;/* Number of errors for corrupted blocks */ - - struct workqueue_struct *verify_wq; - - /* starting blocks for each tree level. 0 is the lowest level. */ - sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; -}; - -struct dm_verity_io { - struct dm_verity *v; - - /* original value of bio->bi_end_io */ - bio_end_io_t *orig_bi_end_io; - - sector_t block; - unsigned n_blocks; - - struct bvec_iter iter; - - struct work_struct work; - - /* - * Three variably-size fields follow this struct: - * - * u8 hash_desc[v->shash_descsize]; - * u8 real_digest[v->digest_size]; - * u8 want_digest[v->digest_size]; - * - * To access them use: io_hash_desc(), io_real_digest() and io_want_digest(). - */ -}; - struct dm_verity_prefetch_work { struct work_struct work; struct dm_verity *v; @@ -113,21 +44,6 @@ struct dm_verity_prefetch_work { unsigned n_blocks; }; -static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io) -{ - return (struct shash_desc *)(io + 1); -} - -static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io) -{ - return (u8 *)(io + 1) + v->shash_descsize; -} - -static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io) -{ - return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; -} - /* * Auxiliary structure appended to each dm-bufio buffer. If the value * hash_verified is nonzero, hash of the block has been verified.
@@ -236,8 +152,8 @@ static int verity_hash_final(struct dm_verity *v, struct shash_desc *desc, return r; } -static int verity_hash(struct dm_verity *v, struct shash_desc *desc, - const u8 *data, size_t len, u8 *digest) +int verity_hash(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len, u8 *digest) { int r; @@ -325,12 +241,12 @@ out: * Verify hash of a metadata block pertaining to the specified data block * ("block" argument) at a specified level ("level" argument). * - * On successful return, io_want_digest(v, io) contains the hash value for - * a lower tree level or for the data block (if we're at the lowest leve). + * On successful return, verity_io_want_digest(v, io) contains the hash value + * for a lower tree level or for the data block (if we're at the lowest level). * * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned. * If "skip_unverified" is false, unverified buffer is hashed and verified - * against current value of io_want_digest(v, io). + * against current value of verity_io_want_digest(v, io). */ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, sector_t block, int level, bool skip_unverified, @@ -357,13 +273,13 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, goto release_ret_r; } - r = verity_hash(v, io_hash_desc(v, io), + r = verity_hash(v, verity_io_hash_desc(v, io), data, 1 << v->hash_dev_block_bits, - io_real_digest(v, io)); + verity_io_real_digest(v, io)); if (unlikely(r < 0)) goto release_ret_r; - if (likely(memcmp(io_real_digest(v, io), want_digest, + if (likely(memcmp(verity_io_real_digest(v, io), want_digest, v->digest_size) == 0)) aux->hash_verified = 1; else if (verity_handle_err(v, @@ -387,8 +303,8 @@ release_ret_r: * Find a hash for a given block, write it to digest and verify the integrity * of the hash tree if necessary. */ -static int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, - sector_t block, u8 *digest) +int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, u8 *digest) { int i; int r; @@ -430,10 +346,10 @@ static int verity_verify_io(struct dm_verity_io *io) for (b = 0; b < io->n_blocks; b++) { int r; unsigned todo; - struct shash_desc *desc = io_hash_desc(v, io); + struct shash_desc *desc = verity_io_hash_desc(v, io); r = verity_hash_for_block(v, io, io->block + b, - io_want_digest(v, io)); + verity_io_want_digest(v, io)); if (unlikely(r < 0)) return r; @@ -462,12 +378,12 @@ static int verity_verify_io(struct dm_verity_io *io) todo -= len; } while (todo); - r = verity_hash_final(v, desc, io_real_digest(v, io)); + r = verity_hash_final(v, desc, verity_io_real_digest(v, io)); if (unlikely(r < 0)) return r; - if (likely(memcmp(io_real_digest(v, io), - io_want_digest(v, io), v->digest_size) == 0)) + if (likely(memcmp(verity_io_real_digest(v, io), + verity_io_want_digest(v, io), v->digest_size) == 0)) continue; else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, io->block + b)) diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h new file mode 100644 index 000000000000..c7ad4fd05188 --- /dev/null +++ b/drivers/md/dm-verity.h @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2015 Google, Inc. + * + * Author: Mikulas Patocka + * + * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors + * + * This file is released under the GPLv2. 
*/ + +#ifndef DM_VERITY_H +#define DM_VERITY_H + +#include "dm-bufio.h" +#include <linux/device-mapper.h> +#include <crypto/hash.h> + +#define DM_VERITY_MAX_LEVELS 63 + +enum verity_mode { + DM_VERITY_MODE_EIO, + DM_VERITY_MODE_LOGGING, + DM_VERITY_MODE_RESTART +}; + +enum verity_block_type { + DM_VERITY_BLOCK_TYPE_DATA, + DM_VERITY_BLOCK_TYPE_METADATA +}; + +struct dm_verity { + struct dm_dev *data_dev; + struct dm_dev *hash_dev; + struct dm_target *ti; + struct dm_bufio_client *bufio; + char *alg_name; + struct crypto_shash *tfm; + u8 *root_digest; /* digest of the root block */ + u8 *salt; /* salt: its size is salt_size */ + unsigned salt_size; + sector_t data_start; /* data offset in 512-byte sectors */ + sector_t hash_start; /* hash start in blocks */ + sector_t data_blocks; /* the number of data blocks */ + sector_t hash_blocks; /* the number of hash blocks */ + unsigned char data_dev_block_bits; /* log2(data blocksize) */ + unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ + unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ + unsigned char levels; /* the number of tree levels */ + unsigned char version; + unsigned digest_size; /* digest size for the current hash algorithm */ + unsigned shash_descsize;/* the size of temporary space for crypto */ + int hash_failed; /* set to 1 if hash of any block failed */ + enum verity_mode mode; /* mode for handling verification errors */ + unsigned corrupted_errs;/* Number of errors for corrupted blocks */ + + struct workqueue_struct *verify_wq; + + /* starting blocks for each tree level. 0 is the lowest level. */ + sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; +}; + +struct dm_verity_io { + struct dm_verity *v; + + /* original value of bio->bi_end_io */ + bio_end_io_t *orig_bi_end_io; + + sector_t block; + unsigned n_blocks; + + struct bvec_iter iter; + + struct work_struct work; + + /* + * Three variably-size fields follow this struct: + * + * u8 hash_desc[v->shash_descsize]; + * u8 real_digest[v->digest_size]; + * u8 want_digest[v->digest_size]; + * + * To access them use: verity_io_hash_desc(), verity_io_real_digest() + * and verity_io_want_digest(). + */ +}; + +static inline struct shash_desc *verity_io_hash_desc(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (struct shash_desc *)(io + 1); +} + +static inline u8 *verity_io_real_digest(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->shash_descsize; +} + +static inline u8 *verity_io_want_digest(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; +} + +extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len, u8 *digest); + +extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, u8 *digest); + +#endif /* DM_VERITY_H */ From 890b7865c5f5e7e5b5bc6acaa4193d967fb8bea7 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 3 Dec 2015 16:30:36 -0500 Subject: [PATCH 0007/3220] UPSTREAM: dm verity: factor out verity_for_bv_block() verity_for_bv_block() will be re-used by optional dm-verity object.
Change-Id: I80e0f8e7c9f234fce3fbdf21cb05aba3041d7f98 Signed-off-by: Sami Tolvanen Signed-off-by: Mike Snitzer (cherry picked from commit bb4d73ac5e4f0a6c4853f35824f6cb2d396a2f9c) --- drivers/md/dm-verity-target.c | 72 ++++++++++++++++++++++++----------- drivers/md/dm-verity.h | 6 +++ 2 files changed, 55 insertions(+), 23 deletions(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 7e200ba631fb..2b0ee52d1ad8 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -333,19 +333,61 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, return 0; } +/* + * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec + * starting from iter. + */ +int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter *iter, + int (*process)(struct dm_verity *v, + struct dm_verity_io *io, u8 *data, + size_t len)) +{ + unsigned todo = 1 << v->data_dev_block_bits; + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); + + do { + int r; + u8 *page; + unsigned len; + struct bio_vec bv = bio_iter_iovec(bio, *iter); + + page = kmap_atomic(bv.bv_page); + len = bv.bv_len; + + if (likely(len >= todo)) + len = todo; + + r = process(v, io, page + bv.bv_offset, len); + kunmap_atomic(page); + + if (r < 0) + return r; + + bio_advance_iter(bio, iter, len); + todo -= len; + } while (todo); + + return 0; +} + +static int verity_bv_hash_update(struct dm_verity *v, struct dm_verity_io *io, + u8 *data, size_t len) +{ + return verity_hash_update(v, verity_io_hash_desc(v, io), data, len); +} + /* * Verify one "dm_verity_io" structure. */ static int verity_verify_io(struct dm_verity_io *io) { struct dm_verity *v = io->v; - struct bio *bio = dm_bio_from_per_bio_data(io, - v->ti->per_bio_data_size); + struct bvec_iter start; unsigned b; for (b = 0; b < io->n_blocks; b++) { int r; - unsigned todo; struct shash_desc *desc = verity_io_hash_desc(v, io); r = verity_hash_for_block(v, io, io->block + b, @@ -357,26 +399,10 @@ static int verity_verify_io(struct dm_verity_io *io) if (unlikely(r < 0)) return r; - todo = 1 << v->data_dev_block_bits; - do { - u8 *page; - unsigned len; - struct bio_vec bv = bio_iter_iovec(bio, io->iter); - - page = kmap_atomic(bv.bv_page); - len = bv.bv_len; - if (likely(len >= todo)) - len = todo; - r = verity_hash_update(v, desc, page + bv.bv_offset, - len); - kunmap_atomic(page); - - if (unlikely(r < 0)) - return r; - - bio_advance_iter(bio, &io->iter, len); - todo -= len; - } while (todo); + start = io->iter; + r = verity_for_bv_block(v, io, &io->iter, verity_bv_hash_update); + if (unlikely(r < 0)) + return r; r = verity_hash_final(v, desc, verity_io_real_digest(v, io)); if (unlikely(r < 0)) diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index c7ad4fd05188..f5af52df8e38 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -103,6 +103,12 @@ static inline u8 *verity_io_want_digest(struct dm_verity *v, return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; } +extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter *iter, + int (*process)(struct dm_verity *v, + struct dm_verity_io *io, + u8 *data, size_t len)); + extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, const u8 *data, size_t len, u8 *digest); From 83c1827e5887c9288e4ecb7962c6dc63596ad922 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 3 Dec 2015 14:26:30 +0000 Subject: [PATCH 0008/3220] UPSTREAM: dm verity: add 
support for forward error correction Add support for correcting corrupted blocks using Reed-Solomon. This code uses RS(255, N) interleaved across data and hash blocks. Each error-correcting block covers N bytes evenly distributed across the combined total data, so that each byte is a maximum distance away from the others. This makes it possible to recover from several consecutive corrupted blocks with relatively small space overhead. In addition, using verity hashes to locate erasures nearly doubles the effectiveness of error correction. Being able to detect corrupted blocks also improves performance, because only corrupted blocks need to be corrected. For a 2 GiB partition, RS(255, 253) (two parity bytes for each 253-byte block) can correct up to 16 MiB of consecutive corrupted blocks if erasures can be located, and 8 MiB if they cannot, with 16 MiB space overhead. Change-Id: Ife4f8889f7fbf0974bf3ed4be6d3322ae9b4cb0e Signed-off-by: Sami Tolvanen Signed-off-by: Mike Snitzer (cherry picked from commit a739ff3f543afbb4a041c16cd0182c8e8d366e70) --- Documentation/device-mapper/verity.txt | 35 +- drivers/md/Kconfig | 12 + drivers/md/Makefile | 4 + drivers/md/dm-verity-fec.c | 812 +++++++++++++++++++++++++ drivers/md/dm-verity-fec.h | 152 +++++ drivers/md/dm-verity-target.c | 55 +- drivers/md/dm-verity.h | 10 + 7 files changed, 1071 insertions(+), 9 deletions(-) create mode 100644 drivers/md/dm-verity-fec.c create mode 100644 drivers/md/dm-verity-fec.h diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt index e15bc1a0fb98..d602c801ff59 100644 --- a/Documentation/device-mapper/verity.txt +++ b/Documentation/device-mapper/verity.txt @@ -18,11 +18,11 @@ Construction Parameters 0 is the original format used in the Chromium OS. The salt is appended when hashing, digests are stored continuously and - the rest of the block is padded with zeros. + the rest of the block is padded with zeroes. 1 is the current format that should be used for new devices. The salt is prepended when hashing and each digest is - padded with zeros to the power of two. + padded with zeroes to the power of two. <dev> This is the device containing data, the integrity of which needs to be @@ -79,6 +79,32 @@ restart_on_corruption not compatible with ignore_corruption and requires user space support to avoid restart loops. +use_fec_from_device <fec_dev> + Use forward error correction (FEC) to recover from corruption if hash + verification fails. Use encoding data from the specified device. This + may be the same device where data and hash blocks reside, in which case + fec_start must be outside data and hash areas. + + If the encoding data covers additional metadata, it must be accessible + on the hash device after the hash blocks. + + Note: block sizes for data and hash devices must match. Also, if the + verity <dev> is encrypted the <fec_dev> should be too. + +fec_roots <num> + Number of generator roots. This equals the number of parity bytes in + the encoding data. For example, in RS(M, N) encoding, the number of roots + is M-N. + +fec_blocks <num> + The number of encoding data blocks on the FEC device. The block size for + the FEC device is <data_block_size>. + +fec_start <offset> + This is the offset, in blocks, from the start of the + FEC device to the beginning of the encoding data. + + Theory of operation =================== @@ -98,6 +124,11 @@ per-block basis. This allows for a lightweight hash computation on first read into the page cache. Block hashes are stored linearly, aligned to the nearest block size.
+If forward error correction (FEC) support is enabled any recovery of +corrupted data will be verified using the cryptographic hash of the +corresponding data. This is why combining error correction with +integrity checking is essential. + Hash Tree --------- diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 7913fdcfc849..d8b0ab6f3753 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -458,6 +458,18 @@ config DM_VERITY If unsure, say N. +config DM_VERITY_FEC + bool "Verity forward error correction support" + depends on DM_VERITY + select REED_SOLOMON + select REED_SOLOMON_DEC8 + ---help--- + Add forward error correction support to dm-verity. This option + makes it possible to use pre-generated error correction data to + recover from corrupted blocks. + + If unsure, say N. + config DM_SWITCH tristate "Switch target support (EXPERIMENTAL)" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 94e9f6bb33d1..62a65764e8e0 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -64,3 +64,7 @@ obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o endif + +ifeq ($(CONFIG_DM_VERITY_FEC),y) +dm-verity-objs += dm-verity-fec.o +endif diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c new file mode 100644 index 000000000000..88143d36a1d2 --- /dev/null +++ b/drivers/md/dm-verity-fec.c @@ -0,0 +1,812 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * Author: Sami Tolvanen + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include "dm-verity-fec.h" +#include <linux/math64.h> + +#define DM_MSG_PREFIX "verity-fec" + +/* + * If error correction has been configured, returns true. + */ +bool verity_fec_is_enabled(struct dm_verity *v) +{ + return v->fec && v->fec->dev; +} + +/* + * Return a pointer to dm_verity_fec_io after dm_verity_io and its variable + * length fields. + */ +static inline struct dm_verity_fec_io *fec_io(struct dm_verity_io *io) +{ + return (struct dm_verity_fec_io *) verity_io_digest_end(io->v, io); +} + +/* + * Return an interleaved offset for a byte in RS block. + */ +static inline u64 fec_interleave(struct dm_verity *v, u64 offset) +{ + u32 mod; + + mod = do_div(offset, v->fec->rsn); + return offset + mod * (v->fec->rounds << v->data_dev_block_bits); +} + +/* + * Decode an RS block using Reed-Solomon. + */ +static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio, + u8 *data, u8 *fec, int neras) +{ + int i; + uint16_t par[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; + + for (i = 0; i < v->fec->roots; i++) + par[i] = fec[i]; + + return decode_rs8(fio->rs, data, par, v->fec->rsn, NULL, neras, + fio->erasures, 0, NULL); +} + +/* + * Read error-correcting codes for the requested RS block. Returns a pointer + * to the data block. Caller is responsible for releasing buf.
+ */ +static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, + unsigned *offset, struct dm_buffer **buf) +{ + u64 position, block; + u8 *res; + + position = (index + rsb) * v->fec->roots; + block = position >> v->data_dev_block_bits; + *offset = (unsigned)(position - (block << v->data_dev_block_bits)); + + res = dm_bufio_read(v->fec->bufio, v->fec->start + block, buf); + if (unlikely(IS_ERR(res))) { + DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", + v->data_dev->name, (unsigned long long)rsb, + (unsigned long long)(v->fec->start + block), + PTR_ERR(res)); + *buf = NULL; + } + + return res; +} + +/* Loop over each preallocated buffer slot. */ +#define fec_for_each_prealloc_buffer(__i) \ + for (__i = 0; __i < DM_VERITY_FEC_BUF_PREALLOC; __i++) + +/* Loop over each extra buffer slot. */ +#define fec_for_each_extra_buffer(io, __i) \ + for (__i = DM_VERITY_FEC_BUF_PREALLOC; __i < DM_VERITY_FEC_BUF_MAX; __i++) + +/* Loop over each allocated buffer. */ +#define fec_for_each_buffer(io, __i) \ + for (__i = 0; __i < (io)->nbufs; __i++) + +/* Loop over each RS block in each allocated buffer. */ +#define fec_for_each_buffer_rs_block(io, __i, __j) \ + fec_for_each_buffer(io, __i) \ + for (__j = 0; __j < 1 << DM_VERITY_FEC_BUF_RS_BITS; __j++) + +/* + * Return a pointer to the current RS block when called inside + * fec_for_each_buffer_rs_block. + */ +static inline u8 *fec_buffer_rs_block(struct dm_verity *v, + struct dm_verity_fec_io *fio, + unsigned i, unsigned j) +{ + return &fio->bufs[i][j * v->fec->rsn]; +} + +/* + * Return an index to the current RS block when called inside + * fec_for_each_buffer_rs_block. + */ +static inline unsigned fec_buffer_rs_index(unsigned i, unsigned j) +{ + return (i << DM_VERITY_FEC_BUF_RS_BITS) + j; +} + +/* + * Decode all RS blocks from buffers and copy corrected bytes into fio->output + * starting from block_offset. + */ +static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio, + u64 rsb, int byte_index, unsigned block_offset, + int neras) +{ + int r, corrected = 0, res; + struct dm_buffer *buf; + unsigned n, i, offset; + u8 *par, *block; + + par = fec_read_parity(v, rsb, block_offset, &offset, &buf); + if (IS_ERR(par)) + return PTR_ERR(par); + + /* + * Decode the RS blocks we have in bufs. Each RS block results in + * one corrected target byte and consumes fec->roots parity bytes. + */ + fec_for_each_buffer_rs_block(fio, n, i) { + block = fec_buffer_rs_block(v, fio, n, i); + res = fec_decode_rs8(v, fio, block, &par[offset], neras); + if (res < 0) { + dm_bufio_release(buf); + + r = res; + goto error; + } + + corrected += res; + fio->output[block_offset] = block[byte_index]; + + block_offset++; + if (block_offset >= 1 << v->data_dev_block_bits) + goto done; + + /* read the next block when we run out of parity bytes */ + offset += v->fec->roots; + if (offset >= 1 << v->data_dev_block_bits) { + dm_bufio_release(buf); + + par = fec_read_parity(v, rsb, block_offset, &offset, &buf); + if (unlikely(IS_ERR(par))) + return PTR_ERR(par); + } + } +done: + r = corrected; +error: + if (r < 0 && neras) + DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", + v->data_dev->name, (unsigned long long)rsb, r); + else if (r > 0) + DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", + v->data_dev->name, (unsigned long long)rsb, r); + + return r; +} + +/* + * Locate data block erasures using verity hashes. 
+ */ +static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, + u8 *want_digest, u8 *data) +{ + if (unlikely(verity_hash(v, verity_io_hash_desc(v, io), + data, 1 << v->data_dev_block_bits, + verity_io_real_digest(v, io)))) + return 0; + + return memcmp(verity_io_real_digest(v, io), want_digest, + v->digest_size) != 0; +} + +/* + * Read data blocks that are part of the RS block and deinterleave as much as + * fits into buffers. Check for erasure locations if @neras is non-NULL. + */ +static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, + u64 rsb, u64 target, unsigned block_offset, + int *neras) +{ + int i, j, target_index = -1; + struct dm_buffer *buf; + struct dm_bufio_client *bufio; + struct dm_verity_fec_io *fio = fec_io(io); + u64 block, ileaved; + u8 *bbuf, *rs_block; + u8 want_digest[v->digest_size]; + unsigned n, k; + + if (neras) + *neras = 0; + + /* + * read each of the rsn data blocks that are part of the RS block, and + * interleave contents to available bufs + */ + for (i = 0; i < v->fec->rsn; i++) { + ileaved = fec_interleave(v, rsb * v->fec->rsn + i); + + /* + * target is the data block we want to correct, target_index is + * the index of this block within the rsn RS blocks + */ + if (ileaved == target) + target_index = i; + + block = ileaved >> v->data_dev_block_bits; + bufio = v->fec->data_bufio; + + if (block >= v->data_blocks) { + block -= v->data_blocks; + + /* + * blocks outside the area were assumed to contain + * zeros when encoding data was generated + */ + if (unlikely(block >= v->fec->hash_blocks)) + continue; + + block += v->hash_start; + bufio = v->bufio; + } + + bbuf = dm_bufio_read(bufio, block, &buf); + if (unlikely(IS_ERR(bbuf))) { + DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld", + v->data_dev->name, + (unsigned long long)rsb, + (unsigned long long)block, PTR_ERR(bbuf)); + + /* assume the block is corrupted */ + if (neras && *neras <= v->fec->roots) + fio->erasures[(*neras)++] = i; + + continue; + } + + /* locate erasures if the block is on the data device */ + if (bufio == v->fec->data_bufio && + verity_hash_for_block(v, io, block, want_digest) == 0) { + /* + * skip if we have already found the theoretical + * maximum number (i.e. fec->roots) of erasures + */ + if (neras && *neras <= v->fec->roots && + fec_is_erasure(v, io, want_digest, bbuf)) + fio->erasures[(*neras)++] = i; + } + + /* + * deinterleave and copy the bytes that fit into bufs, + * starting from block_offset + */ + fec_for_each_buffer_rs_block(fio, n, j) { + k = fec_buffer_rs_index(n, j) + block_offset; + + if (k >= 1 << v->data_dev_block_bits) + goto done; + + rs_block = fec_buffer_rs_block(v, fio, n, j); + rs_block[i] = bbuf[k]; + } +done: + dm_bufio_release(buf); + } + + return target_index; +} + +/* + * Allocate RS control structure and FEC buffers from preallocated mempools, + * and attempt to allocate as many extra buffers as available. 
+ */ +static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) +{ + unsigned n; + + if (!fio->rs) { + fio->rs = mempool_alloc(v->fec->rs_pool, 0); + if (unlikely(!fio->rs)) { + DMERR("failed to allocate RS"); + return -ENOMEM; + } + } + + fec_for_each_prealloc_buffer(n) { + if (fio->bufs[n]) + continue; + + fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOIO); + if (unlikely(!fio->bufs[n])) { + DMERR("failed to allocate FEC buffer"); + return -ENOMEM; + } + } + + /* try to allocate the maximum number of buffers */ + fec_for_each_extra_buffer(fio, n) { + if (fio->bufs[n]) + continue; + + fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOIO); + /* we can manage with even one buffer if necessary */ + if (unlikely(!fio->bufs[n])) + break; + } + fio->nbufs = n; + + if (!fio->output) { + fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO); + + if (!fio->output) { + DMERR("failed to allocate FEC page"); + return -ENOMEM; + } + } + + return 0; +} + +/* + * Initialize buffers and clear erasures. fec_read_bufs() assumes buffers are + * zeroed before deinterleaving. + */ +static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) +{ + unsigned n; + + fec_for_each_buffer(fio, n) + memset(fio->bufs[n], 0, v->fec->rsn << DM_VERITY_FEC_BUF_RS_BITS); + + memset(fio->erasures, 0, sizeof(fio->erasures)); +} + +/* + * Decode all RS blocks in a single data block and return the target block + * (indicated by @offset) in fio->output. If @use_erasures is non-zero, uses + * hashes to locate erasures. + */ +static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, + struct dm_verity_fec_io *fio, u64 rsb, u64 offset, + bool use_erasures) +{ + int r, neras = 0; + unsigned pos; + + r = fec_alloc_bufs(v, fio); + if (unlikely(r < 0)) + return r; + + for (pos = 0; pos < 1 << v->data_dev_block_bits; ) { + fec_init_bufs(v, fio); + + r = fec_read_bufs(v, io, rsb, offset, pos, + use_erasures ? &neras : NULL); + if (unlikely(r < 0)) + return r; + + r = fec_decode_bufs(v, fio, rsb, r, pos, neras); + if (r < 0) + return r; + + pos += fio->nbufs << DM_VERITY_FEC_BUF_RS_BITS; + } + + /* Always re-validate the corrected block against the expected hash */ + r = verity_hash(v, verity_io_hash_desc(v, io), fio->output, + 1 << v->data_dev_block_bits, + verity_io_real_digest(v, io)); + if (unlikely(r < 0)) + return r; + + if (memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io), + v->digest_size)) { + DMERR_LIMIT("%s: FEC %llu: failed to correct (%d erasures)", + v->data_dev->name, (unsigned long long)rsb, neras); + return -EILSEQ; + } + + return 0; +} + +static int fec_bv_copy(struct dm_verity *v, struct dm_verity_io *io, u8 *data, + size_t len) +{ + struct dm_verity_fec_io *fio = fec_io(io); + + memcpy(data, &fio->output[fio->output_pos], len); + fio->output_pos += len; + + return 0; +} + +/* + * Correct errors in a block. Copies corrected block to dest if non-NULL, + * otherwise to a bio_vec starting from iter. + */ +int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, + enum verity_block_type type, sector_t block, u8 *dest, + struct bvec_iter *iter) +{ + int r; + struct dm_verity_fec_io *fio = fec_io(io); + u64 offset, res, rsb; + + if (!verity_fec_is_enabled(v)) + return -EOPNOTSUPP; + + if (type == DM_VERITY_BLOCK_TYPE_METADATA) + block += v->data_blocks; + + /* + * For RS(M, N), the continuous FEC data is divided into blocks of N + * bytes. 
Since block size may not be divisible by N, the last block + * is zero padded when decoding. + * + * Each byte of the block is covered by a different RS(M, N) code, + * and each code is interleaved over N blocks to make it less likely + * that bursty corruption will leave us in unrecoverable state. + */ + + offset = block << v->data_dev_block_bits; + + res = offset; + div64_u64(res, v->fec->rounds << v->data_dev_block_bits); + + /* + * The base RS block we can feed to the interleaver to find out all + * blocks required for decoding. + */ + rsb = offset - res * (v->fec->rounds << v->data_dev_block_bits); + + /* + * Locating erasures is slow, so attempt to recover the block without + * them first. Do a second attempt with erasures if the corruption is + * bad enough. + */ + r = fec_decode_rsb(v, io, fio, rsb, offset, false); + if (r < 0) { + r = fec_decode_rsb(v, io, fio, rsb, offset, true); + if (r < 0) + return r; + } + + if (dest) + memcpy(dest, fio->output, 1 << v->data_dev_block_bits); + else if (iter) { + fio->output_pos = 0; + r = verity_for_bv_block(v, io, iter, fec_bv_copy); + } + + return r; +} + +/* + * Clean up per-bio data. + */ +void verity_fec_finish_io(struct dm_verity_io *io) +{ + unsigned n; + struct dm_verity_fec *f = io->v->fec; + struct dm_verity_fec_io *fio = fec_io(io); + + if (!verity_fec_is_enabled(io->v)) + return; + + mempool_free(fio->rs, f->rs_pool); + + fec_for_each_prealloc_buffer(n) + mempool_free(fio->bufs[n], f->prealloc_pool); + + fec_for_each_extra_buffer(fio, n) + mempool_free(fio->bufs[n], f->extra_pool); + + mempool_free(fio->output, f->output_pool); +} + +/* + * Initialize per-bio data. + */ +void verity_fec_init_io(struct dm_verity_io *io) +{ + struct dm_verity_fec_io *fio = fec_io(io); + + if (!verity_fec_is_enabled(io->v)) + return; + + fio->rs = NULL; + memset(fio->bufs, 0, sizeof(fio->bufs)); + fio->nbufs = 0; + fio->output = NULL; +} + +/* + * Append feature arguments and values to the status table. 
+ */ +unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz, + char *result, unsigned maxlen) +{ + if (!verity_fec_is_enabled(v)) + return sz; + + DMEMIT(" " DM_VERITY_OPT_FEC_DEV " %s " + DM_VERITY_OPT_FEC_BLOCKS " %llu " + DM_VERITY_OPT_FEC_START " %llu " + DM_VERITY_OPT_FEC_ROOTS " %d", + v->fec->dev->name, + (unsigned long long)v->fec->blocks, + (unsigned long long)v->fec->start, + v->fec->roots); + + return sz; +} + +void verity_fec_dtr(struct dm_verity *v) +{ + struct dm_verity_fec *f = v->fec; + + if (!verity_fec_is_enabled(v)) + goto out; + + mempool_destroy(f->rs_pool); + mempool_destroy(f->prealloc_pool); + mempool_destroy(f->extra_pool); + kmem_cache_destroy(f->cache); + + if (f->data_bufio) + dm_bufio_client_destroy(f->data_bufio); + if (f->bufio) + dm_bufio_client_destroy(f->bufio); + + if (f->dev) + dm_put_device(v->ti, f->dev); +out: + kfree(f); + v->fec = NULL; +} + +static void *fec_rs_alloc(gfp_t gfp_mask, void *pool_data) +{ + struct dm_verity *v = (struct dm_verity *)pool_data; + + return init_rs(8, 0x11d, 0, 1, v->fec->roots); +} + +static void fec_rs_free(void *element, void *pool_data) +{ + struct rs_control *rs = (struct rs_control *)element; + + if (rs) + free_rs(rs); +} + +bool verity_is_fec_opt_arg(const char *arg_name) +{ + return (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_BLOCKS) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_START) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS)); +} + +int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, + unsigned *argc, const char *arg_name) +{ + int r; + struct dm_target *ti = v->ti; + const char *arg_value; + unsigned long long num_ll; + unsigned char num_c; + char dummy; + + if (!*argc) { + ti->error = "FEC feature arguments require a value"; + return -EINVAL; + } + + arg_value = dm_shift_arg(as); + (*argc)--; + + if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) { + r = dm_get_device(ti, arg_value, FMODE_READ, &v->fec->dev); + if (r) { + ti->error = "FEC device lookup failed"; + return r; + } + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_BLOCKS)) { + if (sscanf(arg_value, "%llu%c", &num_ll, &dummy) != 1 || + ((sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) + >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + v->fec->blocks = num_ll; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_START)) { + if (sscanf(arg_value, "%llu%c", &num_ll, &dummy) != 1 || + ((sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) >> + (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_START; + return -EINVAL; + } + v->fec->start = num_ll; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS)) { + if (sscanf(arg_value, "%hhu%c", &num_c, &dummy) != 1 || !num_c || + num_c < (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MAX_RSN) || + num_c > (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_ROOTS; + return -EINVAL; + } + v->fec->roots = num_c; + + } else { + ti->error = "Unrecognized verity FEC feature request"; + return -EINVAL; + } + + return 0; +} + +/* + * Allocate dm_verity_fec for v->fec. Must be called before verity_fec_ctr. 
+ */ +int verity_fec_ctr_alloc(struct dm_verity *v) +{ + struct dm_verity_fec *f; + + f = kzalloc(sizeof(struct dm_verity_fec), GFP_KERNEL); + if (!f) { + v->ti->error = "Cannot allocate FEC structure"; + return -ENOMEM; + } + v->fec = f; + + return 0; +} + +/* + * Validate arguments and preallocate memory. Must be called after arguments + * have been parsed using verity_fec_parse_opt_args. + */ +int verity_fec_ctr(struct dm_verity *v) +{ + struct dm_verity_fec *f = v->fec; + struct dm_target *ti = v->ti; + u64 hash_blocks; + + if (!verity_fec_is_enabled(v)) { + verity_fec_dtr(v); + return 0; + } + + /* + * FEC is computed over data blocks, possible metadata, and + * hash blocks. In other words, FEC covers total of fec_blocks + * blocks consisting of the following: + * + * data blocks | hash blocks | metadata (optional) + * + * We allow metadata after hash blocks to support a use case + * where all data is stored on the same device and FEC covers + * the entire area. + * + * If metadata is included, we require it to be available on the + * hash device after the hash blocks. + */ + + hash_blocks = v->hash_blocks - v->hash_start; + + /* + * Require matching block sizes for data and hash devices for + * simplicity. + */ + if (v->data_dev_block_bits != v->hash_dev_block_bits) { + ti->error = "Block sizes must match to use FEC"; + return -EINVAL; + } + + if (!f->roots) { + ti->error = "Missing " DM_VERITY_OPT_FEC_ROOTS; + return -EINVAL; + } + f->rsn = DM_VERITY_FEC_RSM - f->roots; + + if (!f->blocks) { + ti->error = "Missing " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + + f->rounds = f->blocks; + if (sector_div(f->rounds, f->rsn)) + f->rounds++; + + /* + * Due to optional metadata, f->blocks can be larger than + * data_blocks and hash_blocks combined. + */ + if (f->blocks < v->data_blocks + hash_blocks || !f->rounds) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + + /* + * Metadata is accessed through the hash device, so we require + * it to be large enough. 
+ */ + f->hash_blocks = f->blocks - v->data_blocks; + if (dm_bufio_get_device_size(v->bufio) < f->hash_blocks) { + ti->error = "Hash device is too small for " + DM_VERITY_OPT_FEC_BLOCKS; + return -E2BIG; + } + + f->bufio = dm_bufio_client_create(f->dev->bdev, + 1 << v->data_dev_block_bits, + 1, 0, NULL, NULL); + if (IS_ERR(f->bufio)) { + ti->error = "Cannot initialize FEC bufio client"; + return PTR_ERR(f->bufio); + } + + if (dm_bufio_get_device_size(f->bufio) < + ((f->start + f->rounds * f->roots) >> v->data_dev_block_bits)) { + ti->error = "FEC device is too small"; + return -E2BIG; + } + + f->data_bufio = dm_bufio_client_create(v->data_dev->bdev, + 1 << v->data_dev_block_bits, + 1, 0, NULL, NULL); + if (IS_ERR(f->data_bufio)) { + ti->error = "Cannot initialize FEC data bufio client"; + return PTR_ERR(f->data_bufio); + } + + if (dm_bufio_get_device_size(f->data_bufio) < v->data_blocks) { + ti->error = "Data device is too small"; + return -E2BIG; + } + + /* Preallocate an rs_control structure for each worker thread */ + f->rs_pool = mempool_create(num_online_cpus(), fec_rs_alloc, + fec_rs_free, (void *) v); + if (!f->rs_pool) { + ti->error = "Cannot allocate RS pool"; + return -ENOMEM; + } + + f->cache = kmem_cache_create("dm_verity_fec_buffers", + f->rsn << DM_VERITY_FEC_BUF_RS_BITS, + 0, 0, NULL); + if (!f->cache) { + ti->error = "Cannot create FEC buffer cache"; + return -ENOMEM; + } + + /* Preallocate DM_VERITY_FEC_BUF_PREALLOC buffers for each thread */ + f->prealloc_pool = mempool_create_slab_pool(num_online_cpus() * + DM_VERITY_FEC_BUF_PREALLOC, + f->cache); + if (!f->prealloc_pool) { + ti->error = "Cannot allocate FEC buffer prealloc pool"; + return -ENOMEM; + } + + f->extra_pool = mempool_create_slab_pool(0, f->cache); + if (!f->extra_pool) { + ti->error = "Cannot allocate FEC buffer extra pool"; + return -ENOMEM; + } + + /* Preallocate an output buffer for each thread */ + f->output_pool = mempool_create_kmalloc_pool(num_online_cpus(), + 1 << v->data_dev_block_bits); + if (!f->output_pool) { + ti->error = "Cannot allocate FEC output pool"; + return -ENOMEM; + } + + /* Reserve space for our per-bio data */ + ti->per_bio_data_size += sizeof(struct dm_verity_fec_io); + + return 0; +} diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h new file mode 100644 index 000000000000..7fa0298b995e --- /dev/null +++ b/drivers/md/dm-verity-fec.h @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * Author: Sami Tolvanen + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ */ + +#ifndef DM_VERITY_FEC_H +#define DM_VERITY_FEC_H + +#include "dm-verity.h" +#include + +/* Reed-Solomon(M, N) parameters */ +#define DM_VERITY_FEC_RSM 255 +#define DM_VERITY_FEC_MAX_RSN 253 +#define DM_VERITY_FEC_MIN_RSN 231 /* ~10% space overhead */ + +/* buffers for deinterleaving and decoding */ +#define DM_VERITY_FEC_BUF_PREALLOC 1 /* buffers to preallocate */ +#define DM_VERITY_FEC_BUF_RS_BITS 4 /* 1 << RS blocks per buffer */ +/* we need buffers for at most 1 << block size RS blocks */ +#define DM_VERITY_FEC_BUF_MAX \ + (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS)) + +#define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" +#define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" +#define DM_VERITY_OPT_FEC_START "fec_start" +#define DM_VERITY_OPT_FEC_ROOTS "fec_roots" + +/* configuration */ +struct dm_verity_fec { + struct dm_dev *dev; /* parity data device */ + struct dm_bufio_client *data_bufio; /* for data dev access */ + struct dm_bufio_client *bufio; /* for parity data access */ + sector_t start; /* parity data start in blocks */ + sector_t blocks; /* number of blocks covered */ + sector_t rounds; /* number of interleaving rounds */ + sector_t hash_blocks; /* blocks covered after v->hash_start */ + unsigned char roots; /* number of parity bytes, M-N of RS(M, N) */ + unsigned char rsn; /* N of RS(M, N) */ + mempool_t *rs_pool; /* mempool for fio->rs */ + mempool_t *prealloc_pool; /* mempool for preallocated buffers */ + mempool_t *extra_pool; /* mempool for extra buffers */ + mempool_t *output_pool; /* mempool for output */ + struct kmem_cache *cache; /* cache for buffers */ +}; + +/* per-bio data */ +struct dm_verity_fec_io { + struct rs_control *rs; /* Reed-Solomon state */ + int erasures[DM_VERITY_FEC_MAX_RSN]; /* erasures for decode_rs8 */ + u8 *bufs[DM_VERITY_FEC_BUF_MAX]; /* bufs for deinterleaving */ + unsigned nbufs; /* number of buffers allocated */ + u8 *output; /* buffer for corrected output */ + size_t output_pos; +}; + +#ifdef CONFIG_DM_VERITY_FEC + +/* each feature parameter requires a value */ +#define DM_VERITY_OPTS_FEC 8 + +extern bool verity_fec_is_enabled(struct dm_verity *v); + +extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, + enum verity_block_type type, sector_t block, + u8 *dest, struct bvec_iter *iter); + +extern unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz, + char *result, unsigned maxlen); + +extern void verity_fec_finish_io(struct dm_verity_io *io); +extern void verity_fec_init_io(struct dm_verity_io *io); + +extern bool verity_is_fec_opt_arg(const char *arg_name); +extern int verity_fec_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, unsigned *argc, + const char *arg_name); + +extern void verity_fec_dtr(struct dm_verity *v); + +extern int verity_fec_ctr_alloc(struct dm_verity *v); +extern int verity_fec_ctr(struct dm_verity *v); + +#else /* !CONFIG_DM_VERITY_FEC */ + +#define DM_VERITY_OPTS_FEC 0 + +static inline bool verity_fec_is_enabled(struct dm_verity *v) +{ + return false; +} + +static inline int verity_fec_decode(struct dm_verity *v, + struct dm_verity_io *io, + enum verity_block_type type, + sector_t block, u8 *dest, + struct bvec_iter *iter) +{ + return -EOPNOTSUPP; +} + +static inline unsigned verity_fec_status_table(struct dm_verity *v, + unsigned sz, char *result, + unsigned maxlen) +{ + return sz; +} + +static inline void verity_fec_finish_io(struct dm_verity_io *io) +{ +} + +static inline void verity_fec_init_io(struct dm_verity_io *io) +{ +} + +static inline bool 
verity_is_fec_opt_arg(const char *arg_name) +{ + return false; +} + +static inline int verity_fec_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, + unsigned *argc, + const char *arg_name) +{ + return -EINVAL; +} + +static inline void verity_fec_dtr(struct dm_verity *v) +{ +} + +static inline int verity_fec_ctr_alloc(struct dm_verity *v) +{ + return 0; +} + +static inline int verity_fec_ctr(struct dm_verity *v) +{ + return 0; +} + +#endif /* CONFIG_DM_VERITY_FEC */ + +#endif /* DM_VERITY_FEC_H */ diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 2b0ee52d1ad8..4f90ec2c6b7a 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -15,6 +15,7 @@ */ #include "dm-verity.h" +#include "dm-verity-fec.h" #include #include @@ -31,7 +32,7 @@ #define DM_VERITY_OPT_LOGGING "ignore_corruption" #define DM_VERITY_OPT_RESTART "restart_on_corruption" -#define DM_VERITY_OPTS_MAX 1 +#define DM_VERITY_OPTS_MAX (1 + DM_VERITY_OPTS_FEC) static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; @@ -282,6 +283,10 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, if (likely(memcmp(verity_io_real_digest(v, io), want_digest, v->digest_size) == 0)) aux->hash_verified = 1; + else if (verity_fec_decode(v, io, + DM_VERITY_BLOCK_TYPE_METADATA, + hash_block, data, NULL) == 0) + aux->hash_verified = 1; else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA, hash_block)) { @@ -411,8 +416,11 @@ static int verity_verify_io(struct dm_verity_io *io) if (likely(memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io), v->digest_size) == 0)) continue; + else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, + io->block + b, NULL, &start) == 0) + continue; else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, - io->block + b)) + io->block + b)) return -EIO; } @@ -430,6 +438,8 @@ static void verity_finish_io(struct dm_verity_io *io, int error) bio->bi_end_io = io->orig_bi_end_io; bio->bi_error = error; + verity_fec_finish_io(io); + bio_endio(bio); } @@ -444,7 +454,7 @@ static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_error && !verity_fec_is_enabled(io->v)) { verity_finish_io(io, bio->bi_error); return; } @@ -547,6 +557,8 @@ static int verity_map(struct dm_target *ti, struct bio *bio) bio->bi_private = io; io->iter = bio->bi_iter; + verity_fec_init_io(io); + verity_submit_prefetch(v, io); generic_make_request(bio); @@ -561,6 +573,7 @@ static void verity_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { struct dm_verity *v = ti->private; + unsigned args = 0; unsigned sz = 0; unsigned x; @@ -587,8 +600,15 @@ static void verity_status(struct dm_target *ti, status_type_t type, else for (x = 0; x < v->salt_size; x++) DMEMIT("%02x", v->salt[x]); + if (v->mode != DM_VERITY_MODE_EIO) + args++; + if (verity_fec_is_enabled(v)) + args += DM_VERITY_OPTS_FEC; + if (!args) + return; + DMEMIT(" %u", args); if (v->mode != DM_VERITY_MODE_EIO) { - DMEMIT(" 1 "); + DMEMIT(" "); switch (v->mode) { case DM_VERITY_MODE_LOGGING: DMEMIT(DM_VERITY_OPT_LOGGING); @@ -600,6 +620,7 @@ static void verity_status(struct dm_target *ti, status_type_t type, BUG(); } } + sz = verity_fec_status_table(v, sz, result, maxlen); break; } } @@ -662,6 +683,8 @@ static void verity_dtr(struct dm_target *ti) if (v->data_dev) dm_put_device(ti, v->data_dev); + verity_fec_dtr(v); + kfree(v); } @@ -694,6 +717,12 @@ 
static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) } else if (!strcasecmp(arg_name, DM_VERITY_OPT_RESTART)) { v->mode = DM_VERITY_MODE_RESTART; continue; + + } else if (verity_is_fec_opt_arg(arg_name)) { + r = verity_fec_parse_opt_args(as, v, &argc, arg_name); + if (r) + return r; + continue; } ti->error = "Unrecognized verity feature request"; @@ -736,6 +765,10 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->private = v; v->ti = ti; + r = verity_fec_ctr_alloc(v); + if (r) + goto bad; + if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) { ti->error = "Device must be readonly"; r = -EINVAL; @@ -924,8 +957,6 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - ti->per_bio_data_size = roundup(sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2, __alignof__(struct dm_verity_io)); - /* WQ_UNBOUND greatly improves performance when running on ramdisk */ v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); if (!v->verify_wq) { @@ -934,6 +965,16 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } + ti->per_bio_data_size = sizeof(struct dm_verity_io) + + v->shash_descsize + v->digest_size * 2; + + r = verity_fec_ctr(v); + if (r) + goto bad; + + ti->per_bio_data_size = roundup(ti->per_bio_data_size, + __alignof__(struct dm_verity_io)); + return 0; bad: @@ -944,7 +985,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index f5af52df8e38..8e853722f6c6 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -29,6 +29,8 @@ enum verity_block_type { DM_VERITY_BLOCK_TYPE_METADATA }; +struct dm_verity_fec; + struct dm_verity { struct dm_dev *data_dev; struct dm_dev *hash_dev; @@ -58,6 +60,8 @@ struct dm_verity { /* starting blocks for each tree level. 0 is the lowest level. */ sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; + + struct dm_verity_fec *fec; /* forward error correction */ }; struct dm_verity_io { @@ -103,6 +107,12 @@ static inline u8 *verity_io_want_digest(struct dm_verity *v, return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; } +static inline u8 *verity_io_digest_end(struct dm_verity *v, + struct dm_verity_io *io) +{ + return verity_io_want_digest(v, io) + v->digest_size; +} + extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, struct bvec_iter *iter, int (*process)(struct dm_verity *v, From f75998fe8443541f599455aba93871be1e124b9a Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 3 Dec 2015 14:26:31 +0000 Subject: [PATCH 0009/3220] UPSTREAM: dm verity: add ignore_zero_blocks feature If ignore_zero_blocks is enabled dm-verity will return zeroes for blocks matching a zero hash without validating the content. 
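As a usage sketch (not part of the upstream description; the device names, sizes, root digest and salt below are hypothetical placeholders), the feature is enabled by passing ignore_zero_blocks as an optional argument at the end of the verity target line:

  dmsetup create vroot --readonly --table \
    "0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 \
     $ROOT_DIGEST $SALT 1 ignore_zero_blocks"

The trailing "1" is the optional-argument count already defined by the verity table format; 262144 data blocks of 4096 bytes correspond to the 2097152 512-byte sectors mapped by the target.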
Change-Id: I728fa4b2586b29f2793ea5cb014289892819d249 Signed-off-by: Sami Tolvanen Signed-off-by: Mike Snitzer (cherry picked from commit 0cc37c2df4fa0aa702f9662edce4b7ce12c86b7a) --- Documentation/device-mapper/verity.txt | 5 ++ drivers/md/dm-verity-fec.c | 8 ++- drivers/md/dm-verity-target.c | 87 +++++++++++++++++++++++--- drivers/md/dm-verity.h | 3 +- 4 files changed, 93 insertions(+), 10 deletions(-) diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt index d602c801ff59..89fd8f9a259f 100644 --- a/Documentation/device-mapper/verity.txt +++ b/Documentation/device-mapper/verity.txt @@ -79,6 +79,11 @@ restart_on_corruption not compatible with ignore_corruption and requires user space support to avoid restart loops. +ignore_zero_blocks + Do not verify blocks that are expected to contain zeroes and always return + zeroes instead. This may be useful if the partition contains unused blocks + that are not guaranteed to contain zeroes. + use_fec_from_device Use forward error correction (FEC) to recover from corruption if hash verification fails. Use encoding data from the specified device. This diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 88143d36a1d2..1cc10c4de701 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -205,6 +205,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, u64 rsb, u64 target, unsigned block_offset, int *neras) { + bool is_zero; int i, j, target_index = -1; struct dm_buffer *buf; struct dm_bufio_client *bufio; @@ -264,7 +265,12 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, /* locate erasures if the block is on the data device */ if (bufio == v->fec->data_bufio && - verity_hash_for_block(v, io, block, want_digest) == 0) { + verity_hash_for_block(v, io, block, want_digest, + &is_zero) == 0) { + /* skip known zero blocks entirely */ + if (is_zero) + continue; + /* * skip if we have already found the theoretical * maximum number (i.e. fec->roots) of erasures diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 4f90ec2c6b7a..5c5d30cb6ec5 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -31,8 +31,9 @@ #define DM_VERITY_OPT_LOGGING "ignore_corruption" #define DM_VERITY_OPT_RESTART "restart_on_corruption" +#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" -#define DM_VERITY_OPTS_MAX (1 + DM_VERITY_OPTS_FEC) +#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC) static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; @@ -309,10 +310,9 @@ release_ret_r: * of the hash tree if necessary. 
*/ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, - sector_t block, u8 *digest) + sector_t block, u8 *digest, bool *is_zero) { - int i; - int r; + int r = 0, i; if (likely(v->levels)) { /* @@ -324,7 +324,7 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, */ r = verity_verify_level(v, io, block, 0, true, digest); if (likely(r <= 0)) - return r; + goto out; } memcpy(digest, v->root_digest, v->digest_size); @@ -332,10 +332,15 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, for (i = v->levels - 1; i >= 0; i--) { r = verity_verify_level(v, io, block, i, false, digest); if (unlikely(r)) - return r; + goto out; } +out: + if (!r && v->zero_digest) + *is_zero = !memcmp(v->zero_digest, digest, v->digest_size); + else + *is_zero = false; - return 0; + return r; } /* @@ -382,11 +387,19 @@ static int verity_bv_hash_update(struct dm_verity *v, struct dm_verity_io *io, return verity_hash_update(v, verity_io_hash_desc(v, io), data, len); } +static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, + u8 *data, size_t len) +{ + memset(data, 0, len); + return 0; +} + /* * Verify one "dm_verity_io" structure. */ static int verity_verify_io(struct dm_verity_io *io) { + bool is_zero; struct dm_verity *v = io->v; struct bvec_iter start; unsigned b; @@ -396,10 +409,24 @@ static int verity_verify_io(struct dm_verity_io *io) struct shash_desc *desc = verity_io_hash_desc(v, io); r = verity_hash_for_block(v, io, io->block + b, - verity_io_want_digest(v, io)); + verity_io_want_digest(v, io), + &is_zero); if (unlikely(r < 0)) return r; + if (is_zero) { + /* + * If we expect a zero block, don't validate, just + * return zeros. + */ + r = verity_for_bv_block(v, io, &io->iter, + verity_bv_zero); + if (unlikely(r < 0)) + return r; + + continue; + } + r = verity_hash_init(v, desc); if (unlikely(r < 0)) return r; @@ -604,6 +631,8 @@ static void verity_status(struct dm_target *ti, status_type_t type, args++; if (verity_fec_is_enabled(v)) args += DM_VERITY_OPTS_FEC; + if (v->zero_digest) + args++; if (!args) return; DMEMIT(" %u", args); @@ -620,6 +649,8 @@ static void verity_status(struct dm_target *ti, status_type_t type, BUG(); } } + if (v->zero_digest) + DMEMIT(" " DM_VERITY_OPT_IGN_ZEROES); sz = verity_fec_status_table(v, sz, result, maxlen); break; } @@ -671,6 +702,7 @@ static void verity_dtr(struct dm_target *ti) kfree(v->salt); kfree(v->root_digest); + kfree(v->zero_digest); if (v->tfm) crypto_free_shash(v->tfm); @@ -688,6 +720,37 @@ static void verity_dtr(struct dm_target *ti) kfree(v); } +static int verity_alloc_zero_digest(struct dm_verity *v) +{ + int r = -ENOMEM; + struct shash_desc *desc; + u8 *zero_data; + + v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); + + if (!v->zero_digest) + return r; + + desc = kmalloc(v->shash_descsize, GFP_KERNEL); + + if (!desc) + return r; /* verity_dtr will free zero_digest */ + + zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL); + + if (!zero_data) + goto out; + + r = verity_hash(v, desc, zero_data, 1 << v->data_dev_block_bits, + v->zero_digest); + +out: + kfree(desc); + kfree(zero_data); + + return r; +} + static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) { int r; @@ -718,6 +781,14 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) v->mode = DM_VERITY_MODE_RESTART; continue; + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) { + r = verity_alloc_zero_digest(v); + if (r) { + ti->error = "Cannot allocate zero 
digest"; + return r; + } + continue; + } else if (verity_is_fec_opt_arg(arg_name)) { r = verity_fec_parse_opt_args(as, v, &argc, arg_name); if (r) diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 8e853722f6c6..fb419f422d73 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -40,6 +40,7 @@ struct dm_verity { struct crypto_shash *tfm; u8 *root_digest; /* digest of the root block */ u8 *salt; /* salt: its size is salt_size */ + u8 *zero_digest; /* digest for a zero block */ unsigned salt_size; sector_t data_start; /* data offset in 512-byte sectors */ sector_t hash_start; /* hash start in blocks */ @@ -123,6 +124,6 @@ extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, const u8 *data, size_t len, u8 *digest); extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, - sector_t block, u8 *digest); + sector_t block, u8 *digest, bool *is_zero); #endif /* DM_VERITY_H */ From fcd614efb6c4dd4c4f543feefa6be7f8e49dcc09 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 16 Dec 2015 16:23:49 +0000 Subject: [PATCH 0010/3220] ANDROID: android: base-cfg: enable CONFIG_DM_VERITY_FEC Bug: 21893453 Change-Id: Idd0dfe4e3e527df2eff2f0d734effc40dce294c7 Signed-off-by: Sami Tolvanen (cherry picked from commit 9408350ed80005174918ce5147490035b2cf451b) --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index a4cffb1840d7..220ddb5304df 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -21,6 +21,7 @@ CONFIG_CGROUP_SCHED=y CONFIG_CP15_BARRIER_EMULATION=y CONFIG_DM_CRYPT=y CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y CONFIG_EMBEDDED=y CONFIG_FB=y CONFIG_HIGH_RES_TIMERS=y From e7d40d2bf8b2991f051734dd45a95385644f8147 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 30 Mar 2016 14:10:13 -0700 Subject: [PATCH 0011/3220] ANDROID: dm verity fec: add sysfs attribute fec/corrected Add a sysfs entry that allows user space to determine whether dm-verity has come across correctable errors on the underlying block device. 
Bug: 22655252 Bug: 27928374 Change-Id: I80547a2aa944af2fb9ffde002650482877ade31b Signed-off-by: Sami Tolvanen (cherry picked from commit 7911fad5f0a2cf5afc2215657219a21e6630e001) --- drivers/md/dm-verity-fec.c | 45 +++++++++++++++++++++++++++++++++++++- drivers/md/dm-verity-fec.h | 3 +++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 1cc10c4de701..ad10d6d8ed28 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -11,6 +11,7 @@ #include "dm-verity-fec.h" #include +#include #define DM_MSG_PREFIX "verity-fec" @@ -175,9 +176,11 @@ error: if (r < 0 && neras) DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", v->data_dev->name, (unsigned long long)rsb, r); - else if (r > 0) + else if (r > 0) { DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", v->data_dev->name, (unsigned long long)rsb, r); + atomic_add_unless(&v->fec->corrected, 1, INT_MAX); + } return r; } @@ -548,6 +551,7 @@ unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz, void verity_fec_dtr(struct dm_verity *v) { struct dm_verity_fec *f = v->fec; + struct kobject *kobj = &f->kobj_holder.kobj; if (!verity_fec_is_enabled(v)) goto out; @@ -564,6 +568,12 @@ void verity_fec_dtr(struct dm_verity *v) if (f->dev) dm_put_device(v->ti, f->dev); + + if (kobj->state_initialized) { + kobject_put(kobj); + wait_for_completion(dm_get_completion_from_kobject(kobj)); + } + out: kfree(f); v->fec = NULL; @@ -652,6 +662,27 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, return 0; } +static ssize_t corrected_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct dm_verity_fec *f = container_of(kobj, struct dm_verity_fec, + kobj_holder.kobj); + + return sprintf(buf, "%d\n", atomic_read(&f->corrected)); +} + +static struct kobj_attribute attr_corrected = __ATTR_RO(corrected); + +static struct attribute *fec_attrs[] = { + &attr_corrected.attr, + NULL +}; + +static struct kobj_type fec_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = fec_attrs +}; + /* * Allocate dm_verity_fec for v->fec. Must be called before verity_fec_ctr. */ @@ -675,8 +706,10 @@ int verity_fec_ctr_alloc(struct dm_verity *v) */ int verity_fec_ctr(struct dm_verity *v) { + int r; struct dm_verity_fec *f = v->fec; struct dm_target *ti = v->ti; + struct mapped_device *md = dm_table_get_md(ti->table); u64 hash_blocks; if (!verity_fec_is_enabled(v)) { @@ -684,6 +717,16 @@ int verity_fec_ctr(struct dm_verity *v) return 0; } + /* Create a kobject and sysfs attributes */ + init_completion(&f->kobj_holder.completion); + + r = kobject_init_and_add(&f->kobj_holder.kobj, &fec_ktype, + &disk_to_dev(dm_disk(md))->kobj, "%s", "fec"); + if (r) { + ti->error = "Cannot create kobject"; + return r; + } + /* * FEC is computed over data blocks, possible metadata, and * hash blocks. 
In other words, FEC covers total of fec_blocks diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 7fa0298b995e..8c4bee052a73 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -12,6 +12,7 @@ #ifndef DM_VERITY_FEC_H #define DM_VERITY_FEC_H +#include "dm.h" #include "dm-verity.h" #include @@ -48,6 +49,8 @@ struct dm_verity_fec { mempool_t *extra_pool; /* mempool for extra buffers */ mempool_t *output_pool; /* mempool for output */ struct kmem_cache *cache; /* cache for buffers */ + atomic_t corrected; /* corrected errors */ + struct dm_kobject_holder kobj_holder; /* for sysfs attributes */ }; /* per-bio data */ From 2ebb4d97e0c9afeaa45c92f6df2d169d25b6db0b Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 4 Feb 2015 13:54:48 -0800 Subject: [PATCH 0012/3220] cpufreq: interactive: fix policy locking cpufreq_interactive_speedchange_task() is running as a separate kernel thread and is calling __cpufreq_driver_target(), which requires callers to hold policy->rwsem for writing to prevent racing with other parts of the kernel trying to adjust the frequency, for example kernel thermal throttling. Let's change the code to take policy->rwsem and while at it refactor the code a bit. This was originally 2 changes reviewed at: https://chromium-review.googlesource.com/246273 https://chromium-review.googlesource.com/256120 Change-Id: Icc2d97c6c1b929acd2ee32e8c81d81fd2af778ab Signed-off-by: Dmitry Torokhov Reviewed-by: Dylan Reid Reviewed-by: Douglas Anderson Signed-off-by: Dmitry Torokhov --- drivers/cpufreq/cpufreq_interactive.c | 99 ++++++++++++++++----------- 1 file changed, 60 insertions(+), 39 deletions(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index 0be66df4a6e6..5224e897d460 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -511,6 +511,58 @@ static void cpufreq_interactive_idle_end(void) up_read(&pcpu->enable_sem); } +static void cpufreq_interactive_get_policy_info(struct cpufreq_policy *policy, + unsigned int *pmax_freq, + u64 *phvt, u64 *pfvt) +{ + struct cpufreq_interactive_cpuinfo *pcpu; + unsigned int max_freq = 0; + u64 hvt = ~0ULL, fvt = 0; + unsigned int i; + + for_each_cpu(i, policy->cpus) { + pcpu = &per_cpu(cpuinfo, i); + + fvt = max(fvt, pcpu->loc_floor_val_time); + if (pcpu->target_freq > max_freq) { + max_freq = pcpu->target_freq; + hvt = pcpu->loc_hispeed_val_time; + } else if (pcpu->target_freq == max_freq) { + hvt = min(hvt, pcpu->loc_hispeed_val_time); + } + } + + *pmax_freq = max_freq; + *phvt = hvt; + *pfvt = fvt; +} + +static void cpufreq_interactive_adjust_cpu(unsigned int cpu, + struct cpufreq_policy *policy) +{ + struct cpufreq_interactive_cpuinfo *pcpu; + u64 hvt, fvt; + unsigned int max_freq; + int i; + + cpufreq_interactive_get_policy_info(policy, &max_freq, &hvt, &fvt); + + for_each_cpu(i, policy->cpus) { + pcpu = &per_cpu(cpuinfo, i); + pcpu->pol_floor_val_time = fvt; + } + + if (max_freq != policy->cur) { + __cpufreq_driver_target(policy, max_freq, CPUFREQ_RELATION_H); + for_each_cpu(i, policy->cpus) { + pcpu = &per_cpu(cpuinfo, i); + pcpu->pol_hispeed_val_time = hvt; + } + } + + trace_cpufreq_interactive_setspeed(cpu, max_freq, policy->cur); +} + static int cpufreq_interactive_speedchange_task(void *data) { unsigned int cpu; @@ -539,49 +591,18 @@ static int cpufreq_interactive_speedchange_task(void *data) spin_unlock_irqrestore(&speedchange_cpumask_lock, flags); for_each_cpu(cpu, &tmp_mask) { - unsigned int j; - 
unsigned int max_freq = 0; - struct cpufreq_interactive_cpuinfo *pjcpu; - u64 hvt = ~0ULL, fvt = 0; - pcpu = &per_cpu(cpuinfo, cpu); - if (!down_read_trylock(&pcpu->enable_sem)) - continue; - if (!pcpu->governor_enabled) { + + down_write(&pcpu->policy->rwsem); + + if (likely(down_read_trylock(&pcpu->enable_sem))) { + if (likely(pcpu->governor_enabled)) + cpufreq_interactive_adjust_cpu(cpu, + pcpu->policy); up_read(&pcpu->enable_sem); - continue; } - for_each_cpu(j, pcpu->policy->cpus) { - pjcpu = &per_cpu(cpuinfo, j); - - fvt = max(fvt, pjcpu->loc_floor_val_time); - if (pjcpu->target_freq > max_freq) { - max_freq = pjcpu->target_freq; - hvt = pjcpu->loc_hispeed_val_time; - } else if (pjcpu->target_freq == max_freq) { - hvt = min(hvt, pjcpu->loc_hispeed_val_time); - } - } - for_each_cpu(j, pcpu->policy->cpus) { - pjcpu = &per_cpu(cpuinfo, j); - pjcpu->pol_floor_val_time = fvt; - } - - if (max_freq != pcpu->policy->cur) { - __cpufreq_driver_target(pcpu->policy, - max_freq, - CPUFREQ_RELATION_H); - for_each_cpu(j, pcpu->policy->cpus) { - pjcpu = &per_cpu(cpuinfo, j); - pjcpu->pol_hispeed_val_time = hvt; - } - } - trace_cpufreq_interactive_setspeed(cpu, - pcpu->target_freq, - pcpu->policy->cur); - - up_read(&pcpu->enable_sem); + up_write(&pcpu->policy->rwsem); } } From 440a57717bcc2a86a5e9800bce62e14d8115a001 Mon Sep 17 00:00:00 2001 From: Daniel Kurtz Date: Thu, 28 May 2015 12:08:11 +0800 Subject: [PATCH 0013/3220] cpufreq: interactive: only apply interactive boost when enabled Only apply the interactive boost when the interactive governor is enabled. This seems like the right thing to do. This was originally reviewed on https://chromium-review.googlesource.com/273501 Change-Id: I5f4a7320683eada099f9a4253e3d6b0f03057fe8 Signed-off-by: Daniel Kurtz Reviewed-by: Douglas Anderson Signed-off-by: Dmitry Torokhov --- drivers/cpufreq/cpufreq_interactive.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index 5224e897d460..bd83be39f17b 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -622,9 +622,20 @@ static void cpufreq_interactive_boost(struct cpufreq_interactive_tunables *tunab for_each_online_cpu(i) { pcpu = &per_cpu(cpuinfo, i); - if (tunables != pcpu->policy->governor_data) + + if (!down_read_trylock(&pcpu->enable_sem)) continue; + if (!pcpu->governor_enabled) { + up_read(&pcpu->enable_sem); + continue; + } + + if (tunables != pcpu->policy->governor_data) { + up_read(&pcpu->enable_sem); + continue; + } + spin_lock_irqsave(&pcpu->target_freq_lock, flags[1]); if (pcpu->target_freq < tunables->hispeed_freq) { pcpu->target_freq = tunables->hispeed_freq; @@ -634,6 +645,8 @@ static void cpufreq_interactive_boost(struct cpufreq_interactive_tunables *tunab anyboost = 1; } spin_unlock_irqrestore(&pcpu->target_freq_lock, flags[1]); + + up_read(&pcpu->enable_sem); } spin_unlock_irqrestore(&speedchange_cpumask_lock, flags[0]); From 03fbd079bac71e15a414082cb5aee980ce2935be Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 31 Mar 2016 13:21:09 -0700 Subject: [PATCH 0014/3220] android: base-cfg: Add CONFIG_INET_DIAG_DESTROY Change-Id: I67430b05eca8fd520d2795d3db60faf2ec0fab9e Signed-off-by: Dmitry Shmidt --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 220ddb5304df..fa53af0c37ad 100644 --- a/android/configs/android-base.cfg +++ 
b/android/configs/android-base.cfg @@ -29,6 +29,7 @@ CONFIG_INET6_AH=y CONFIG_INET6_ESP=y CONFIG_INET6_IPCOMP=y CONFIG_INET=y +CONFIG_INET_DIAG_DESTROY=y CONFIG_INET_ESP=y CONFIG_INET_XFRM_MODE_TUNNEL=y CONFIG_IP6_NF_FILTER=y From d68354b3b7ef8b8a1d9cf0ba071123ed62fe567f Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Thu, 28 Jan 2016 11:12:25 -0800 Subject: [PATCH 0015/3220] ANDROID: mmc: Add CONFIG_MMC_SIMULATE_MAX_SPEED When CONFIG_MMC_SIMULATE_MAX_SPEED is enabled, Expose max_read_speed, max_write_speed and cache_size default module parameters and sysfs controls to simulate a slow eMMC device. Default values are 0 (off), 0 (off) and 4 MB respectively. Signed-off-by: Mark Salyzyn Bug: 26976972 Change-Id: I342bfbd8b85f9b790e3f0e1e4e51a900ae07e05d --- Documentation/block/00-INDEX | 6 + Documentation/block/mmc-max-speed.txt | 38 ++++ drivers/mmc/card/Kconfig | 12 ++ drivers/mmc/card/block.c | 300 ++++++++++++++++++++++++++ drivers/mmc/card/queue.h | 8 + 5 files changed, 364 insertions(+) create mode 100644 Documentation/block/mmc-max-speed.txt diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX index e840b47613f7..bc5148757edb 100644 --- a/Documentation/block/00-INDEX +++ b/Documentation/block/00-INDEX @@ -26,3 +26,9 @@ switching-sched.txt - Switching I/O schedulers at runtime writeback_cache_control.txt - Control of volatile write back caches +mmc-max-speed.txt + - eMMC layer speed simulation, related to /sys/block/mmcblk*/ + attributes: + max_read_speed + max_write_speed + cache_size diff --git a/Documentation/block/mmc-max-speed.txt b/Documentation/block/mmc-max-speed.txt new file mode 100644 index 000000000000..3f052b9fb999 --- /dev/null +++ b/Documentation/block/mmc-max-speed.txt @@ -0,0 +1,38 @@ +eMMC Block layer simulation speed controls in /sys/block/mmcblk*/ +=============================================== + +Turned on with CONFIG_MMC_SIMULATE_MAX_SPEED which enables MMC device speed +limiting. Used to test and simulate the behavior of the system when +confronted with a slow MMC. + +Enables max_read_speed, max_write_speed and cache_size attributes and module +default parameters to control the write or read maximum KB/second speed +behaviors. + +NB: There is room for improving the algorithm for aspects tied directly to +eMMC specific behavior. For instance, wear leveling and stalls from an +exhausted erase pool. We would expect that if there was a need to provide +similar speed simulation controls to other types of block devices, aspects of +their behavior are modelled separately (e.g. head seek times, heat assist, +shingling and rotational latency). + +/sys/block/mmcblk0/max_read_speed: + +Number of KB/second reads allowed to the block device. Used to test and +simulate the behavior of the system when confronted with a slow reading MMC. +Set to 0 or "off" to place no speed limit. + +/sys/block/mmcblk0/max_write_speed: + +Number of KB/second writes allowed to the block device. Used to test and +simulate the behavior of the system when confronted with a slow writing MMC. +Set to 0 or "off" to place no speed limit. + +/sys/block/mmcblk0/cache_size: + +Number of MB of high speed memory or high speed SLC cache expected on the +eMMC device being simulated. Used to help simulate the write-back behavior +more accurately. The assumption is the cache has no delay, but draws down +in the background to the MLC/TLC primary store at the max_write_speed rate. +Any write speed delays will show up when the cache is full, or when an I/O +request to flush is issued. 
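As a usage sketch for the controls documented above (hypothetical values; mmcblk0 is assumed to be the device under test, and writes require root):

  echo 1000 > /sys/block/mmcblk0/max_write_speed    # cap writes at 1000 KB/s
  echo off > /sys/block/mmcblk0/max_read_speed      # remove the read limit
  cat /sys/block/mmcblk0/cache_size                 # reports e.g. "4MB 0% used"

Both speed attributes accept "off" as well as 0 to disable the limit.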
diff --git a/drivers/mmc/card/Kconfig b/drivers/mmc/card/Kconfig index 5562308699bc..6142ec1b9dfb 100644 --- a/drivers/mmc/card/Kconfig +++ b/drivers/mmc/card/Kconfig @@ -68,3 +68,15 @@ config MMC_TEST This driver is only of interest to those developing or testing a host driver. Most people should say N here. + +config MMC_SIMULATE_MAX_SPEED + bool "Turn on maximum speed control per block device" + depends on MMC_BLOCK + help + Say Y here to enable MMC device speed limiting. Used to test and + simulate the behavior of the system when confronted with a slow MMC. + + Enables max_read_speed, max_write_speed and cache_size attributes to + control the write or read maximum KB/second speed behaviors. + + If unsure, say N here. diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index f2ce13ab7ae6..c15d3f380524 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -288,6 +288,250 @@ out: return ret; } +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + +static int max_read_speed, max_write_speed, cache_size = 4; + +module_param(max_read_speed, int, S_IRUSR | S_IRGRP); +MODULE_PARM_DESC(max_read_speed, "maximum KB/s read speed 0=off"); +module_param(max_write_speed, int, S_IRUSR | S_IRGRP); +MODULE_PARM_DESC(max_write_speed, "maximum KB/s write speed 0=off"); +module_param(cache_size, int, S_IRUSR | S_IRGRP); +MODULE_PARM_DESC(cache_size, "MB high speed memory or SLC cache"); + +/* + * helper macros and expectations: + * size - unsigned long number of bytes + * jiffies - unsigned long HZ timestamp difference + * speed - unsigned KB/s transfer rate + */ +#define size_and_speed_to_jiffies(size, speed) \ + ((size) * HZ / (speed) / 1024UL) +#define jiffies_and_speed_to_size(jiffies, speed) \ + (((speed) * (jiffies) * 1024UL) / HZ) +#define jiffies_and_size_to_speed(jiffies, size) \ + ((size) * HZ / (jiffies) / 1024UL) + +/* Limits to report warning */ +/* jiffies_and_size_to_speed(10*HZ, queue_max_hw_sectors(q) * 512UL) ~ 25 */ +#define MIN_SPEED(q) 250 /* 10 times faster than a floppy disk */ +#define MAX_SPEED(q) jiffies_and_size_to_speed(1, queue_max_sectors(q) * 512UL) + +#define speed_valid(speed) ((speed) > 0) + +static const char off[] = "off\n"; + +static int max_speed_show(int speed, char *buf) +{ + if (speed) + return scnprintf(buf, PAGE_SIZE, "%uKB/s\n", speed); + else + return scnprintf(buf, PAGE_SIZE, off); +} + +static int max_speed_store(const char *buf, struct request_queue *q) +{ + unsigned int limit, set = 0; + + if (!strncasecmp(off, buf, sizeof(off) - 2)) + return set; + if (kstrtouint(buf, 0, &set) || (set > INT_MAX)) + return -EINVAL; + if (set == 0) + return set; + limit = MAX_SPEED(q); + if (set > limit) + pr_warn("max speed %u ineffective above %u\n", set, limit); + limit = MIN_SPEED(q); + if (set < limit) + pr_warn("max speed %u painful below %u\n", set, limit); + return set; +} + +static ssize_t max_write_speed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + int ret = max_speed_show(atomic_read(&md->queue.max_write_speed), buf); + + mmc_blk_put(md); + return ret; +} + +static ssize_t max_write_speed_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + int set = max_speed_store(buf, md->queue.queue); + + if (set < 0) { + mmc_blk_put(md); + return set; + } + + atomic_set(&md->queue.max_write_speed, set); + mmc_blk_put(md); + return count; +} + +static const 
DEVICE_ATTR(max_write_speed, S_IRUGO | S_IWUSR, + max_write_speed_show, max_write_speed_store); + +static ssize_t max_read_speed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + int ret = max_speed_show(atomic_read(&md->queue.max_read_speed), buf); + + mmc_blk_put(md); + return ret; +} + +static ssize_t max_read_speed_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + int set = max_speed_store(buf, md->queue.queue); + + if (set < 0) { + mmc_blk_put(md); + return set; + } + + atomic_set(&md->queue.max_read_speed, set); + mmc_blk_put(md); + return count; +} + +static const DEVICE_ATTR(max_read_speed, S_IRUGO | S_IWUSR, + max_read_speed_show, max_read_speed_store); + +static ssize_t cache_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + struct mmc_queue *mq = &md->queue; + int cache_size = atomic_read(&mq->cache_size); + int ret; + + if (!cache_size) + ret = scnprintf(buf, PAGE_SIZE, off); + else { + int speed = atomic_read(&mq->max_write_speed); + + if (!speed_valid(speed)) + ret = scnprintf(buf, PAGE_SIZE, "%uMB\n", cache_size); + else { /* We accept race between cache_jiffies and cache_used */ + unsigned long size = jiffies_and_speed_to_size( + jiffies - mq->cache_jiffies, speed); + long used = atomic_long_read(&mq->cache_used); + + if (size >= used) + size = 0; + else + size = (used - size) * 100 / cache_size + / 1024UL / 1024UL; + + ret = scnprintf(buf, PAGE_SIZE, "%uMB %lu%% used\n", + cache_size, size); + } + } + + mmc_blk_put(md); + return ret; +} + +static ssize_t cache_size_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_blk_data *md; + unsigned int set = 0; + + if (strncasecmp(off, buf, sizeof(off) - 2) + && (kstrtouint(buf, 0, &set) || (set > INT_MAX))) + return -EINVAL; + + md = mmc_blk_get(dev_to_disk(dev)); + atomic_set(&md->queue.cache_size, set); + mmc_blk_put(md); + return count; +} + +static const DEVICE_ATTR(cache_size, S_IRUGO | S_IWUSR, + cache_size_show, cache_size_store); + +/* correct for write-back */ +static long mmc_blk_cache_used(struct mmc_queue *mq, unsigned long waitfor) +{ + long used = 0; + int speed = atomic_read(&mq->max_write_speed); + + if (speed_valid(speed)) { + unsigned long size = jiffies_and_speed_to_size( + waitfor - mq->cache_jiffies, speed); + used = atomic_long_read(&mq->cache_used); + + if (size >= used) + used = 0; + else + used -= size; + } + + atomic_long_set(&mq->cache_used, used); + mq->cache_jiffies = waitfor; + + return used; +} + +static void mmc_blk_simulate_delay( + struct mmc_queue *mq, + struct request *req, + unsigned long waitfor) +{ + int max_speed; + + if (!req) + return; + + max_speed = (rq_data_dir(req) == READ) + ? 
atomic_read(&mq->max_read_speed) + : atomic_read(&mq->max_write_speed); + if (speed_valid(max_speed)) { + unsigned long bytes = blk_rq_bytes(req); + + if (rq_data_dir(req) != READ) { + int cache_size = atomic_read(&mq->cache_size); + + if (cache_size) { + unsigned long size = cache_size * 1024L * 1024L; + long used = mmc_blk_cache_used(mq, waitfor); + + used += bytes; + atomic_long_set(&mq->cache_used, used); + bytes = 0; + if (used > size) + bytes = used - size; + } + } + waitfor += size_and_speed_to_jiffies(bytes, max_speed); + if (time_is_after_jiffies(waitfor)) { + long msecs = jiffies_to_msecs(waitfor - jiffies); + + if (likely(msecs > 0)) + msleep(msecs); + } + } +} + +#else + +#define mmc_blk_simulate_delay(mq, req, waitfor) + +#endif + static int mmc_blk_open(struct block_device *bdev, fmode_t mode) { struct mmc_blk_data *md = mmc_blk_get(bdev->bd_disk); @@ -1264,6 +1508,23 @@ static int mmc_blk_issue_flush(struct mmc_queue *mq, struct request *req) if (ret) ret = -EIO; +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + else if (atomic_read(&mq->cache_size)) { + long used = mmc_blk_cache_used(mq, jiffies); + + if (used) { + int speed = atomic_read(&mq->max_write_speed); + + if (speed_valid(speed)) { + unsigned long msecs = jiffies_to_msecs( + size_and_speed_to_jiffies( + used, speed)); + if (msecs) + msleep(msecs); + } + } + } +#endif blk_end_request_all(req, ret); return ret ? 0 : 1; @@ -1943,6 +2204,9 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc) struct mmc_async_req *areq; const u8 packed_nr = 2; u8 reqs = 0; +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + unsigned long waitfor = jiffies; +#endif if (!rqc && !mq->mqrq_prev->req) return 0; @@ -1993,6 +2257,8 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc) */ mmc_blk_reset_success(md, type); + mmc_blk_simulate_delay(mq, rqc, waitfor); + if (mmc_packed_cmd(mq_rq->cmd_type)) { ret = mmc_blk_end_packed_req(mq_rq); break; @@ -2411,6 +2677,14 @@ static void mmc_blk_remove_req(struct mmc_blk_data *md) card->ext_csd.boot_ro_lockable) device_remove_file(disk_to_dev(md->disk), &md->power_ro_lock); +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + device_remove_file(disk_to_dev(md->disk), + &dev_attr_max_write_speed); + device_remove_file(disk_to_dev(md->disk), + &dev_attr_max_read_speed); + device_remove_file(disk_to_dev(md->disk), + &dev_attr_cache_size); +#endif del_gendisk(md->disk); } @@ -2446,6 +2720,24 @@ static int mmc_add_disk(struct mmc_blk_data *md) ret = device_create_file(disk_to_dev(md->disk), &md->force_ro); if (ret) goto force_ro_fail; +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + atomic_set(&md->queue.max_write_speed, max_write_speed); + ret = device_create_file(disk_to_dev(md->disk), + &dev_attr_max_write_speed); + if (ret) + goto max_write_speed_fail; + atomic_set(&md->queue.max_read_speed, max_read_speed); + ret = device_create_file(disk_to_dev(md->disk), + &dev_attr_max_read_speed); + if (ret) + goto max_read_speed_fail; + atomic_set(&md->queue.cache_size, cache_size); + atomic_long_set(&md->queue.cache_used, 0); + md->queue.cache_jiffies = jiffies; + ret = device_create_file(disk_to_dev(md->disk), &dev_attr_cache_size); + if (ret) + goto cache_size_fail; +#endif if ((md->area_type & MMC_BLK_DATA_AREA_BOOT) && card->ext_csd.boot_ro_lockable) { @@ -2470,6 +2762,14 @@ static int mmc_add_disk(struct mmc_blk_data *md) return ret; power_ro_lock_fail: +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + device_remove_file(disk_to_dev(md->disk), &dev_attr_cache_size); +cache_size_fail: + 
device_remove_file(disk_to_dev(md->disk), &dev_attr_max_read_speed); +max_read_speed_fail: + device_remove_file(disk_to_dev(md->disk), &dev_attr_max_write_speed); +max_write_speed_fail: +#endif device_remove_file(disk_to_dev(md->disk), &md->force_ro); force_ro_fail: del_gendisk(md->disk); diff --git a/drivers/mmc/card/queue.h b/drivers/mmc/card/queue.h index 36cddab57d77..d890d8832e21 100644 --- a/drivers/mmc/card/queue.h +++ b/drivers/mmc/card/queue.h @@ -58,6 +58,14 @@ struct mmc_queue { struct mmc_queue_req mqrq[2]; struct mmc_queue_req *mqrq_cur; struct mmc_queue_req *mqrq_prev; +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + atomic_t max_write_speed; + atomic_t max_read_speed; + atomic_t cache_size; + /* i/o tracking */ + atomic_long_t cache_used; + unsigned long cache_jiffies; +#endif }; extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *, From a354cef9805bef94bb6fb6bd7887447846fee737 Mon Sep 17 00:00:00 2001 From: Rom Lemarchand Date: Thu, 7 Apr 2016 07:19:34 -0700 Subject: [PATCH 0016/3220] android: base-cfg: enable CONFIG_QUOTA Bug: 28032718 Change-Id: I7cb6b641f72085e69b90dca11d2ea68adcd02390 (cherry picked from commit e1b53a388e9cfcf870520a6899a37456cf1ae2c6) --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index fa53af0c37ad..3f82e9339dad 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -138,6 +138,7 @@ CONFIG_PPP_BSDCOMP=y CONFIG_PPP_DEFLATE=y CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y +CONFIG_QUOTA=y CONFIG_RESOURCE_COUNTERS=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y From 00526cde6b4a4154e4ea528f8a0cea1017abbc1c Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Tue, 5 Apr 2016 13:06:27 -0700 Subject: [PATCH 0017/3220] BACKPORT: selinux: restrict kernel module loading Backport notes: Backport uses kernel_module_from_file not kernel_read_file hook. kernel_read_file replaced kernel_module_from_file in the 4.6 kernel. There are no inode_security_() helper functions (also introduced in 4.6) so the inode lookup is done using the file_inode() helper which is standard for kernel version < 4.6. (Cherry picked from commit 61d612ea731e57dc510472fb746b55cdc017f371) Utilize existing kernel_read_file hook on kernel module load. Add module_load permission to the system class. Enforces restrictions on kernel module origin when calling the finit_module syscall. The hook checks that source type has permission module_load for the target type. Example for finit_module: allow foo bar_file:system module_load; Similarly restrictions are enforced on kernel module loading when calling the init_module syscall. The hook checks that source type has permission module_load with itself as the target object because the kernel module is sourced from the calling process. 
Example for init_module: allow foo foo:system module_load; Bug: 27824855 Change-Id: I64bf3bd1ab2dc735321160642dc6bbfa996f8068 Signed-off-by: Jeff Vander Stoep Signed-off-by: Paul Moore --- security/selinux/hooks.c | 33 +++++++++++++++++++++++++++++ security/selinux/include/classmap.h | 2 +- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 7c22a15c7e4b..78d06ff4eeb7 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3660,6 +3660,38 @@ static int selinux_kernel_module_request(char *kmod_name) SYSTEM__MODULE_REQUEST, &ad); } +static int selinux_kernel_module_from_file(struct file *file) +{ + struct common_audit_data ad; + struct inode_security_struct *isec; + struct file_security_struct *fsec; + struct inode *inode; + u32 sid = current_sid(); + int rc; + + /* init_module */ + if (file == NULL) + return avc_has_perm(sid, sid, SECCLASS_SYSTEM, + SYSTEM__MODULE_LOAD, NULL); + + /* finit_module */ + ad.type = LSM_AUDIT_DATA_PATH; + ad.u.path = file->f_path; + + inode = file_inode(file); + isec = inode->i_security; + fsec = file->f_security; + + if (sid != fsec->sid) { + rc = avc_has_perm(sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); + if (rc) + return rc; + } + + return avc_has_perm(sid, isec->sid, SECCLASS_SYSTEM, + SYSTEM__MODULE_LOAD, &ad); +} + static int selinux_task_setpgid(struct task_struct *p, pid_t pgid) { return current_has_perm(p, PROCESS__SETPGID); @@ -5950,6 +5982,7 @@ static struct security_hook_list selinux_hooks[] = { LSM_HOOK_INIT(kernel_act_as, selinux_kernel_act_as), LSM_HOOK_INIT(kernel_create_files_as, selinux_kernel_create_files_as), LSM_HOOK_INIT(kernel_module_request, selinux_kernel_module_request), + LSM_HOOK_INIT(kernel_module_from_file, selinux_kernel_module_from_file), LSM_HOOK_INIT(task_setpgid, selinux_task_setpgid), LSM_HOOK_INIT(task_getpgid, selinux_task_getpgid), LSM_HOOK_INIT(task_getsid, selinux_task_getsid), diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 5a4eef59aeff..b393d29ae857 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -32,7 +32,7 @@ struct security_class_mapping secclass_map[] = { "setsockcreate", NULL } }, { "system", { "ipc_info", "syslog_read", "syslog_mod", - "syslog_console", "module_request", NULL } }, + "syslog_console", "module_request", "module_load", NULL } }, { "capability", { "chown", "dac_override", "dac_read_search", "fowner", "fsetid", "kill", "setgid", "setuid", "setpcap", From 9f6bbb427fc67e5caceec70741def34234078f97 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 12 Apr 2016 01:19:24 +0530 Subject: [PATCH 0018/3220] ANDROID: base-cfg: enable CONFIG_IP_NF_NAT IP_NF_TARGET_{MASQUERADE,NETMAP,REDIRECT} configs, already enabled in android-base.cfg for tethering, are of no use if CONFIG_IP_NF_NAT is not enabled. Don't rely on platform config for that and enable CONFIG_IP_NF_NAT in android-base.cfg as well. 
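For context, the kind of tethering rule these targets exist for looks like the following (interface name and subnet are hypothetical); without CONFIG_IP_NF_NAT there is no iptables nat table for such a rule to live in:

  iptables -t nat -A POSTROUTING -s 192.168.42.0/24 -o rmnet_data0 -j MASQUERADE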
Change-Id: Ic72bcebbd925b142b09539466bf963188c83108a Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 3f82e9339dad..a8b529cae6eb 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -57,6 +57,7 @@ CONFIG_IP_NF_MANGLE=y CONFIG_IP_NF_MATCH_AH=y CONFIG_IP_NF_MATCH_ECN=y CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_NAT=y CONFIG_IP_NF_RAW=y CONFIG_IP_NF_SECURITY=y CONFIG_IP_NF_TARGET_MASQUERADE=y From ea20f758a0f688b713920e1b6c455ab3f0983c9e Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Tue, 19 Apr 2016 12:37:31 -0700 Subject: [PATCH 0019/3220] Revert "drivers: switch: remove S_IWUSR from dev_attr" This reverts commit dc66dee02dcd6ea774e3ed4ae32e88b0f3b4bee7. --- drivers/switch/switch_class.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/switch/switch_class.c b/drivers/switch/switch_class.c index e373b625806e..e05fc2591147 100644 --- a/drivers/switch/switch_class.c +++ b/drivers/switch/switch_class.c @@ -54,8 +54,8 @@ static ssize_t name_show(struct device *dev, struct device_attribute *attr, return sprintf(buf, "%s\n", sdev->name); } -static DEVICE_ATTR(state, S_IRUGO, state_show, NULL); -static DEVICE_ATTR(name, S_IRUGO, name_show, NULL); +static DEVICE_ATTR(state, S_IRUGO | S_IWUSR, state_show, NULL); +static DEVICE_ATTR(name, S_IRUGO | S_IWUSR, name_show, NULL); void switch_set_state(struct switch_dev *sdev, int state) { From 00c13787f8275d7ff3e232a0e06a3c26478811e0 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Tue, 19 Apr 2016 12:37:47 -0700 Subject: [PATCH 0020/3220] Revert "switch: switch class and GPIO drivers." Drivers should use extcon moving forward. Documentation/extcon/porting-android-switch-class describes how to port existing switch class drivers to extcon. This reverts commit e4b8e66e0ae2e78e913d7b86f2507fdb0aa731b4. Change-Id: I5b622c7ab4c0cb9670f8903f259a99888f503c1a --- drivers/Kconfig | 2 - drivers/Makefile | 1 - drivers/switch/Kconfig | 15 --- drivers/switch/Makefile | 4 - drivers/switch/switch_class.c | 174 ---------------------------------- drivers/switch/switch_gpio.c | 172 --------------------------------- include/linux/switch.h | 53 ----------- 7 files changed, 421 deletions(-) delete mode 100644 drivers/switch/Kconfig delete mode 100644 drivers/switch/Makefile delete mode 100644 drivers/switch/switch_class.c delete mode 100644 drivers/switch/switch_gpio.c delete mode 100644 include/linux/switch.h diff --git a/drivers/Kconfig b/drivers/Kconfig index 3c363e559864..d2ac339de85f 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -104,8 +104,6 @@ source "drivers/memstick/Kconfig" source "drivers/leds/Kconfig" -source "drivers/switch/Kconfig" - source "drivers/accessibility/Kconfig" source "drivers/infiniband/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 8acfb530993d..795d0ca714bf 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -122,7 +122,6 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle/ obj-y += mmc/ obj-$(CONFIG_MEMSTICK) += memstick/ obj-y += leds/ -obj-$(CONFIG_SWITCH) += switch/ obj-$(CONFIG_INFINIBAND) += infiniband/ obj-$(CONFIG_SGI_SN) += sn/ obj-y += firmware/ diff --git a/drivers/switch/Kconfig b/drivers/switch/Kconfig deleted file mode 100644 index 19404b6f7778..000000000000 --- a/drivers/switch/Kconfig +++ /dev/null @@ -1,15 +0,0 @@ -menuconfig SWITCH - tristate "Switch class support" - help - Say Y here to enable switch class support. 
This allows - monitoring switches by userspace via sysfs and uevent. - -if SWITCH - -config SWITCH_GPIO - tristate "GPIO Swith support" - depends on GPIOLIB - help - Say Y here to enable GPIO based switch support. - -endif # SWITCH diff --git a/drivers/switch/Makefile b/drivers/switch/Makefile deleted file mode 100644 index f7606ed4a719..000000000000 --- a/drivers/switch/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# Switch Class Driver -obj-$(CONFIG_SWITCH) += switch_class.o -obj-$(CONFIG_SWITCH_GPIO) += switch_gpio.o - diff --git a/drivers/switch/switch_class.c b/drivers/switch/switch_class.c deleted file mode 100644 index e05fc2591147..000000000000 --- a/drivers/switch/switch_class.c +++ /dev/null @@ -1,174 +0,0 @@ -/* - * drivers/switch/switch_class.c - * - * Copyright (C) 2008 Google, Inc. - * Author: Mike Lockwood - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * -*/ - -#include -#include -#include -#include -#include -#include -#include - -struct class *switch_class; -static atomic_t device_count; - -static ssize_t state_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct switch_dev *sdev = (struct switch_dev *) - dev_get_drvdata(dev); - - if (sdev->print_state) { - int ret = sdev->print_state(sdev, buf); - if (ret >= 0) - return ret; - } - return sprintf(buf, "%d\n", sdev->state); -} - -static ssize_t name_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct switch_dev *sdev = (struct switch_dev *) - dev_get_drvdata(dev); - - if (sdev->print_name) { - int ret = sdev->print_name(sdev, buf); - if (ret >= 0) - return ret; - } - return sprintf(buf, "%s\n", sdev->name); -} - -static DEVICE_ATTR(state, S_IRUGO | S_IWUSR, state_show, NULL); -static DEVICE_ATTR(name, S_IRUGO | S_IWUSR, name_show, NULL); - -void switch_set_state(struct switch_dev *sdev, int state) -{ - char name_buf[120]; - char state_buf[120]; - char *prop_buf; - char *envp[3]; - int env_offset = 0; - int length; - - if (sdev->state != state) { - sdev->state = state; - - prop_buf = (char *)get_zeroed_page(GFP_KERNEL); - if (prop_buf) { - length = name_show(sdev->dev, NULL, prop_buf); - if (length > 0) { - if (prop_buf[length - 1] == '\n') - prop_buf[length - 1] = 0; - snprintf(name_buf, sizeof(name_buf), - "SWITCH_NAME=%s", prop_buf); - envp[env_offset++] = name_buf; - } - length = state_show(sdev->dev, NULL, prop_buf); - if (length > 0) { - if (prop_buf[length - 1] == '\n') - prop_buf[length - 1] = 0; - snprintf(state_buf, sizeof(state_buf), - "SWITCH_STATE=%s", prop_buf); - envp[env_offset++] = state_buf; - } - envp[env_offset] = NULL; - kobject_uevent_env(&sdev->dev->kobj, KOBJ_CHANGE, envp); - free_page((unsigned long)prop_buf); - } else { - printk(KERN_ERR "out of memory in switch_set_state\n"); - kobject_uevent(&sdev->dev->kobj, KOBJ_CHANGE); - } - } -} -EXPORT_SYMBOL_GPL(switch_set_state); - -static int create_switch_class(void) -{ - if (!switch_class) { - switch_class = class_create(THIS_MODULE, "switch"); - if (IS_ERR(switch_class)) - return PTR_ERR(switch_class); - atomic_set(&device_count, 0); - } - - return 0; -} - -int switch_dev_register(struct 
switch_dev *sdev) -{ - int ret; - - if (!switch_class) { - ret = create_switch_class(); - if (ret < 0) - return ret; - } - - sdev->index = atomic_inc_return(&device_count); - sdev->dev = device_create(switch_class, NULL, - MKDEV(0, sdev->index), NULL, sdev->name); - if (IS_ERR(sdev->dev)) - return PTR_ERR(sdev->dev); - - ret = device_create_file(sdev->dev, &dev_attr_state); - if (ret < 0) - goto err_create_file_1; - ret = device_create_file(sdev->dev, &dev_attr_name); - if (ret < 0) - goto err_create_file_2; - - dev_set_drvdata(sdev->dev, sdev); - sdev->state = 0; - return 0; - -err_create_file_2: - device_remove_file(sdev->dev, &dev_attr_state); -err_create_file_1: - device_destroy(switch_class, MKDEV(0, sdev->index)); - printk(KERN_ERR "switch: Failed to register driver %s\n", sdev->name); - - return ret; -} -EXPORT_SYMBOL_GPL(switch_dev_register); - -void switch_dev_unregister(struct switch_dev *sdev) -{ - device_remove_file(sdev->dev, &dev_attr_name); - device_remove_file(sdev->dev, &dev_attr_state); - device_destroy(switch_class, MKDEV(0, sdev->index)); - dev_set_drvdata(sdev->dev, NULL); -} -EXPORT_SYMBOL_GPL(switch_dev_unregister); - -static int __init switch_class_init(void) -{ - return create_switch_class(); -} - -static void __exit switch_class_exit(void) -{ - class_destroy(switch_class); -} - -module_init(switch_class_init); -module_exit(switch_class_exit); - -MODULE_AUTHOR("Mike Lockwood "); -MODULE_DESCRIPTION("Switch class driver"); -MODULE_LICENSE("GPL"); diff --git a/drivers/switch/switch_gpio.c b/drivers/switch/switch_gpio.c deleted file mode 100644 index 621d62d20c99..000000000000 --- a/drivers/switch/switch_gpio.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * drivers/switch/switch_gpio.c - * - * Copyright (C) 2008 Google, Inc. - * Author: Mike Lockwood - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct gpio_switch_data { - struct switch_dev sdev; - unsigned gpio; - const char *name_on; - const char *name_off; - const char *state_on; - const char *state_off; - int irq; - struct work_struct work; -}; - -static void gpio_switch_work(struct work_struct *work) -{ - int state; - struct gpio_switch_data *data = - container_of(work, struct gpio_switch_data, work); - - state = gpio_get_value(data->gpio); - switch_set_state(&data->sdev, state); -} - -static irqreturn_t gpio_irq_handler(int irq, void *dev_id) -{ - struct gpio_switch_data *switch_data = - (struct gpio_switch_data *)dev_id; - - schedule_work(&switch_data->work); - return IRQ_HANDLED; -} - -static ssize_t switch_gpio_print_state(struct switch_dev *sdev, char *buf) -{ - struct gpio_switch_data *switch_data = - container_of(sdev, struct gpio_switch_data, sdev); - const char *state; - if (switch_get_state(sdev)) - state = switch_data->state_on; - else - state = switch_data->state_off; - - if (state) - return sprintf(buf, "%s\n", state); - return -1; -} - -static int gpio_switch_probe(struct platform_device *pdev) -{ - struct gpio_switch_platform_data *pdata = pdev->dev.platform_data; - struct gpio_switch_data *switch_data; - int ret = 0; - - if (!pdata) - return -EBUSY; - - switch_data = kzalloc(sizeof(struct gpio_switch_data), GFP_KERNEL); - if (!switch_data) - return -ENOMEM; - - switch_data->sdev.name = pdata->name; - switch_data->gpio = pdata->gpio; - switch_data->name_on = pdata->name_on; - switch_data->name_off = pdata->name_off; - switch_data->state_on = pdata->state_on; - switch_data->state_off = pdata->state_off; - switch_data->sdev.print_state = switch_gpio_print_state; - - ret = switch_dev_register(&switch_data->sdev); - if (ret < 0) - goto err_switch_dev_register; - - ret = gpio_request(switch_data->gpio, pdev->name); - if (ret < 0) - goto err_request_gpio; - - ret = gpio_direction_input(switch_data->gpio); - if (ret < 0) - goto err_set_gpio_input; - - INIT_WORK(&switch_data->work, gpio_switch_work); - - switch_data->irq = gpio_to_irq(switch_data->gpio); - if (switch_data->irq < 0) { - ret = switch_data->irq; - goto err_detect_irq_num_failed; - } - - ret = request_irq(switch_data->irq, gpio_irq_handler, - IRQF_TRIGGER_LOW, pdev->name, switch_data); - if (ret < 0) - goto err_request_irq; - - /* Perform initial detection */ - gpio_switch_work(&switch_data->work); - - return 0; - -err_request_irq: -err_detect_irq_num_failed: -err_set_gpio_input: - gpio_free(switch_data->gpio); -err_request_gpio: - switch_dev_unregister(&switch_data->sdev); -err_switch_dev_register: - kfree(switch_data); - - return ret; -} - -static int gpio_switch_remove(struct platform_device *pdev) -{ - struct gpio_switch_data *switch_data = platform_get_drvdata(pdev); - - cancel_work_sync(&switch_data->work); - gpio_free(switch_data->gpio); - switch_dev_unregister(&switch_data->sdev); - kfree(switch_data); - - return 0; -} - -static struct platform_driver gpio_switch_driver = { - .probe = gpio_switch_probe, - .remove = gpio_switch_remove, - .driver = { - .name = "switch-gpio", - .owner = THIS_MODULE, - }, -}; - -static int __init gpio_switch_init(void) -{ - return platform_driver_register(&gpio_switch_driver); -} - -static void __exit gpio_switch_exit(void) -{ - platform_driver_unregister(&gpio_switch_driver); -} - -module_init(gpio_switch_init); -module_exit(gpio_switch_exit); - -MODULE_AUTHOR("Mike Lockwood "); -MODULE_DESCRIPTION("GPIO Switch 
driver"); -MODULE_LICENSE("GPL"); diff --git a/include/linux/switch.h b/include/linux/switch.h deleted file mode 100644 index 3e4c748e343a..000000000000 --- a/include/linux/switch.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Switch class driver - * - * Copyright (C) 2008 Google, Inc. - * Author: Mike Lockwood - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * -*/ - -#ifndef __LINUX_SWITCH_H__ -#define __LINUX_SWITCH_H__ - -struct switch_dev { - const char *name; - struct device *dev; - int index; - int state; - - ssize_t (*print_name)(struct switch_dev *sdev, char *buf); - ssize_t (*print_state)(struct switch_dev *sdev, char *buf); -}; - -struct gpio_switch_platform_data { - const char *name; - unsigned gpio; - - /* if NULL, switch_dev.name will be printed */ - const char *name_on; - const char *name_off; - /* if NULL, "0" or "1" will be printed */ - const char *state_on; - const char *state_off; -}; - -extern int switch_dev_register(struct switch_dev *sdev); -extern void switch_dev_unregister(struct switch_dev *sdev); - -static inline int switch_get_state(struct switch_dev *sdev) -{ - return sdev->state; -} - -extern void switch_set_state(struct switch_dev *sdev, int state); - -#endif /* __LINUX_SWITCH_H__ */ From a9de5129787db0d135eab61359b820c783fb5676 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Tue, 19 Apr 2016 12:44:42 -0700 Subject: [PATCH 0021/3220] android: base-cfg: remove CONFIG_SWITCH Change-Id: I3fd1aa7a54fe3a8d3ad5537cbc61386e52f41ea0 Signed-off-by: Dmitry Shmidt --- android/configs/android-base.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index a8b529cae6eb..304f1d4fd7c4 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -148,7 +148,6 @@ CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y CONFIG_SETEND_EMULATION=y CONFIG_STAGING=y -CONFIG_SWITCH=y CONFIG_SWP_EMULATION=y CONFIG_SYNC=y CONFIG_TUN=y From 24c4a0f75c51c1f42cc0125a024b7c262e664ff7 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 21 Apr 2016 15:43:29 -0700 Subject: [PATCH 0022/3220] Revert "tcp: Fix IPV6 module build errors" This reverts commit 3823c8136f2170b3ac5e6a5f8b857746a786e845. 
--- net/ipv4/tcp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b799f20387c6..c0e0fd2dd378 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3260,7 +3260,7 @@ static int tcp_is_local(struct net *net, __be32 addr) { return rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK); } -#if defined(CONFIG_IPV6) +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) static int tcp_is_local6(struct net *net, struct in6_addr *addr) { struct rt6_info *rt6 = rt6_lookup(net, addr, addr, 0, 0); return rt6 && rt6->dst.dev && (rt6->dst.dev->flags & IFF_LOOPBACK); @@ -3328,7 +3328,7 @@ restart: continue; } -#if defined(CONFIG_IPV6) +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (family == AF_INET6) { struct in6_addr *s6; if (!inet->pinet6) @@ -3365,4 +3365,3 @@ restart: return 0; } -EXPORT_SYMBOL_GPL(tcp_nuke_addr); From 215856823c282d0339acdad8c7f154abc30d1027 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 21 Apr 2016 15:43:58 -0700 Subject: [PATCH 0023/3220] Revert "Don't kill IPv4 sockets when killing IPv6 sockets was requested." This reverts commit 8bf4413b4f54e24120b90ecbfee426beeddc3ff0. --- net/ipv4/tcp.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c0e0fd2dd378..9819762c4a92 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3331,8 +3331,6 @@ restart: #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (family == AF_INET6) { struct in6_addr *s6; - if (!inet->pinet6) - continue; s6 = &sk->sk_v6_rcv_saddr; if (ipv6_addr_type(s6) == IPV6_ADDR_MAPPED) From 9826b2ec8383b14a82f9d1d545a8573986c16abe Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 21 Apr 2016 15:44:11 -0700 Subject: [PATCH 0024/3220] Revert "net: fix crash in tcp_nuke_addr()" This reverts commit 08f7c4280cd5efe9e274240c42177f459431bac2. --- net/ipv4/tcp.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9819762c4a92..9b19742cbf42 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3301,19 +3301,8 @@ restart: sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) { struct inet_sock *inet = inet_sk(sk); - if (sk->sk_state == TCP_TIME_WAIT) { - /* - * Sockets that are in TIME_WAIT state are - * instances of lightweight inet_timewait_sock, - * we should simply skip them (or we'll try to - * access non-existing fields and crash). - */ - continue; - } - if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT) continue; - if (sock_flag(sk, SOCK_DEAD)) continue; From 20de1c6bc4d1b4636314fd1a372533b45af809b5 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 21 Apr 2016 15:44:25 -0700 Subject: [PATCH 0025/3220] Revert "net: fix iterating over hashtable in tcp_nuke_addr()" This reverts commit 4747299b2c8e8778927b3df0501023d76fe4f2d5. 
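Note that this revert reinstates an off-by-one on purpose, ahead of the full removal of tcp_nuke_addr() in the next patch: ehash_mask is (table size - 1), so the last valid bucket index is ehash_mask itself and a complete walk must be inclusive. A sketch of the inclusive loop being reverted away (process_bucket() is a hypothetical stand-in):

    unsigned int bucket;

    /* ehash_mask is a mask, so iterate inclusively */
    for (bucket = 0; bucket <= tcp_hashinfo.ehash_mask; bucket++)
        process_bucket(bucket);
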
--- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9b19742cbf42..854e606f3bce 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3291,7 +3291,7 @@ int tcp_nuke_addr(struct net *net, struct sockaddr *addr) return -EAFNOSUPPORT; } - for (bucket = 0; bucket <= tcp_hashinfo.ehash_mask; bucket++) { + for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) { struct hlist_nulls_node *node; struct sock *sk; spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket); From 8591a83caea440173feca9008d2c37898bd7dcba Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 21 Apr 2016 15:47:01 -0700 Subject: [PATCH 0026/3220] Revert "net: socket ioctl to reset connections matching local address" Use SOCK_DESTROY from now instead of SIOCKILLADDR This reverts commit 38f0ec724f5306c81130ca9343c856aa37a76d54. Change-Id: I2dcd833b66c88a48de8978dce9d72ab78f9af549 --- include/net/tcp.h | 2 - include/uapi/linux/sockios.h | 1 - net/ipv4/af_inet.c | 1 - net/ipv4/devinet.c | 8 +-- net/ipv4/tcp.c | 105 ----------------------------------- net/ipv6/af_inet6.c | 17 ------ 6 files changed, 1 insertion(+), 133 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 984e59d99deb..bcce99a951f8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1681,8 +1681,6 @@ static inline bool tcp_stream_memory_free(const struct sock *sk) return notsent_bytes < tcp_notsent_lowat(tp); } -extern int tcp_nuke_addr(struct net *net, struct sockaddr *addr); - #ifdef CONFIG_PROC_FS int tcp4_proc_init(void); void tcp4_proc_exit(void); diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h index 623e9aab645e..e888b1aed69f 100644 --- a/include/uapi/linux/sockios.h +++ b/include/uapi/linux/sockios.h @@ -65,7 +65,6 @@ #define SIOCDIFADDR 0x8936 /* delete PA address */ #define SIOCSIFHWBROADCAST 0x8937 /* set hardware broadcast addr */ #define SIOCGIFCOUNT 0x8938 /* get number of devices */ -#define SIOCKILLADDR 0x8939 /* kill sockets with this local addr */ #define SIOCGIFBR 0x8940 /* Bridging support */ #define SIOCSIFBR 0x8941 /* Set bridging options */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 671eb0092915..eb12bd0ff9d3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -886,7 +886,6 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFPFLAGS: case SIOCGIFPFLAGS: case SIOCSIFFLAGS: - case SIOCKILLADDR: err = devinet_ioctl(net, cmd, (void __user *)arg); break; default: diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 3792624af316..cebd9d31e65a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -59,7 +59,6 @@ #include #include -#include #include #include #include @@ -965,7 +964,6 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSIFBRDADDR: /* Set the broadcast address */ case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ - case SIOCKILLADDR: /* Nuke all sockets on this address */ ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; @@ -1017,8 +1015,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) } ret = -EADDRNOTAVAIL; - if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS - && cmd != SIOCKILLADDR) + if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) goto done; switch (cmd) { @@ -1145,9 +1142,6 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) inet_insert_ifa(ifa); } break; - case 
SIOCKILLADDR: /* Nuke all connections on this address */ - ret = tcp_nuke_addr(net, (struct sockaddr *) sin); - break; } done: rtnl_unlock(); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 854e606f3bce..e0921748b2c0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -276,9 +276,6 @@ #include #include #include -#include -#include -#include #include #include @@ -3250,105 +3247,3 @@ void __init tcp_init(void) BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tasklet_init(); } - -static int tcp_is_local(struct net *net, __be32 addr) { - struct rtable *rt; - struct flowi4 fl4 = { .daddr = addr }; - rt = ip_route_output_key(net, &fl4); - if (IS_ERR_OR_NULL(rt)) - return 0; - return rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK); -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static int tcp_is_local6(struct net *net, struct in6_addr *addr) { - struct rt6_info *rt6 = rt6_lookup(net, addr, addr, 0, 0); - return rt6 && rt6->dst.dev && (rt6->dst.dev->flags & IFF_LOOPBACK); -} -#endif - -/* - * tcp_nuke_addr - destroy all sockets on the given local address - * if local address is the unspecified address (0.0.0.0 or ::), destroy all - * sockets with local addresses that are not configured. - */ -int tcp_nuke_addr(struct net *net, struct sockaddr *addr) -{ - int family = addr->sa_family; - unsigned int bucket; - - struct in_addr *in; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - struct in6_addr *in6 = NULL; -#endif - if (family == AF_INET) { - in = &((struct sockaddr_in *)addr)->sin_addr; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - } else if (family == AF_INET6) { - in6 = &((struct sockaddr_in6 *)addr)->sin6_addr; -#endif - } else { - return -EAFNOSUPPORT; - } - - for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) { - struct hlist_nulls_node *node; - struct sock *sk; - spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket); - -restart: - spin_lock_bh(lock); - sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) { - struct inet_sock *inet = inet_sk(sk); - - if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT) - continue; - if (sock_flag(sk, SOCK_DEAD)) - continue; - - if (family == AF_INET) { - __be32 s4 = inet->inet_rcv_saddr; - if (s4 == LOOPBACK4_IPV6) - continue; - - if (in->s_addr != s4 && - !(in->s_addr == INADDR_ANY && - !tcp_is_local(net, s4))) - continue; - } - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - if (family == AF_INET6) { - struct in6_addr *s6; - - s6 = &sk->sk_v6_rcv_saddr; - if (ipv6_addr_type(s6) == IPV6_ADDR_MAPPED) - continue; - - if (!ipv6_addr_equal(in6, s6) && - !(ipv6_addr_equal(in6, &in6addr_any) && - !tcp_is_local6(net, s6))) - continue; - } -#endif - - sock_hold(sk); - spin_unlock_bh(lock); - - local_bh_disable(); - bh_lock_sock(sk); - sk->sk_err = ETIMEDOUT; - sk->sk_error_report(sk); - - tcp_done(sk); - bh_unlock_sock(sk); - local_bh_enable(); - sock_put(sk); - - goto restart; - } - spin_unlock_bh(lock); - } - - return 0; -} diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 99fccad391e0..d9b25bd17bf1 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -495,21 +495,6 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, } EXPORT_SYMBOL(inet6_getname); -int inet6_killaddr_ioctl(struct net *net, void __user *arg) { - struct in6_ifreq ireq; - struct sockaddr_in6 sin6; - - if (!capable(CAP_NET_ADMIN)) - return -EACCES; - - if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) - return -EFAULT; - - sin6.sin6_family = AF_INET6; - 
sin6.sin6_addr = ireq.ifr6_addr; - return tcp_nuke_addr(net, (struct sockaddr *) &sin6); -} - int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; @@ -533,8 +518,6 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return addrconf_del_ifaddr(net, (void __user *) arg); case SIOCSIFDSTADDR: return addrconf_set_dstaddr(net, (void __user *) arg); - case SIOCKILLADDR: - return inet6_killaddr_ioctl(net, (void __user *) arg); default: if (!sk->sk_prot->ioctl) return -ENOIOCTLCMD; From a7eabf90aea68842b735193f0df703f659b45715 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 16:58:20 +0530 Subject: [PATCH 0027/3220] Revert "misc: uid_stat: Include linux/atomic.h instead of asm/atomic.h" This reverts commit 8d3a6c1538fb021448c4f6381f6191061f947ba1. This series of patches revert AOSP UID_STAT and NET_ACTIVITY_STATS drivers. I could not find any meaningful usage of these interfaces in AOSP master. UID_STAT driver expose "/proc/uid_stat/*" interfaces but it is only used in AOSP master as in what appears be an out of date bandwidth test in frameworks/base and in somewhat recent battery utils test in external/chromium-trace project. NET_ACTIVITY_STATS driver expose "/proc/net/stat/activity" interface but I can not track its usage anywhere in AOSP at all. Signed-off-by: Amit Pundir --- drivers/misc/uid_stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c index 185c69c9738a..8b8c9a22360b 100644 --- a/drivers/misc/uid_stat.c +++ b/drivers/misc/uid_stat.c @@ -13,7 +13,7 @@ * */ -#include +#include #include #include From cdb6973ae15c5f62947ea7afc29fc1175ecb3172 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:00:08 +0530 Subject: [PATCH 0028/3220] Revert "misc seq_printf fixes for 4.4" This reverts commit 5c7566a29bff14166d952f2ea525d5231546f821. This patch revert some changes in net/netfilter/xt_qtaguid.c as well. I'll submit another patch to restore those changes. Signed-off-by: Amit Pundir --- drivers/misc/uid_stat.c | 3 +-- net/activity_stats.c | 7 ++++--- net/netfilter/xt_qtaguid.c | 5 +++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c index 8b8c9a22360b..4766c1f83b94 100644 --- a/drivers/misc/uid_stat.c +++ b/drivers/misc/uid_stat.c @@ -55,8 +55,7 @@ static int uid_stat_atomic_int_show(struct seq_file *m, void *v) atomic_t *counter = m->private; bytes = (unsigned int) (atomic_read(counter) + INT_MIN); - seq_printf(m, "%u\n", bytes); - return seq_has_overflowed(m) ? 
-ENOSPC : 0; + return seq_printf(m, "%u\n", bytes); } static int uid_stat_read_atomic_int_open(struct inode *inode, struct file *file) diff --git a/net/activity_stats.c b/net/activity_stats.c index 3bf92d80b8b9..4609ce2043eb 100644 --- a/net/activity_stats.c +++ b/net/activity_stats.c @@ -63,13 +63,14 @@ void activity_stats_update(void) static int activity_stats_show(struct seq_file *m, void *v) { int i; + int ret; seq_printf(m, "Min Bucket(sec) Count\n"); for (i = 0; i < BUCKET_MAX; i++) { - seq_printf(m, "%15d %lu\n", 1 << i, activity_stats[i]); - if (seq_has_overflowed(m)) - return -ENOSPC; + ret = seq_printf(m, "%15d %lu\n", 1 << i, activity_stats[i]); + if (ret) + return ret; } return 0; diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index e1442bfb668d..04bb081adde8 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -2543,6 +2543,7 @@ static void pp_stats_header(struct seq_file *m) static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, int cnt_set) { + int ret; struct data_counters *cnts; tag_t tag = ts_entry->tn.tag; uid_t stat_uid = get_uid_from_tag(tag); @@ -2561,7 +2562,7 @@ static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, } ppi->item_index++; cnts = &ts_entry->counters; - seq_printf(m, "%d %s 0x%llx %u %u " + ret = seq_printf(m, "%d %s 0x%llx %u %u " "%llu %llu " "%llu %llu " "%llu %llu " @@ -2591,7 +2592,7 @@ static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets, cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes, cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets); - return seq_has_overflowed(m) ? -ENOSPC : 1; + return ret ?: 1; } static bool pp_sets(struct seq_file *m, struct tag_stat *ts_entry) From 1ada37dc073ab6e49fd71d2fdc2be2e4f87c56cd Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:00:31 +0530 Subject: [PATCH 0029/3220] Revert "misc: uidstat: Remove use of obsolete create_proc_read_entry api" This reverts commit fccab646d33381af63e4f4c0d4f309a1d2b4b0c3. Signed-off-by: Amit Pundir --- drivers/misc/uid_stat.c | 50 ++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c index 4766c1f83b94..509822c81e97 100644 --- a/drivers/misc/uid_stat.c +++ b/drivers/misc/uid_stat.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -49,26 +48,41 @@ static struct uid_stat *find_uid_stat(uid_t uid) { return NULL; } -static int uid_stat_atomic_int_show(struct seq_file *m, void *v) +static int tcp_snd_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) { + int len; unsigned int bytes; - atomic_t *counter = m->private; + char *p = page; + struct uid_stat *uid_entry = (struct uid_stat *) data; + if (!data) + return 0; - bytes = (unsigned int) (atomic_read(counter) + INT_MIN); - return seq_printf(m, "%u\n", bytes); + bytes = (unsigned int) (atomic_read(&uid_entry->tcp_snd) + INT_MIN); + p += sprintf(p, "%u\n", bytes); + len = (p - page) - off; + *eof = (len <= count) ? 
1 : 0; + *start = page + off; + return len; } -static int uid_stat_read_atomic_int_open(struct inode *inode, struct file *file) +static int tcp_rcv_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) { - return single_open(file, uid_stat_atomic_int_show, PDE_DATA(inode)); -} + int len; + unsigned int bytes; + char *p = page; + struct uid_stat *uid_entry = (struct uid_stat *) data; + if (!data) + return 0; -static const struct file_operations uid_stat_read_atomic_int_fops = { - .open = uid_stat_read_atomic_int_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; + bytes = (unsigned int) (atomic_read(&uid_entry->tcp_rcv) + INT_MIN); + p += sprintf(p, "%u\n", bytes); + len = (p - page) - off; + *eof = (len <= count) ? 1 : 0; + *start = page + off; + return len; +} /* Create a new entry for tracking the specified uid. */ static struct uid_stat *create_stat(uid_t uid) { @@ -95,11 +109,11 @@ static void create_stat_proc(struct uid_stat *new_uid) entry = proc_mkdir(uid_s, parent); /* Keep reference to uid_stat so we know what uid to read stats from. */ - proc_create_data("tcp_snd", S_IRUGO, entry, - &uid_stat_read_atomic_int_fops, &new_uid->tcp_snd); + create_proc_read_entry("tcp_snd", S_IRUGO, entry , tcp_snd_read_proc, + (void *) new_uid); - proc_create_data("tcp_rcv", S_IRUGO, entry, - &uid_stat_read_atomic_int_fops, &new_uid->tcp_rcv); + create_proc_read_entry("tcp_rcv", S_IRUGO, entry, tcp_rcv_read_proc, + (void *) new_uid); } static struct uid_stat *find_or_create_uid_stat(uid_t uid) From cded5596a31db1b1e7fa914b4b47fb488dbd819c Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:00:43 +0530 Subject: [PATCH 0030/3220] Revert "misc: uidstat: avoid create_stat() race and blockage." This reverts commit f7a812174033fe620509e6e8ca7022abd924b1c4. Signed-off-by: Amit Pundir --- drivers/misc/uid_stat.c | 52 +++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c index 509822c81e97..2141124a6c12 100644 --- a/drivers/misc/uid_stat.c +++ b/drivers/misc/uid_stat.c @@ -38,13 +38,17 @@ struct uid_stat { }; static struct uid_stat *find_uid_stat(uid_t uid) { + unsigned long flags; struct uid_stat *entry; + spin_lock_irqsave(&uid_lock, flags); list_for_each_entry(entry, &uid_list, link) { if (entry->uid == uid) { + spin_unlock_irqrestore(&uid_lock, flags); return entry; } } + spin_unlock_irqrestore(&uid_lock, flags); return NULL; } @@ -86,10 +90,13 @@ static int tcp_rcv_read_proc(char *page, char **start, off_t off, /* Create a new entry for tracking the specified uid. */ static struct uid_stat *create_stat(uid_t uid) { + unsigned long flags; + char uid_s[32]; struct uid_stat *new_uid; + struct proc_dir_entry *entry; + /* Create the uid stat struct and append it to the list. 
*/ - new_uid = kmalloc(sizeof(struct uid_stat), GFP_ATOMIC); - if (!new_uid) + if ((new_uid = kmalloc(sizeof(struct uid_stat), GFP_KERNEL)) == NULL) return NULL; new_uid->uid = uid; @@ -97,15 +104,11 @@ static struct uid_stat *create_stat(uid_t uid) { atomic_set(&new_uid->tcp_rcv, INT_MIN); atomic_set(&new_uid->tcp_snd, INT_MIN); + spin_lock_irqsave(&uid_lock, flags); list_add_tail(&new_uid->link, &uid_list); - return new_uid; -} + spin_unlock_irqrestore(&uid_lock, flags); -static void create_stat_proc(struct uid_stat *new_uid) -{ - char uid_s[32]; - struct proc_dir_entry *entry; - sprintf(uid_s, "%d", new_uid->uid); + sprintf(uid_s, "%d", uid); entry = proc_mkdir(uid_s, parent); /* Keep reference to uid_stat so we know what uid to read stats from. */ @@ -114,31 +117,17 @@ static void create_stat_proc(struct uid_stat *new_uid) create_proc_read_entry("tcp_rcv", S_IRUGO, entry, tcp_rcv_read_proc, (void *) new_uid); -} -static struct uid_stat *find_or_create_uid_stat(uid_t uid) -{ - struct uid_stat *entry; - unsigned long flags; - spin_lock_irqsave(&uid_lock, flags); - entry = find_uid_stat(uid); - if (entry) { - spin_unlock_irqrestore(&uid_lock, flags); - return entry; - } - entry = create_stat(uid); - spin_unlock_irqrestore(&uid_lock, flags); - if (entry) - create_stat_proc(entry); - return entry; + return new_uid; } int uid_stat_tcp_snd(uid_t uid, int size) { struct uid_stat *entry; activity_stats_update(); - entry = find_or_create_uid_stat(uid); - if (!entry) - return -1; + if ((entry = find_uid_stat(uid)) == NULL && + ((entry = create_stat(uid)) == NULL)) { + return -1; + } atomic_add(size, &entry->tcp_snd); return 0; } @@ -146,9 +135,10 @@ int uid_stat_tcp_snd(uid_t uid, int size) { int uid_stat_tcp_rcv(uid_t uid, int size) { struct uid_stat *entry; activity_stats_update(); - entry = find_or_create_uid_stat(uid); - if (!entry) - return -1; + if ((entry = find_uid_stat(uid)) == NULL && + ((entry = create_stat(uid)) == NULL)) { + return -1; + } atomic_add(size, &entry->tcp_rcv); return 0; } From 5c0d8ae10ab777417eefc39cd5f84d64c6f34622 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:00:57 +0530 Subject: [PATCH 0031/3220] Revert "net: activity_stats: Stop using obsolete create_proc_read_entry api" This reverts commit 7c121720fa14889d59e933aad0a8b9ce948a39ae. 
Signed-off-by: Amit Pundir --- net/activity_stats.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/net/activity_stats.c b/net/activity_stats.c index 4609ce2043eb..8a3e93470069 100644 --- a/net/activity_stats.c +++ b/net/activity_stats.c @@ -15,7 +15,6 @@ */ #include -#include #include #include @@ -60,20 +59,29 @@ void activity_stats_update(void) spin_unlock_irqrestore(&activity_lock, flags); } -static int activity_stats_show(struct seq_file *m, void *v) +static int activity_stats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) { int i; - int ret; + int len; + char *p = page; - seq_printf(m, "Min Bucket(sec) Count\n"); + /* Only print if offset is 0, or we have enough buffer space */ + if (off || count < (30 * BUCKET_MAX + 22)) + return -ENOMEM; + + len = snprintf(p, count, "Min Bucket(sec) Count\n"); + count -= len; + p += len; for (i = 0; i < BUCKET_MAX; i++) { - ret = seq_printf(m, "%15d %lu\n", 1 << i, activity_stats[i]); - if (ret) - return ret; + len = snprintf(p, count, "%15d %lu\n", 1 << i, activity_stats[i]); + count -= len; + p += len; } + *eof = 1; - return 0; + return p - page; } static int activity_stats_notifier(struct notifier_block *nb, @@ -92,26 +100,14 @@ static int activity_stats_notifier(struct notifier_block *nb, return 0; } -static int activity_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, activity_stats_show, PDE_DATA(inode)); -} - -static const struct file_operations activity_stats_fops = { - .open = activity_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static struct notifier_block activity_stats_notifier_block = { .notifier_call = activity_stats_notifier, }; static int __init activity_stats_init(void) { - proc_create("activity", S_IRUGO, - init_net.proc_net_stat, &activity_stats_fops); + create_proc_read_entry("activity", S_IRUGO, + init_net.proc_net_stat, activity_stats_read_proc, NULL); return register_pm_notifier(&activity_stats_notifier_block); } From 42d9422a803afc7c1db3502787fdbe75577bc14d Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:01:08 +0530 Subject: [PATCH 0032/3220] Revert "net: activity_stats: Add statistics for network transmission activity" This reverts commit afedd7beba14385fd797166751fde39e0f52cf72. 
Signed-off-by: Amit Pundir --- drivers/misc/uid_stat.c | 3 - include/net/activity_stats.h | 25 -------- net/Kconfig | 8 --- net/Makefile | 1 - net/activity_stats.c | 115 ----------------------------------- 5 files changed, 152 deletions(-) delete mode 100644 include/net/activity_stats.h delete mode 100644 net/activity_stats.c diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c index 2141124a6c12..e6760b56083c 100644 --- a/drivers/misc/uid_stat.c +++ b/drivers/misc/uid_stat.c @@ -24,7 +24,6 @@ #include #include #include -#include static DEFINE_SPINLOCK(uid_lock); static LIST_HEAD(uid_list); @@ -123,7 +122,6 @@ static struct uid_stat *create_stat(uid_t uid) { int uid_stat_tcp_snd(uid_t uid, int size) { struct uid_stat *entry; - activity_stats_update(); if ((entry = find_uid_stat(uid)) == NULL && ((entry = create_stat(uid)) == NULL)) { return -1; @@ -134,7 +132,6 @@ int uid_stat_tcp_snd(uid_t uid, int size) { int uid_stat_tcp_rcv(uid_t uid, int size) { struct uid_stat *entry; - activity_stats_update(); if ((entry = find_uid_stat(uid)) == NULL && ((entry = create_stat(uid)) == NULL)) { return -1; diff --git a/include/net/activity_stats.h b/include/net/activity_stats.h deleted file mode 100644 index 10e4c1506eeb..000000000000 --- a/include/net/activity_stats.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (C) 2010 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * Author: Mike Chan (mike@android.com) - */ - -#ifndef __activity_stats_h -#define __activity_stats_h - -#ifdef CONFIG_NET_ACTIVITY_STATS -void activity_stats_update(void); -#else -#define activity_stats_update(void) {} -#endif - -#endif /* _NET_ACTIVITY_STATS_H */ diff --git a/net/Kconfig b/net/Kconfig index 043fe1dc0860..ce9585cf343a 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -92,14 +92,6 @@ config ANDROID_PARANOID_NETWORK help none -config NET_ACTIVITY_STATS - bool "Network activity statistics tracking" - default y - help - Network activity statistics are useful for tracking wireless - modem activity on 2G, 3G, 4G wireless networks. Counts number of - transmissions and groups them in specified time buckets. - config NETWORK_SECMARK bool "Security Marking" help diff --git a/net/Makefile b/net/Makefile index eeb9d5db454f..a5d04098dfce 100644 --- a/net/Makefile +++ b/net/Makefile @@ -77,4 +77,3 @@ endif ifneq ($(CONFIG_NET_L3_MASTER_DEV),) obj-y += l3mdev/ endif -obj-$(CONFIG_NET_ACTIVITY_STATS) += activity_stats.o diff --git a/net/activity_stats.c b/net/activity_stats.c deleted file mode 100644 index 8a3e93470069..000000000000 --- a/net/activity_stats.c +++ /dev/null @@ -1,115 +0,0 @@ -/* net/activity_stats.c - * - * Copyright (C) 2010 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * Author: Mike Chan (mike@android.com) - */ - -#include -#include -#include - -/* - * Track transmission rates in buckets (power of 2). - * 1,2,4,8...512 seconds. - * - * Buckets represent the count of network transmissions at least - * N seconds apart, where N is 1 << bucket index. - */ -#define BUCKET_MAX 10 - -/* Track network activity frequency */ -static unsigned long activity_stats[BUCKET_MAX]; -static ktime_t last_transmit; -static ktime_t suspend_time; -static DEFINE_SPINLOCK(activity_lock); - -void activity_stats_update(void) -{ - int i; - unsigned long flags; - ktime_t now; - s64 delta; - - spin_lock_irqsave(&activity_lock, flags); - now = ktime_get(); - delta = ktime_to_ns(ktime_sub(now, last_transmit)); - - for (i = BUCKET_MAX - 1; i >= 0; i--) { - /* - * Check if the time delta between network activity is within the - * minimum bucket range. - */ - if (delta < (1000000000ULL << i)) - continue; - - activity_stats[i]++; - last_transmit = now; - break; - } - spin_unlock_irqrestore(&activity_lock, flags); -} - -static int activity_stats_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int i; - int len; - char *p = page; - - /* Only print if offset is 0, or we have enough buffer space */ - if (off || count < (30 * BUCKET_MAX + 22)) - return -ENOMEM; - - len = snprintf(p, count, "Min Bucket(sec) Count\n"); - count -= len; - p += len; - - for (i = 0; i < BUCKET_MAX; i++) { - len = snprintf(p, count, "%15d %lu\n", 1 << i, activity_stats[i]); - count -= len; - p += len; - } - *eof = 1; - - return p - page; -} - -static int activity_stats_notifier(struct notifier_block *nb, - unsigned long event, void *dummy) -{ - switch (event) { - case PM_SUSPEND_PREPARE: - suspend_time = ktime_get_real(); - break; - - case PM_POST_SUSPEND: - suspend_time = ktime_sub(ktime_get_real(), suspend_time); - last_transmit = ktime_sub(last_transmit, suspend_time); - } - - return 0; -} - -static struct notifier_block activity_stats_notifier_block = { - .notifier_call = activity_stats_notifier, -}; - -static int __init activity_stats_init(void) -{ - create_proc_read_entry("activity", S_IRUGO, - init_net.proc_net_stat, activity_stats_read_proc, NULL); - return register_pm_notifier(&activity_stats_notifier_block); -} - -subsys_initcall(activity_stats_init); - From ece28ad441409646dc6330b06d347465d2730feb Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:08:15 +0530 Subject: [PATCH 0033/3220] Revert "misc: uidstat: Adding uid stat driver to collect network statistics." This reverts commit 6b6d5fbf9ae567aefb58099a30bbb6d25fa8925b. Signed-off-by: Amit Pundir --- drivers/misc/Kconfig | 4 - drivers/misc/Makefile | 1 - drivers/misc/uid_stat.c | 153 --------------------------------------- include/linux/uid_stat.h | 29 -------- net/ipv4/tcp.c | 14 ---- 5 files changed, 201 deletions(-) delete mode 100644 drivers/misc/uid_stat.c delete mode 100644 include/linux/uid_stat.h diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 98c020b560ac..2763d6b3b063 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -412,10 +412,6 @@ config TI_DAC7512 This driver can also be built as a module. If so, the module will be called ti_dac7512. 
-config UID_STAT - bool "UID based statistics tracking exported to /proc/uid_stat" - default n - config VMWARE_BALLOON tristate "VMware Balloon Driver" depends on VMWARE_VMCI && X86 && HYPERVISOR_GUEST diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 24483a6caa6b..e5142b836aee 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -35,7 +35,6 @@ obj-$(CONFIG_ISL29020) += isl29020.o obj-$(CONFIG_SENSORS_TSL2550) += tsl2550.o obj-$(CONFIG_DS1682) += ds1682.o obj-$(CONFIG_TI_DAC7512) += ti_dac7512.o -obj-$(CONFIG_UID_STAT) += uid_stat.o obj-$(CONFIG_C2PORT) += c2port/ obj-$(CONFIG_HMC6352) += hmc6352.o obj-y += eeprom/ diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c deleted file mode 100644 index e6760b56083c..000000000000 --- a/drivers/misc/uid_stat.c +++ /dev/null @@ -1,153 +0,0 @@ -/* drivers/misc/uid_stat.c - * - * Copyright (C) 2008 - 2009 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static DEFINE_SPINLOCK(uid_lock); -static LIST_HEAD(uid_list); -static struct proc_dir_entry *parent; - -struct uid_stat { - struct list_head link; - uid_t uid; - atomic_t tcp_rcv; - atomic_t tcp_snd; -}; - -static struct uid_stat *find_uid_stat(uid_t uid) { - unsigned long flags; - struct uid_stat *entry; - - spin_lock_irqsave(&uid_lock, flags); - list_for_each_entry(entry, &uid_list, link) { - if (entry->uid == uid) { - spin_unlock_irqrestore(&uid_lock, flags); - return entry; - } - } - spin_unlock_irqrestore(&uid_lock, flags); - return NULL; -} - -static int tcp_snd_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - unsigned int bytes; - char *p = page; - struct uid_stat *uid_entry = (struct uid_stat *) data; - if (!data) - return 0; - - bytes = (unsigned int) (atomic_read(&uid_entry->tcp_snd) + INT_MIN); - p += sprintf(p, "%u\n", bytes); - len = (p - page) - off; - *eof = (len <= count) ? 1 : 0; - *start = page + off; - return len; -} - -static int tcp_rcv_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - unsigned int bytes; - char *p = page; - struct uid_stat *uid_entry = (struct uid_stat *) data; - if (!data) - return 0; - - bytes = (unsigned int) (atomic_read(&uid_entry->tcp_rcv) + INT_MIN); - p += sprintf(p, "%u\n", bytes); - len = (p - page) - off; - *eof = (len <= count) ? 1 : 0; - *start = page + off; - return len; -} - -/* Create a new entry for tracking the specified uid. */ -static struct uid_stat *create_stat(uid_t uid) { - unsigned long flags; - char uid_s[32]; - struct uid_stat *new_uid; - struct proc_dir_entry *entry; - - /* Create the uid stat struct and append it to the list. */ - if ((new_uid = kmalloc(sizeof(struct uid_stat), GFP_KERNEL)) == NULL) - return NULL; - - new_uid->uid = uid; - /* Counters start at INT_MIN, so we can track 4GB of network traffic. 
*/ - atomic_set(&new_uid->tcp_rcv, INT_MIN); - atomic_set(&new_uid->tcp_snd, INT_MIN); - - spin_lock_irqsave(&uid_lock, flags); - list_add_tail(&new_uid->link, &uid_list); - spin_unlock_irqrestore(&uid_lock, flags); - - sprintf(uid_s, "%d", uid); - entry = proc_mkdir(uid_s, parent); - - /* Keep reference to uid_stat so we know what uid to read stats from. */ - create_proc_read_entry("tcp_snd", S_IRUGO, entry , tcp_snd_read_proc, - (void *) new_uid); - - create_proc_read_entry("tcp_rcv", S_IRUGO, entry, tcp_rcv_read_proc, - (void *) new_uid); - - return new_uid; -} - -int uid_stat_tcp_snd(uid_t uid, int size) { - struct uid_stat *entry; - if ((entry = find_uid_stat(uid)) == NULL && - ((entry = create_stat(uid)) == NULL)) { - return -1; - } - atomic_add(size, &entry->tcp_snd); - return 0; -} - -int uid_stat_tcp_rcv(uid_t uid, int size) { - struct uid_stat *entry; - if ((entry = find_uid_stat(uid)) == NULL && - ((entry = create_stat(uid)) == NULL)) { - return -1; - } - atomic_add(size, &entry->tcp_rcv); - return 0; -} - -static int __init uid_stat_init(void) -{ - parent = proc_mkdir("uid_stat", NULL); - if (!parent) { - pr_err("uid_stat: failed to create proc entry\n"); - return -1; - } - return 0; -} - -__initcall(uid_stat_init); diff --git a/include/linux/uid_stat.h b/include/linux/uid_stat.h deleted file mode 100644 index 6bd6c4e52d17..000000000000 --- a/include/linux/uid_stat.h +++ /dev/null @@ -1,29 +0,0 @@ -/* include/linux/uid_stat.h - * - * Copyright (C) 2008-2009 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#ifndef __uid_stat_h -#define __uid_stat_h - -/* Contains definitions for resource tracking per uid. 
*/ - -#ifdef CONFIG_UID_STAT -int uid_stat_tcp_snd(uid_t uid, int size); -int uid_stat_tcp_rcv(uid_t uid, int size); -#else -#define uid_stat_tcp_snd(uid, size) do {} while (0); -#define uid_stat_tcp_rcv(uid, size) do {} while (0); -#endif - -#endif /* _LINUX_UID_STAT_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e0921748b2c0..a1d5c67e251d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -269,7 +269,6 @@ #include #include #include -#include #include #include @@ -1284,10 +1283,6 @@ out: tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); out_nopush: release_sock(sk); - - if (copied + copied_syn) - uid_stat_tcp_snd(from_kuid(&init_user_ns, current_uid()), - copied + copied_syn); return copied + copied_syn; do_fault: @@ -1562,8 +1557,6 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (copied > 0) { tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, copied); - uid_stat_tcp_rcv(from_kuid(&init_user_ns, current_uid()), - copied); } return copied; } @@ -1897,10 +1890,6 @@ skip_copy: tcp_cleanup_rbuf(sk, copied); release_sock(sk); - - if (copied > 0) - uid_stat_tcp_rcv(from_kuid(&init_user_ns, current_uid()), - copied); return copied; out: @@ -1909,9 +1898,6 @@ out: recv_urg: err = tcp_recv_urg(sk, msg, len, flags); - if (err > 0) - uid_stat_tcp_rcv(from_kuid(&init_user_ns, current_uid()), - err); goto out; recv_sndq: From 7c79aca516a51401a66b28ca68bd370ed7e9db0b Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 1 Oct 2015 10:44:36 +0530 Subject: [PATCH 0034/3220] netfilter: xt_qtaguid: seq_printf fixes Update seq_printf() usage in xt_qtaguid to align with changes from mainline commit 6798a8caaf64 "fs/seq_file: convert int seq_vprint/seq_printf/etc... returns to void". Signed-off-by: Amit Pundir --- net/netfilter/xt_qtaguid.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index 04bb081adde8..e1442bfb668d 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -2543,7 +2543,6 @@ static void pp_stats_header(struct seq_file *m) static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, int cnt_set) { - int ret; struct data_counters *cnts; tag_t tag = ts_entry->tn.tag; uid_t stat_uid = get_uid_from_tag(tag); @@ -2562,7 +2561,7 @@ static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, } ppi->item_index++; cnts = &ts_entry->counters; - ret = seq_printf(m, "%d %s 0x%llx %u %u " + seq_printf(m, "%d %s 0x%llx %u %u " "%llu %llu " "%llu %llu " "%llu %llu " @@ -2592,7 +2591,7 @@ static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets, cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes, cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets); - return ret ?: 1; + return seq_has_overflowed(m) ? -ENOSPC : 1; } static bool pp_sets(struct seq_file *m, struct tag_stat *ts_entry) From 5e00c3098bbfbf22086e7ce350cc63ea913a1c15 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 23:55:44 +0530 Subject: [PATCH 0035/3220] android: recommended.cfg: remove CONFIG_UID_STAT Remove UID Stat driver. 
Change-Id: Ifc9d2c6fe27900f30e6407398f5b24222518bffc Signed-off-by: Amit Pundir --- android/configs/android-recommended.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index e4c8aaade197..35936afdcae4 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -119,7 +119,6 @@ CONFIG_TIMER_STATS=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_UHID=y -CONFIG_UID_STAT=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_HIDDEV=y From 4b7de8375299c9f0e330ce521a956bc35bf1d12c Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 22 Apr 2016 00:00:14 -0700 Subject: [PATCH 0036/3220] vfs: change d_canonical_path to take two paths bug: 23904372 Change-Id: I4a686d64b6de37decf60019be1718e1d820193e6 Signed-off-by: Daniel Rosenberg --- fs/notify/inotify/inotify_user.c | 2 +- fs/sdcardfs/dentry.c | 6 +++++- include/linux/dcache.h | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index f72f3b25b3f2..e2893f17dde2 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -746,7 +746,7 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, /* support stacked filesystems */ if(path.dentry && path.dentry->d_op) { if (path.dentry->d_op->d_canonical_path) { - path.dentry->d_op->d_canonical_path(path.dentry, &alteredpath); + path.dentry->d_op->d_canonical_path(&path, &alteredpath); canonical_path = &alteredpath; path_put(&path); } diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index ba165ef11e27..971928ab6c21 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -172,11 +172,15 @@ static int sdcardfs_cmp_ci(const struct dentry *parent, return 1; } +static void sdcardfs_canonical_path(const struct path *path, struct path *actual_path) { + sdcardfs_get_real_lower(path->dentry, actual_path); +} + const struct dentry_operations sdcardfs_ci_dops = { .d_revalidate = sdcardfs_d_revalidate, .d_release = sdcardfs_d_release, .d_hash = sdcardfs_hash_ci, .d_compare = sdcardfs_cmp_ci, - .d_canonical_path = sdcardfs_get_real_lower, + .d_canonical_path = sdcardfs_canonical_path, }; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ebad56e31a8c..5819e9b0ee22 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -161,7 +161,7 @@ struct dentry_operations { struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(struct dentry *, bool); struct inode *(*d_select_inode)(struct dentry *, unsigned); - void (*d_canonical_path)(const struct dentry *, struct path *); + void (*d_canonical_path)(const struct path *, struct path *); } ____cacheline_aligned; /* From 00ee836d89c5756b4e937eae58c9d0d1d7ffe560 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 22 Apr 2016 00:00:48 -0700 Subject: [PATCH 0037/3220] fuse: Add support for d_canonical_path Allows FUSE to report to inotify that it is acting as a layered filesystem. The userspace component returns a string representing the location of the underlying file. If the string cannot be resolved into a path, the top level path is returned instead. 
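As an illustration only, a hedged sketch of the userspace side of this protocol, assuming a libfuse-style lowlevel daemon; handle_canonical_path() and lower_path_for() are hypothetical names. The single out-argument carries a NUL-terminated path, which the kernel resolves with kern_path(), falling back to the FUSE path itself on error:

    #include <fuse_lowlevel.h>
    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void handle_canonical_path(fuse_req_t req, uint64_t nodeid)
    {
        char path[PATH_MAX];	/* matches req->out.args[0].size */

        /* lower_path_for() is a hypothetical daemon-side lookup */
        snprintf(path, sizeof(path), "%s", lower_path_for(nodeid));
        fuse_reply_buf(req, path, strlen(path) + 1);
    }
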
bug: 23904372 Change-Id: Iabdca0bbedfbff59e9c820c58636a68ef9683d9f Signed-off-by: Daniel Rosenberg --- fs/fuse/dev.c | 5 +++++ fs/fuse/dir.c | 45 +++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_i.h | 3 +++ include/uapi/linux/fuse.h | 1 + 4 files changed, 54 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 175fcdeabe4c..8932c06e40c1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -1935,6 +1936,10 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, cs->move_pages = 0; err = copy_out_args(cs, &req->out, nbytes); + if (req->in.h.opcode == FUSE_CANONICAL_PATH) { + req->out.h.error = kern_path((char *)req->out.args[0].value, 0, + req->canonical_path); + } fuse_copy_finish(cs); spin_lock(&fpq->lock); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5e2e08712d3b..24df0a5df675 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -267,6 +267,50 @@ invalid: goto out; } +/* + * Get the canonical path. Since we must translate to a path, this must be done + * in the context of the userspace daemon, however, the userspace daemon cannot + * look up paths on its own. Instead, we handle the lookup as a special case + * inside of the write request. + */ +static void fuse_dentry_canonical_path(const struct path *path, struct path *canonical_path) { + struct inode *inode = path->dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + int err; + char *path_name; + + req = fuse_get_req(fc, 1); + err = PTR_ERR(req); + if (IS_ERR(req)) + goto default_path; + + path_name = (char*)__get_free_page(GFP_KERNEL); + if (!path_name) { + fuse_put_request(fc, req); + goto default_path; + } + + req->in.h.opcode = FUSE_CANONICAL_PATH; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 0; + req->out.numargs = 1; + req->out.args[0].size = PATH_MAX; + req->out.args[0].value = path_name; + req->canonical_path = canonical_path; + req->out.argvar = 1; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + free_page((unsigned long)path_name); + if (!err) + return; +default_path: + canonical_path->dentry = path->dentry; + canonical_path->mnt = path->mnt; + path_get(canonical_path); +} + static int invalid_nodeid(u64 nodeid) { return !nodeid || nodeid == FUSE_ROOT_ID; @@ -274,6 +318,7 @@ static int invalid_nodeid(u64 nodeid) const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, + .d_canonical_path = fuse_dentry_canonical_path, }; int fuse_valid_type(int m) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 405113101db8..6b4f8dd8e9c4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -363,6 +363,9 @@ struct fuse_req { /** Inode used in the request or NULL */ struct inode *inode; + /** Path used for completing d_canonical_path */ + struct path *canonical_path; + /** AIO control block */ struct fuse_io_priv *io; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index c9aca042e61d..8485dd4300b8 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -358,6 +358,7 @@ enum fuse_opcode { FUSE_FALLOCATE = 43, FUSE_READDIRPLUS = 44, FUSE_RENAME2 = 45, + FUSE_CANONICAL_PATH= 2016, /* CUSE specific operations */ CUSE_INIT = 4096, From 5a0725f49f3ab0ebada002e1d4b13943c3a35d18 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 26 Apr 2016 15:51:06 +0530 Subject: [PATCH 0038/3220] Revert "SELinux: build fix for 4.1" This reverts commit 43e1b4f528e1654fadd1097f7cc5c50be6e45b77. 
This patch is part of code which is already upstreamed in v4.4. Commits 5c73fceb8c70 (SELinux: Enable setting security contexts on rootfs inodes.), 12f348b9dcf6 (SELinux: rename SE_SBLABELSUPP to SBLABEL_MNT), and b43e725d8d38 (SELinux: use a helper function to determine seclabel). for reference. Signed-off-by: Amit Pundir --- security/selinux/hooks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 78d06ff4eeb7..d4d12949de09 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -469,7 +469,7 @@ static int sb_finish_set_opts(struct super_block *sb) * setting SELinux context on in-core inodes. */ if (strncmp(sb->s_type->name, "rootfs", sizeof("rootfs")) == 0) - sbsec->flags |= SBLABEL_MNT; + sbsec->flags |= SE_SBLABELSUPP; /* Initialize the root inode. */ rc = inode_doinit_with_dentry(root_inode, root); From 66daa5f5e2b9078a7cecf37e896385aeee520005 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 26 Apr 2016 15:51:20 +0530 Subject: [PATCH 0039/3220] Revert "SELinux: Enable setting security contexts on rootfs inodes." This reverts commit 78d36d2111cd4ca722a602846f7db8f54a0b074c. Drop this duplicate patch. This patch is already upstreamed in v4.4. Commits 5c73fceb8c70 (SELinux: Enable setting security contexts on rootfs inodes.), 12f348b9dcf6 (SELinux: rename SE_SBLABELSUPP to SBLABEL_MNT), and b43e725d8d38 (SELinux: use a helper function to determine seclabel), for reference. Signed-off-by: Amit Pundir --- security/selinux/hooks.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index d4d12949de09..9b0586d0f3c4 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -464,13 +464,6 @@ static int sb_finish_set_opts(struct super_block *sb) if (selinux_is_sblabel_mnt(sb)) sbsec->flags |= SBLABEL_MNT; - /* - * Special handling for rootfs. Is genfs but supports - * setting SELinux context on in-core inodes. - */ - if (strncmp(sb->s_type->name, "rootfs", sizeof("rootfs")) == 0) - sbsec->flags |= SE_SBLABELSUPP; - /* Initialize the root inode. */ rc = inode_doinit_with_dentry(root_inode, root); From ad95c12f66df9efae04b15d5c4d0d0ba56ab2620 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Mon, 25 Apr 2016 14:28:30 -0700 Subject: [PATCH 0040/3220] Revert "mm: vmscan: Add a debug file for shrinkers" Kernel panic when type "cat /sys/kernel/debug/shrinker" Unable to handle kernel paging request at virtual address 0af37d40 pgd = d4dec000 [0af37d40] *pgd=00000000 Internal error: Oops: 5 [#1] PREEMPT SMP ARM [] (_raw_spin_lock) from [] (list_lru_count_one+0x14/0x28) [] (list_lru_count_one) from [] (super_cache_count+0x40/0xa0) [] (super_cache_count) from [] (debug_shrinker_show+0x50/0x90) [] (debug_shrinker_show) from [] (seq_read+0x1ec/0x48c) [] (seq_read) from [] (__vfs_read+0x20/0xd0) [] (__vfs_read) from [] (vfs_read+0x7c/0x104) [] (vfs_read) from [] (SyS_read+0x44/0x9c) [] (SyS_read) from [] (ret_fast_syscall+0x0/0x3c) Code: e1a04000 e3a00001 ebd66b39 f594f000 (e1943f9f) ---[ end trace 60c74014a63a9688 ]--- Kernel panic - not syncing: Fatal exception shrink_control.nid is used but not initialzed, same for shrink_control.memcg. This reverts commit b0e7a582b2264cdf75874dcd8df915b6b4427755. 
Change-Id: I108de88fa4baaef99a53c4e4c6a1d8c4b4804157 Reported-by: Xiaowen Liu Signed-off-by: Dmitry Shmidt --- mm/vmscan.c | 43 ------------------------------------------- 1 file changed, 43 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 0c56f6a812bc..2aec4241b42a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include @@ -221,39 +220,6 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); } -struct dentry *debug_file; - -static int debug_shrinker_show(struct seq_file *s, void *unused) -{ - struct shrinker *shrinker; - struct shrink_control sc; - - sc.gfp_mask = -1; - sc.nr_to_scan = 0; - - down_read(&shrinker_rwsem); - list_for_each_entry(shrinker, &shrinker_list, list) { - int num_objs; - - num_objs = shrinker->count_objects(shrinker, &sc); - seq_printf(s, "%pf %d\n", shrinker->scan_objects, num_objs); - } - up_read(&shrinker_rwsem); - return 0; -} - -static int debug_shrinker_open(struct inode *inode, struct file *file) -{ - return single_open(file, debug_shrinker_show, inode->i_private); -} - -static const struct file_operations debug_shrinker_fops = { - .open = debug_shrinker_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /* * Add a shrinker callback to be called from the vm. */ @@ -283,15 +249,6 @@ int register_shrinker(struct shrinker *shrinker) } EXPORT_SYMBOL(register_shrinker); -static int __init add_shrinker_debug(void) -{ - debugfs_create_file("shrinker", 0644, NULL, NULL, - &debug_shrinker_fops); - return 0; -} - -late_initcall(add_shrinker_debug); - /* * Remove one */ From 4e461c777e345727aa2988377774c996d303ac46 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Apr 2016 17:12:57 -0700 Subject: [PATCH 0041/3220] xt_qtaguid: Fix panic caused by synack processing In upstream commit ca6fb06518836ef9b65dc0aac02ff97704d52a05 (tcp: attach SYNACK messages to request sockets instead of listener) http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ca6fb0651883 The building of synack messages was changed, which made it so the skb->sk points to a casted request_sock. This is problematic, as there is no sk_socket in a request_sock. So when the qtaguid_mt function tries to access the sk->sk_socket, it accesses uninitialized memory. After looking at how other netfilter implementations handle this, I realized there was a skb_to_full_sk() helper added, which the xt_qtaguid code isn't yet using. This patch adds its use, and resovles panics seen when accessing uninitialzed memory when processing synack packets. Reported-by: YongQin Liu Signed-off-by: John Stultz --- net/netfilter/xt_qtaguid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index e1442bfb668d..822dc3c3bce1 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1689,7 +1689,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) /* default: Fall through and do UID releated work */ } - sk = skb->sk; + sk = skb_to_full_sk(skb); /* * When in TCP_TIME_WAIT the sk is not a "struct sock" but * "struct inet_timewait_sock" which is missing fields. 
From 379cf6e8ddb187ad202bf05d22251e280fee26d1 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 26 Apr 2016 15:14:35 +0530 Subject: [PATCH 0042/3220] Revert "HID: steelseries: validate output report details" This reverts commit 90037b2720acffa6da2269a10ecf24ec2dace89b. Remove duplicate code. This patch is already upstreamed in v4.4, commit 41df7f6d4372 (HID: steelseries: validate output report details). Signed-off-by: Amit Pundir --- drivers/hid/hid-steelseries.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/hid/hid-steelseries.c b/drivers/hid/hid-steelseries.c index 93ddc1c65b4c..3edd4ac36494 100644 --- a/drivers/hid/hid-steelseries.c +++ b/drivers/hid/hid-steelseries.c @@ -253,11 +253,6 @@ static int steelseries_srws1_probe(struct hid_device *hdev, goto err_free; } - if (!hid_validate_values(hdev, HID_OUTPUT_REPORT, 0, 0, 16)) { - ret = -ENODEV; - goto err_free; - } - ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { hid_err(hdev, "hw start failed\n"); From 9fc7b46c3d3ffdd135c601e5c28a6bb316f3a011 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 26 Apr 2016 14:47:53 +0530 Subject: [PATCH 0043/3220] Revert "hid-multitouch: Filter collections by application usage." This reverts commit 0840b80cb9626906b57df54e7229db60f9aea4f2. This patch is already upstreamed in v4.4, commit 658d4aed59b3 (HID: hid-multitouch: Filter collections by application usage.), and further fixed/cleaned up afterwards in commits c2ef8f21ea8f (HID: multitouch: add support for trackpads), 76f5902aebda (HID: hid-multitouch: Simplify setup and frame synchronization) et al. By having this duplicate patch in AOSP we are doing redundant checks for Touchscreen and Touchpad devices. Signed-off-by: Amit Pundir --- drivers/hid/hid-multitouch.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 96759b270360..3d664d01305e 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -434,16 +434,6 @@ static int mt_touch_input_mapping(struct hid_device *hdev, struct hid_input *hi, if ((usage->hid & HID_USAGE_PAGE) == HID_UP_BUTTON) td->buttons_count++; - /* Only map fields from TouchScreen or TouchPad collections. - * We need to ignore fields that belong to other collections - * such as Mouse that might have the same GenericDesktop usages. */ - if (field->application == HID_DG_TOUCHSCREEN) - set_bit(INPUT_PROP_DIRECT, hi->input->propbit); - else if (field->application == HID_DG_TOUCHPAD) - set_bit(INPUT_PROP_POINTER, hi->input->propbit); - else - return 0; - if (usage->usage_index) prev_usage = &field->usage[usage->usage_index - 1]; From 3a343a1540d4376d838c0a29bd5462d4e961e766 Mon Sep 17 00:00:00 2001 From: Yongqin Liu Date: Thu, 28 Apr 2016 13:53:36 +0800 Subject: [PATCH 0044/3220] quick selinux support for tracefs Here is just the quick fix for tracefs with selinux. 
Just add tracefs to the list of whitelisted filesystem types in selinux_is_sblabel_mnt(), but the right fix would be to generalize this logic as described in the last item on the todo list, https://bitbucket.org/seandroid/wiki/wiki/ToDo Change-Id: I2aa803ccffbcd2802a7287514da7648e6b364157 Signed-off-by: Yongqin Liu --- security/selinux/hooks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 9b0586d0f3c4..94a0bfc748d1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -420,6 +420,7 @@ static int selinux_is_sblabel_mnt(struct super_block *sb) !strcmp(sb->s_type->name, "sysfs") || !strcmp(sb->s_type->name, "pstore") || !strcmp(sb->s_type->name, "debugfs") || + !strcmp(sb->s_type->name, "tracefs") || !strcmp(sb->s_type->name, "rootfs"); }
From f24888af18176246638498325fb1ecd49e0ef00e Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 2 May 2016 15:32:15 +0530 Subject: [PATCH 0045/3220] Revert "mmc: Add status IRQ and status callback function to mmc platform data" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 91fa97e1e5c001d52f6c993d37be08d1e84f47b7. This patch is no longer valid. There are no users for this status irq and callback in android-4.x. The Qcom platform (mach-msm/qsd8x50, HTC Dream..) and SDCC controller (msm_sdcc) using this status IRQ and callback were dropped from mainline some time back. 27842bb18b00 (mmc: Remove msm_sdcc driver) c0c89fafa289 (ARM: Remove mach-msm and associated ARM architecture code) Change-Id: Ia38e42a06dc184395f79c1ec1d306bf9775704d5 Signed-off-by: Amit Pundir --- include/linux/amba/mmci.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/amba/mmci.h b/include/linux/amba/mmci.h index 8bfd21c13d59..eff56cb0016a 100644 --- a/include/linux/amba/mmci.h +++ b/include/linux/amba/mmci.h @@ -40,10 +40,7 @@ struct mmci_platform_data { int gpio_wp; int gpio_cd; bool cd_invert; - unsigned int status_irq; struct embedded_sdio_data *embedded_sdio; - int (*register_status_notify)(void (*callback)(int card_present, void *dev_id), void *dev_id); - }; #endif
From d114f2c3570fc63019656b6870e4fcc506852334 Mon Sep 17 00:00:00 2001 From: Jack Pham Date: Wed, 23 Mar 2016 13:18:03 -0700 Subject: [PATCH 0046/3220] usb: dual-role: make stub functions inline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_DUAL_ROLE_USB_INTF is disabled but the exported functions are referenced, the build will result in warnings such as: In file included from include/linux/usb/class-dual-role.h:112:13: warning: ‘dual_role_instance_changed’ defined but not used [-Wunused-function] These stub functions should be static inline.
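A minimal standalone illustration of the warning (hypothetical header and names, not from this tree): a plain static stub defined in a header warns in every translation unit that includes the header without calling it, whereas a static inline stub never warns and emits no code.

	/* example_stubs.h - illustration only */
	static void example_stub(void) { }			/* -Wunused-function when unused */
	static inline void example_stub_inline(void) { }	/* never warns */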
Change-Id: I5a9ef58dca32306fac5a4c7f28cdaa36fa8ae078 Signed-off-by: Jack Pham (cherry picked from commit 2d152dbb0743526b21d6bbefe097f874c027f860) (cherry picked from commit 8ad66cafaa10e6ba94ff79a8dbc2cc437c6bfe93) --- include/linux/usb/class-dual-role.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/usb/class-dual-role.h b/include/linux/usb/class-dual-role.h index af42ed34944a..c6df2238012e 100644 --- a/include/linux/usb/class-dual-role.h +++ b/include/linux/usb/class-dual-role.h @@ -109,18 +109,19 @@ extern int dual_role_property_is_writeable(struct dual_role_phy_instance enum dual_role_property prop); extern void *dual_role_get_drvdata(struct dual_role_phy_instance *dual_role); #else /* CONFIG_DUAL_ROLE_USB_INTF */ -static void dual_role_instance_changed(struct dual_role_phy_instance +static inline void dual_role_instance_changed(struct dual_role_phy_instance *dual_role){} -static struct dual_role_phy_instance *__must_check +static inline struct dual_role_phy_instance *__must_check devm_dual_role_instance_register(struct device *parent, const struct dual_role_phy_desc *desc) { return ERR_PTR(-ENOSYS); } -static void devm_dual_role_instance_unregister(struct device *dev, +static inline void devm_dual_role_instance_unregister(struct device *dev, struct dual_role_phy_instance *dual_role){} -static void *dual_role_get_drvdata(struct dual_role_phy_instance *dual_role) +static inline void *dual_role_get_drvdata(struct dual_role_phy_instance + *dual_role) { return ERR_PTR(-ENOSYS); }
From 8b43e279a24727794cf6dd61fcad956f532002d3 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Wed, 4 May 2016 11:05:15 +0530 Subject: [PATCH 0047/3220] Revert "ARM: Add 'card_present' state to mmc_platfrom_data" This reverts commit 541632275e983573b8250fcd4402f772d7bd1e6f. mmc_platform_data (or arch/arm/include/asm/mach/mmc.h in general) has no active user in android-4.x kernels. Also all the necessary bits are already moved to include/linux/amba/mmci.h. 6ef297f86b62 (ARM: 5720/1: Move MMCI header to amba include dir) Change-Id: Iff384eb527327bf88543408e0257241c1fd99a43 Signed-off-by: Amit Pundir --- arch/arm/include/asm/mach/mmc.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/include/asm/mach/mmc.h b/arch/arm/include/asm/mach/mmc.h index bca864ac945f..f8d391ad9203 100644 --- a/arch/arm/include/asm/mach/mmc.h +++ b/arch/arm/include/asm/mach/mmc.h @@ -18,7 +18,6 @@ struct embedded_sdio_data { struct mmc_platform_data { unsigned int ocr_mask; /* available voltages */ int built_in; /* built-in device flag */ - int card_present; /* card detect state */ u32 (*translate_vdd)(struct device *, unsigned int); unsigned int (*status)(struct device *); struct embedded_sdio_data *embedded_sdio;
From b2cee99f2bada8a9da66cd29b71a79aaf52227ce Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Wed, 4 May 2016 11:14:16 +0530 Subject: [PATCH 0048/3220] Revert "Recreate asm/mach/mmc.h include file" This reverts commit 5b42ae3edab6c39c1337d36881d29350bb36dcff. This recreated arch/arm/include/asm/mach/mmc.h include file has no active user in android-4.x kernels. Also all the necessary bits are already moved to include/linux/amba/mmci.h.
6ef297f86b62 (ARM: 5720/1: Move MMCI header to amba include dir) Change-Id: Ibf258b355d17f54f49b777a8f6e0089e9b59a3a5 Signed-off-by: Amit Pundir --- arch/arm/include/asm/mach/mmc.h | 27 --------------------------- 1 file changed, 27 deletions(-) delete mode 100644 arch/arm/include/asm/mach/mmc.h diff --git a/arch/arm/include/asm/mach/mmc.h b/arch/arm/include/asm/mach/mmc.h deleted file mode 100644 index f8d391ad9203..000000000000 --- a/arch/arm/include/asm/mach/mmc.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * arch/arm/include/asm/mach/mmc.h - */ -#ifndef ASMARM_MACH_MMC_H -#define ASMARM_MACH_MMC_H - -#include -#include -#include - -struct embedded_sdio_data { - struct sdio_cis cis; - struct sdio_cccr cccr; - struct sdio_embedded_func *funcs; - int num_funcs; -}; - -struct mmc_platform_data { - unsigned int ocr_mask; /* available voltages */ - int built_in; /* built-in device flag */ - u32 (*translate_vdd)(struct device *, unsigned int); - unsigned int (*status)(struct device *); - struct embedded_sdio_data *embedded_sdio; - int (*register_status_notify)(void (*callback)(int card_present, void *dev_id), void *dev_id); -}; - -#endif
From f355b21139b26c70f38ae3cc19388c513a75242c Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Wed, 4 May 2016 13:51:38 -0700 Subject: [PATCH 0049/3220] fiq_debugger: Add option to apply uart overlay by FIQ_DEBUGGER_UART_OVERLAY fiq_debugger takes over the uart, so it is necessary to disable the original uart in the DT file. This can be done manually or by overlay. Change-Id: I9f50ec15b0e22e602d73b9f745fc8666f8925d09 Signed-off-by: Dmitry Shmidt --- drivers/staging/android/fiq_debugger/Kconfig | 9 ++++++ .../android/fiq_debugger/fiq_debugger.c | 30 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/drivers/staging/android/fiq_debugger/Kconfig b/drivers/staging/android/fiq_debugger/Kconfig index 56f7f999377e..60fc224d4efc 100644 --- a/drivers/staging/android/fiq_debugger/Kconfig +++ b/drivers/staging/android/fiq_debugger/Kconfig @@ -42,6 +42,15 @@ config FIQ_DEBUGGER_CONSOLE_DEFAULT_ENABLE If enabled, this puts the fiq debugger into console mode by default. Otherwise, the fiq debugger will start out in debug mode. +config FIQ_DEBUGGER_UART_OVERLAY + bool "Install uart DT overlay" + depends on FIQ_DEBUGGER + select OF_OVERLAY + default n + help + If enabled, fiq debugger is calling fiq_debugger_uart_overlay() + that will apply overlay uart_overlay@0 to disable proper uart.
+ config FIQ_WATCHDOG bool select FIQ_DEBUGGER diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger.c b/drivers/staging/android/fiq_debugger/fiq_debugger.c index 7f056831dbff..0c558331a3d2 100644 --- a/drivers/staging/android/fiq_debugger/fiq_debugger.c +++ b/drivers/staging/android/fiq_debugger/fiq_debugger.c @@ -39,6 +39,10 @@ #include #endif +#ifdef CONFIG_FIQ_DEBUGGER_UART_OVERLAY +#include +#endif + #include #include "fiq_debugger.h" @@ -1201,10 +1205,36 @@ static struct platform_driver fiq_debugger_driver = { }, }; +#if defined(CONFIG_FIQ_DEBUGGER_UART_OVERLAY) +int fiq_debugger_uart_overlay(void) +{ + struct device_node *onp = of_find_node_by_path("/uart_overlay@0"); + int ret; + + if (!onp) { + pr_err("serial_debugger: uart overlay not found\n"); + return -ENODEV; + } + + ret = of_overlay_create(onp); + if (ret < 0) { + pr_err("serial_debugger: fail to create overlay: %d\n", ret); + of_node_put(onp); + return ret; + } + + pr_info("serial_debugger: uart overlay applied\n"); + return 0; +} +#endif + static int __init fiq_debugger_init(void) { #if defined(CONFIG_FIQ_DEBUGGER_CONSOLE) fiq_debugger_tty_init(); +#endif +#if defined(CONFIG_FIQ_DEBUGGER_UART_OVERLAY) + fiq_debugger_uart_overlay(); #endif return platform_driver_register(&fiq_debugger_driver); } From 0ecbd590a7175818fcbccc9d5b1dc819aae25c86 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Thu, 17 Dec 2015 07:34:27 +0800 Subject: [PATCH 0050/3220] sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task() If a newly created task is selected to go to a different CPU in fork balance when it wakes up the first time, its load averages should not be removed from the source CPU since they are never added to it before. The same is also applicable to a never used group entity. Fix it in remove_entity_load_avg(): when entity's last_update_time is 0, simply return. This should precisely identify the case in question, because in other migrations, the last_update_time is set to 0 after remove_entity_load_avg(). Reported-by: Steve Muckle Signed-off-by: Yuyang Du [peterz: cfs_rq_last_update_time] Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20151216233427.GJ28098@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cfdc0e61066c..6e7d36f2ed8f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2809,6 +2809,27 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } +#ifndef CONFIG_64BIT +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + u64 last_update_time_copy; + u64 last_update_time; + + do { + last_update_time_copy = cfs_rq->load_last_update_time_copy; + smp_rmb(); + last_update_time = cfs_rq->avg.last_update_time; + } while (last_update_time != last_update_time_copy); + + return last_update_time; +} +#else +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.last_update_time; +} +#endif + /* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). 
@@ -2818,17 +2839,14 @@ void remove_entity_load_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 last_update_time; -#ifndef CONFIG_64BIT - u64 last_update_time_copy; + /* + * Newly created task or never used group entity should not be removed + * from its (source) cfs_rq + */ + if (se->avg.last_update_time == 0) + return; - do { - last_update_time_copy = cfs_rq->load_last_update_time_copy; - smp_rmb(); - last_update_time = cfs_rq->avg.last_update_time; - } while (last_update_time != last_update_time_copy); -#else - last_update_time = cfs_rq->avg.last_update_time; -#endif + last_update_time = cfs_rq_last_update_time(cfs_rq); __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
From 259bd4cd8a4c5776e86ae1a86f1f8e1168e84212 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 17 Sep 2015 16:10:56 +0100 Subject: [PATCH 0051/3220] cpufreq: Frequency invariant scheduler load-tracking support Implements cpufreq_scale_freq_capacity() to provide the scheduler with a frequency scaling correction factor for more accurate load-tracking. The factor is: current_freq(cpu) << SCHED_CAPACITY_SHIFT / max_freq(cpu) In fact, freq_scale should be a struct cpufreq_policy data member. But this would require the scheduler hot path (__update_load_avg()) to grab the cpufreq lock. This can be avoided by using per-cpu data initialized to SCHED_CAPACITY_SCALE for freq_scale. Signed-off-by: Dietmar Eggemann --- drivers/cpufreq/cpufreq.c | 29 +++++++++++++++++++++++++++++ include/linux/cpufreq.h | 3 +++ 2 files changed, 32 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 8412ce5f93a7..46a7d62c8174 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -347,6 +347,31 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) #endif } +/********************************************************************* + * FREQUENCY INVARIANT CPU CAPACITY * + *********************************************************************/ + +static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; + +static void +scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) +{ + unsigned long cur = freqs ?
freqs->new : policy->cur; + unsigned long scale = (cur << SCHED_CAPACITY_SHIFT) / policy->max; + int cpu; + + pr_debug("cpus %*pbl cur/cur max freq %lu/%u kHz freq scale %lu\n", + cpumask_pr_args(policy->cpus), cur, policy->max, scale); + + for_each_cpu(cpu, policy->cpus) + per_cpu(freq_scale, cpu) = scale; +} + +unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) +{ + return per_cpu(freq_scale, cpu); +} + static void __cpufreq_notify_transition(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, unsigned int state) { @@ -450,6 +475,8 @@ wait: spin_unlock(&policy->transition_lock); + scale_freq_capacity(policy, freqs); + cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); } EXPORT_SYMBOL_GPL(cpufreq_freq_transition_begin); @@ -2126,6 +2153,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_NOTIFY, new_policy); + scale_freq_capacity(new_policy, NULL); + policy->min = new_policy->min; policy->max = new_policy->max; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 177c7680c1a8..11c65f536b6c 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -616,4 +616,7 @@ unsigned int cpufreq_generic_get(unsigned int cpu); int cpufreq_generic_init(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int transition_latency); + +struct sched_domain; +unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); #endif /* _LINUX_CPUFREQ_H */
From 372b62344c227283acd23d6ca63543a4ceb27abb Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 23 Sep 2015 12:47:48 +0100 Subject: [PATCH 0052/3220] arm: Enable frequency invariant scheduler load-tracking support Defines arch_scale_freq_capacity() to use cpufreq implementation. Signed-off-by: Dietmar Eggemann --- arch/arm/include/asm/topology.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 370f7a732900..a69917b7d2c9 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -24,6 +24,11 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +#ifdef CONFIG_CPU_FREQ +#include <linux/cpufreq.h> +#define arch_scale_freq_capacity cpufreq_scale_freq_capacity +#endif + #else static inline void init_cpu_topology(void) { }
From 8411396e9c5cd1badfe05e08624d90cbbb7b1126 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 25 Sep 2015 17:15:11 +0100 Subject: [PATCH 0053/3220] arm64: Enable frequency invariant scheduler load-tracking support Defines arch_scale_freq_capacity() to use cpufreq implementation. Including <linux/cpufreq.h> in topology.h like for the arm arch doesn't work because of CONFIG_COMPAT=y (Kernel support for 32-bit EL0). That's why cpufreq_scale_freq_capacity() has to be declared extern in topology.h.
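A worked example of the correction factor these three patches wire up (illustrative frequencies, not from any particular platform): with SCHED_CAPACITY_SHIFT = 10, a cpu whose policy->max is 2000000 kHz currently running at 1000000 kHz gets

	freq_scale = (1000000 << 10) / 2000000 = 512

i.e. half of SCHED_CAPACITY_SCALE (1024), so the scheduler's load-tracking accrues utilization at half the rate it would at the maximum P-state.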
Signed-off-by: Dietmar Eggemann --- arch/arm64/include/asm/topology.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index a3e9d6fdbf21..72c47c3fa7b3 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -22,6 +22,12 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +#ifdef CONFIG_CPU_FREQ +#define arch_scale_freq_capacity cpufreq_scale_freq_capacity +struct sched_domain; +extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); +#endif + #include <asm-generic/topology.h> #endif /* _ASM_ARM_TOPOLOGY_H */
From 0f8edc82743fd749a3536c68386c89112e760792 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 14 Apr 2015 16:25:31 +0100 Subject: [PATCH 0054/3220] arm: Update arch_scale_cpu_capacity() to reflect change to define arch_scale_cpu_capacity() is no longer a weak function but a #define instead. Include the #define in topology.h. cc: Russell King Signed-off-by: Morten Rasmussen --- arch/arm/include/asm/topology.h | 2 ++ arch/arm/kernel/topology.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index a69917b7d2c9..e3e596cbb1a7 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -28,6 +28,8 @@ const struct cpumask *cpu_coregroup_mask(int cpu); #include <linux/cpufreq.h> #define arch_scale_freq_capacity cpufreq_scale_freq_capacity #endif +#define arch_scale_cpu_capacity scale_cpu_capacity +extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu); #else diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 08b7847bf912..614554765e44 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -42,7 +42,7 @@ */ static DEFINE_PER_CPU(unsigned long, cpu_scale); -unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) { return per_cpu(cpu_scale, cpu); }
From 1eb2b8aa45840b50cd953052315a8b4f6bc6416d Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 7 May 2015 18:46:15 +0100 Subject: [PATCH 0055/3220] sched: Store system-wide maximum cpu capacity in root domain To be able to compare the capacity of the target cpu with the highest cpu capacity of the system in the wakeup path, store the system-wide maximum cpu capacity in the root domain.
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 8 ++++++++ kernel/sched/sched.h | 3 +++ 2 files changed, 11 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 732e993b564b..7c6e2b326349 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6971,6 +6971,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; + struct rq *rq = NULL; int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); @@ -7021,11 +7022,18 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { + rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); + + if (rq->cpu_capacity_orig > rq->rd->max_cpu_capacity) + rq->rd->max_cpu_capacity = rq->cpu_capacity_orig; } rcu_read_unlock(); + if (rq) + pr_info("max cpu_capacity %lu\n", rq->rd->max_cpu_capacity); + ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b242775bf670..2b7ffa5a20ad 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -543,6 +543,9 @@ struct root_domain { */ cpumask_var_t rto_mask; struct cpupri cpupri; + + /* Maximum cpu capacity in the system. */ + unsigned long max_cpu_capacity; }; extern struct root_domain def_root_domain;
From e8bcb272bc920d875c26ca2e06c872b9f844f6f1 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Sat, 9 May 2015 19:53:49 +0100 Subject: [PATCH 0056/3220] sched: Add cpu capacity awareness to wakeup balancing Wakeup balancing is completely unaware of cpu capacity, cpu utilization and task utilization. The task is preferably placed on a cpu which is idle in the instant the wakeup happens. New tasks (SD_BALANCE_{FORK,EXEC}) are placed on an idle cpu in the idlest group if such can be found, otherwise it goes on the least loaded one. Existing tasks (SD_BALANCE_WAKE) are placed on the previous cpu or an idle cpu sharing the same last level cache unless the wakee_flips heuristic in wake_wide() decides to fall back to considering cpus outside SD_LLC. Hence existing tasks are not guaranteed to get a chance to migrate to a different group at wakeup in case the current one has reduced cpu capacity (due to RT/IRQ pressure or different uarch e.g. ARM big.LITTLE). They may eventually get pulled by other cpus doing periodic/idle/nohz_idle balance, but it may take quite a while before it happens. This patch adds capacity awareness to find_idlest_{group,queue} (used by SD_BALANCE_{FORK,EXEC} and SD_BALANCE_WAKE under certain circumstances) such that groups/cpus that can accommodate the waking task based on task utilization are preferred. In addition, wakeup of existing tasks (SD_BALANCE_WAKE) is sent through find_idlest_{group,queue} also if the task doesn't fit the capacity of the previous cpu to allow it to escape (override wake_affine) when necessary instead of relying on periodic/idle/nohz_idle balance to eventually sort it out.
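To make the ~20% margin concrete (the fit test itself appears in the diff below; the utilization numbers here are illustrative): with capacity_margin = 1280 a cpu fits a task if

	capacity_of(cpu) * 1024 > (cpu_util(cpu) + task_util(p)) * 1280

so a waking task with util_avg 400 does not fit an otherwise idle little cpu of capacity 430 (430 * 1024 = 440320 < 512000 = 400 * 1280), and its wakeup is sent through find_idlest_{group,queue} to look for a higher-capacity cpu.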
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e7d36f2ed8f..5e112bba1b5a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4742,6 +4742,43 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) return 1; } +static inline unsigned long task_util(struct task_struct *p) +{ + return p->se.avg.util_avg; +} + +static unsigned int capacity_margin = 1280; /* ~20% margin */ + +static inline bool __task_fits(struct task_struct *p, int cpu, int util) +{ + unsigned long capacity = capacity_of(cpu); + + util += task_util(p); + + return (capacity * 1024) > (util * capacity_margin); +} + +static inline bool task_fits_max(struct task_struct *p, int cpu) +{ + unsigned long capacity = capacity_of(cpu); + unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity; + + if (capacity == max_capacity) + return true; + + if (capacity * capacity_margin > max_capacity * 1024) + return true; + + return __task_fits(p, cpu, 0); +} + +static int cpu_util(int cpu); + +static inline bool task_fits_spare(struct task_struct *p, int cpu) +{ + return __task_fits(p, cpu, cpu_util(cpu)); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -4751,7 +4788,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; + struct sched_group *fit_group = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long fit_capacity = ULONG_MAX; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -4782,6 +4821,15 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load = target_load(i, load_idx); avg_load += load; + + /* + * Look for most energy-efficient group that can fit + * the task.
+ */ + if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) { + fit_capacity = capacity_of(i); + fit_group = group; + } } /* Adjust by relative CPU capacity of the group */ @@ -4795,6 +4843,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } } while (group = group->next, group != sd->groups); + if (fit_group) + return fit_group; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; @@ -4815,7 +4866,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - if (idle_cpu(i)) { + if (task_fits_spare(p, i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { @@ -4827,7 +4878,8 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) min_exit_latency = idle->exit_latency; latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if ((!idle || idle->exit_latency == min_exit_latency) && + } else if (idle_cpu(i) && + (!idle || idle->exit_latency == min_exit_latency) && rq->idle_stamp > latest_idle_timestamp) { /* * If equal or no active idle state, then @@ -4836,6 +4888,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; + } else if (shallowest_idle_cpu == -1) { + /* + * If we haven't found an idle CPU yet + * pick a non-idle one that can fit the task as + * fallback. + */ + shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i); @@ -4950,7 +5009,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && task_fits_max(p, cpu) && + cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) {
From 8a5c0339bb7b7eedd2e75b62ebe4144a195f7ef3 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 6 Jul 2015 15:01:10 +0100 Subject: [PATCH 0057/3220] sched: Consider spare cpu capacity at task wake-up find_idlest_group() selects the wake-up target group purely based on group load which leads to suboptimal choices in low load scenarios. An idle group with reduced capacity (due to RT tasks or different cpu type) isn't necessarily a better target than a lightly loaded group with higher capacity. The patch adds spare capacity as an additional group selection parameter. The target group is now selected based on the following criteria: 1. Return the group containing the cpu with the most spare capacity, if such a group exists and the spare capacity is significant. Significant spare capacity currently means at least 20% to spare. 2. Return the group with the lowest load, unless it is the local group in which case NULL is returned and the search is continued at the next (lower) level.
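The "significant" threshold follows from the initialization in the diff below: max_spare_capacity starts at capacity_margin - SCHED_LOAD_SCALE = 1280 - 1024 = 256, so a group is only returned as spare_group if one of its cpus still has more than 256 capacity units left after subtracting cpu_util(), i.e. more than a fifth of the 1280 margin (the ~20% mentioned above).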
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5e112bba1b5a..bebc8367edee 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4788,9 +4788,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *fit_group = NULL; + struct sched_group *fit_group = NULL, *spare_group = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; unsigned long fit_capacity = ULONG_MAX; + unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -4798,7 +4799,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load; + unsigned long load, avg_load, spare_capacity; int local_group; int i; @@ -4830,6 +4831,16 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, fit_capacity = capacity_of(i); fit_group = group; } + + /* + * Look for group which has most spare capacity on a + * single cpu. + */ + spare_capacity = capacity_of(i) - cpu_util(i); + if (spare_capacity > max_spare_capacity) { + max_spare_capacity = spare_capacity; + spare_group = group; + } } /* Adjust by relative CPU capacity of the group */ @@ -4846,6 +4857,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (fit_group) return fit_group; + if (spare_group) + return spare_group; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest;
From cda2bd333be6e5e8c38fa072c9c4c296312dfd8d Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 26 Jan 2015 19:47:28 +0000 Subject: [PATCH 0058/3220] sched: Enable idle balance to pull single task towards cpu with higher capacity We do not want to miss out on the ability to pull a single remaining task from a potential source cpu towards an idle destination cpu. Add an extra criterion to need_active_balance() to kick off active load balance if the source cpu is over-utilized and has lower capacity than the destination cpu. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bebc8367edee..fe40cdb8e640 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4779,6 +4779,11 @@ static inline bool task_fits_spare(struct task_struct *p, int cpu) return __task_fits(p, cpu, cpu_util(cpu)); } +static bool cpu_overutilized(int cpu) +{ + return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain.
@@ -6995,6 +7000,13 @@ static int need_active_balance(struct lb_env *env) return 1; } + if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && + env->src_rq->cfs.h_nr_running == 1 && + cpu_overutilized(env->src_cpu) && + !cpu_overutilized(env->dst_cpu)) { + return 1; + } + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); }
From 681fa1413b869ca73f685b9c8769cdd645879804 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 2 Jul 2015 17:16:34 +0100 Subject: [PATCH 0059/3220] sched: Prevent unnecessary active balance of single task in sched group Scenarios with the busiest group having just one task and the local group being idle on topologies with sched groups with different numbers of cpus manage to dodge all load-balance bailout conditions, resulting in the nr_balance_failed counter being incremented. This eventually causes a pointless active migration of the task. This patch prevents this by not incrementing the counter when the busiest group only has one task. ASYM_PACKING migrations and migrations due to reduced capacity should still take place as these are explicitly captured by need_active_balance(). A better solution would be to not attempt the load-balance in the first place, but that requires significant changes to the order of bailout conditions and statistics gathering. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe40cdb8e640..632163ef7628 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5629,6 +5629,7 @@ struct lb_env { int new_dst_cpu; enum cpu_idle_type idle; long imbalance; + unsigned int src_grp_nr_running; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus; @@ -6591,6 +6592,8 @@ next_group: if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); + env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; + if (!env->sd->parent) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) @@ -7219,7 +7222,8 @@ more_balance: * excessive cache_hot migrations and active balances. */ if (idle != CPU_NEWLY_IDLE) - sd->nr_balance_failed++; + if (env.src_grp_nr_running > 1) + sd->nr_balance_failed++; if (need_active_balance(&env)) { raw_spin_lock_irqsave(&busiest->lock, flags);
From d8c8088f41033b1cea9d88149046e1f8bdb9ade0 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 13:43:28 +0000 Subject: [PATCH 0060/3220] sched: Documentation for scheduler energy cost model This documentation patch provides an overview of the experimental scheduler energy costing model, associated data structures, and a reference recipe on how platforms can be characterized to derive energy models. Signed-off-by: Morten Rasmussen --- Documentation/scheduler/sched-energy.txt | 362 +++++++++++++++++++++++ 1 file changed, 362 insertions(+) create mode 100644 Documentation/scheduler/sched-energy.txt diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt new file mode 100644 index 000000000000..dab2f9088b33 --- /dev/null +++ b/Documentation/scheduler/sched-energy.txt @@ -0,0 +1,362 @@ +Energy cost model for energy-aware scheduling (EXPERIMENTAL) + +Introduction +============= + +The basic energy model uses platform energy data stored in sched_group_energy +data structures attached to the sched_groups in the sched_domain hierarchy.
The +energy cost model offers two functions that can be used to guide scheduling +decisions: + +1. static unsigned int sched_group_energy(struct energy_env *eenv) +2. static int energy_diff(struct energy_env *eenv) + +sched_group_energy() estimates the energy consumed by all cpus in a specific +sched_group including any shared resources owned exclusively by this group of +cpus. Resources shared with other cpus are excluded (e.g. later level caches). + +energy_diff() estimates the total energy impact of a utilization change. That +is, adding, removing, or migrating utilization (tasks). + +Both functions use a struct energy_env to specify the scenario to be evaluated: + + struct energy_env { + struct sched_group *sg_top; + struct sched_group *sg_cap; + int cap_idx; + int util_delta; + int src_cpu; + int dst_cpu; + int energy; + }; + +sg_top: sched_group to be evaluated. Not used by energy_diff(). + +sg_cap: sched_group covering the cpus in the same frequency domain. Set by +sched_group_energy(). + +cap_idx: Capacity state to be used for energy calculations. Set by +find_new_capacity(). + +util_delta: Amount of utilization to be added, removed, or migrated. + +src_cpu: Source cpu from where 'util_delta' utilization is removed. Should be +-1 if no source (e.g. task wake-up). + +dst_cpu: Destination cpu where 'util_delta' utilization is added. Should be -1 +if utilization is removed (e.g. terminating tasks). + +energy: Result of sched_group_energy(). + +The metric used to represent utilization is the actual per-entity running time +averaged over time using a geometric series. Very similar to the existing +per-entity load-tracking, but _not_ scaled by task priority and capped by the +capacity of the cpu. The latter property does mean that utilization may +underestimate the compute requirements for tasks on fully/over utilized cpus. +The greatest potential for energy savings without affecting performance too much +is in scenarios where the system isn't fully utilized. If the system is deemed +fully utilized, load-balancing should be done with task load (includes task +priority) instead in the interest of fairness and performance. + + +Background and Terminology +=========================== + +To make it clear from the start: + +energy = [joule] (resource like a battery on powered devices) +power = energy/time = [joule/second] = [watt] + +The goal of energy-aware scheduling is to minimize energy, while still getting +the job done. That is, we want to maximize: + + performance [inst/s] + -------------------- + power [W] + +which is equivalent to minimizing: + + energy [J] + ----------- + instruction + +while still getting 'good' performance. It is essentially an alternative +optimization objective to the current performance-only objective for the +scheduler. This alternative considers two objectives: energy-efficiency and +performance. Hence, there needs to be a user controllable knob to switch the +objective. Since it is early days, this is currently a sched_feature +(ENERGY_AWARE). + +The idea behind introducing an energy cost model is to allow the scheduler to +evaluate the implications of its decisions rather than applying energy-saving +techniques blindly that may only have positive effects on some platforms. At +the same time, the energy cost model must be as simple as possible to minimize +the scheduler latency impact.
+ +Platform topology +------------------ + +The system topology (cpus, caches, and NUMA information, not peripherals) is +represented in the scheduler by the sched_domain hierarchy which has +sched_groups attached at each level that covers one or more cpus (see +sched-domains.txt for more details). To add energy awareness to the scheduler +we need to consider power and frequency domains. + +Power domain: + +A power domain is a part of the system that can be powered on/off +independently. Power domains are typically organized in a hierarchy where you +may be able to power down just a cpu or a group of cpus along with any +associated resources (e.g. shared caches). Powering up a cpu means that all +power domains it is a part of in the hierarchy must be powered up. Hence, it is +more expensive to power up the first cpu that belongs to a higher level power +domain than powering up additional cpus in the same high level domain. Two +level power domain hierarchy example: + + Power source + +-------------------------------+----... +per group PD G G + | +----------+ | + +--------+-------| Shared | (other groups) +per-cpu PD G G | resource | + | | +----------+ + +-------+ +-------+ + | CPU 0 | | CPU 1 | + +-------+ +-------+ + +Frequency domain: + +Frequency domains (P-states) typically cover the same group of cpus as one of +the power domain levels. That is, there might be several smaller power domains +sharing the same frequency (P-state) or there might be a power domain spanning +multiple frequency domains. + +From a scheduling point of view there is no need to know the actual frequencies +[Hz]. All the scheduler cares about is the compute capacity available at the +current state (P-state) the cpu is in and any other available states. For that +reason, and to also factor in any cpu micro-architecture differences, compute +capacity scaling states are called 'capacity states' in this document. For SMP +systems this is equivalent to P-states. For mixed micro-architecture systems +(like ARM big.LITTLE) it is P-states scaled according to the micro-architecture +performance relative to the other cpus in the system. + +Energy modelling: +------------------ + +Due to the hierarchical nature of the power domains, the most obvious way to +model energy costs is therefore to associate power and energy costs with +domains (groups of cpus). Energy costs of shared resources are associated with +the group of cpus that share the resources, only the cost of powering the +cpu itself and any private resources (e.g. private L1 caches) is associated +with the per-cpu groups (lowest level). + +For example, for an SMP system with per-cpu power domains and a cluster level +(group of cpus) power domain we get the overall energy costs to be: + + energy = energy_cluster + n * energy_cpu + +where 'n' is the number of cpus powered up and energy_cluster is the cost paid +as soon as any cpu in the cluster is powered up. + +The power and frequency domains can naturally be mapped onto the existing +sched_domain hierarchy and sched_groups by adding the necessary data to the +existing data structures. + +The energy model considers energy consumption from two contributors (shown in +the illustration below): + +1. Busy energy: Energy consumed while a cpu and the higher level groups that it +belongs to are busy running tasks. Busy energy is associated with the state of +the cpu, not an event. The time the cpu spends in this state varies. Thus, the +most obvious platform parameter for this contribution is busy power +(energy/time). 
+ +2. Idle energy: Energy consumed while a cpu and higher level groups that it +belongs to are idle (in a C-state). Like busy energy, idle energy is associated +with the state of the cpu. Thus, the platform parameter for this contribution +is idle power (energy/time). + +Energy consumed during transitions from an idle-state (C-state) to a busy state +(P-state) or going the other way is ignored by the model to simplify the energy +model calculations. + + + Power + ^ + | busy->idle idle->busy + | transition transition + | + | _ __ + | / \ / \__________________ + |______________/ \ / + | \ / + | Busy \ Idle / Busy + | low P-state \____________/ high P-state + | + +------------------------------------------------------------> time + +Busy |--------------| |-----------------| + +Wakeup |------| |------| + +Idle |------------| + + +The basic algorithm +==================== + +The basic idea is to determine the total energy impact when utilization is +added or removed by estimating the impact at each level in the sched_domain +hierarchy starting from the bottom (sched_group contains just a single cpu). +The energy cost comes from busy time (sched_group is awake because one or more +cpus are busy) and idle time (in an idle-state). Energy model numbers account +for energy costs associated with all cpus in the sched_group as a group. + + for_each_domain(cpu, sd) { + sg = sched_group_of(cpu) + energy_before = curr_util(sg) * busy_power(sg) + + (1-curr_util(sg)) * idle_power(sg) + energy_after = new_util(sg) * busy_power(sg) + + (1-new_util(sg)) * idle_power(sg) + energy_diff += energy_before - energy_after + + } + + return energy_diff + +{curr, new}_util: The cpu utilization at the lowest level and the overall +non-idle time for the entire group for higher levels. Utilization is in the +range 0.0 to 1.0 in the pseudo-code. + +busy_power: The power consumption of the sched_group. + +idle_power: The power consumption of the sched_group when idle. + +Note: It is a fundamental assumption that the utilization is (roughly) scale +invariant. Task utilization tracking factors in any frequency scaling and +performance scaling differences due to different cpu microarchitectures such +that task utilization can be used across the entire system. + + +Platform energy data +===================== + +struct sched_group_energy can be attached to sched_groups in the sched_domain +hierarchy and has the following members: + +cap_states: + List of struct capacity_state representing the supported capacity states + (P-states). struct capacity_state has two members: cap and power, which + represent the compute capacity and the busy_power of the state. The + list must be ordered by capacity low->high. + +nr_cap_states: + Number of capacity states in cap_states list. + +idle_states: + List of struct idle_state containing idle_state power cost for each + idle-state supported by the system ordered by shallowest state first. + All states must be included at all levels in the hierarchy, i.e. a + sched_group spanning just a single cpu must also include coupled + idle-states (cluster states). In addition to the cpuidle idle-states, + the list must also contain an entry for idling using the arch + default idle (arch_idle_cpu()). Although this state may not be a true + hardware idle-state, it is considered the shallowest idle-state in the + energy model and must be the first entry. cpus may enter this state + (possibly 'active idling') if cpuidle decides not to enter a cpuidle + idle-state.
Default idle may not be used when cpuidle is enabled. + In this case, it should just be a copy of the first cpuidle idle-state. + +nr_idle_states: + Number of idle states in idle_states list. + +There are no unit requirements for the energy cost data. Data can be normalized +with any reference, however, the normalization must be consistent across all +energy cost data. That is, one bogo-joule/watt must be the same quantity for +all data, but we don't care what it is. + +A recipe for platform characterization +======================================= + +Obtaining the actual model data for a particular platform requires some way of +measuring power/energy. There isn't a tool to help with this (yet). This +section provides a recipe for use as reference. It covers the steps used to +characterize the ARM TC2 development platform. This sort of measurement is +expected to be done anyway when tuning cpuidle and cpufreq for a given +platform. + +The energy model needs two types of data (struct sched_group_energy holds +these) for each sched_group where energy costs should be taken into account: + +1. Capacity state information + +A list containing the compute capacity and power consumption when fully +utilized attributed to the group as a whole for each available capacity state. +At the lowest level (group contains just a single cpu) this is the power of the +cpu alone without including power consumed by resources shared with other cpus. +It basically needs to fit the basic modelling approach described in the "Background +and Terminology" section: + + energy_system = energy_shared + n * energy_cpu + +for a system containing 'n' busy cpus. Only 'energy_cpu' should be included at +the lowest level. 'energy_shared' is included at the next level which +represents the group of cpus among which the resources are shared. + +This model is, of course, a simplification of reality. Thus, power/energy +attributions might not always exactly represent how the hardware is designed. +Also, busy power is likely to depend on the workload. It is therefore +recommended to use a representative mix of workloads when characterizing the +capacity states. + +If the group has no capacity scaling support, the list will contain a single +state where power is the busy power attributed to the group. The capacity +should be set to a default value (1024). + +When frequency domains include multiple power domains, the group representing +the frequency domain and all child groups share capacity states. This must be +indicated by setting the SD_SHARE_CAP_STATES sched_domain flag. All groups at +all levels that share the capacity state must have the list of capacity states +with the power set to the contribution of the individual group. + +2. Idle power information + +Stored in the idle_states list. The power number is the group idle power +consumption in each idle state as well as when the group is idle but has not +entered an idle-state ('active idle' as mentioned earlier). Due to the way the +energy model is defined, the idle power of the deepest group idle state can +alternatively be accounted for in the parent group busy power. In that case the +group idle state power values are offset such that the idle power of the +deepest state is zero. It is less intuitive, but it is easier to measure as +idle power consumed by the group and the busy/idle power of the parent group +cannot be distinguished without per group measurement points.
+ +Measuring capacity states and idle power: + +The capacity states' capacity and power can be estimated by running a benchmark +workload at each available capacity state. By restricting the benchmark to run +on subsets of cpus it is possible to extrapolate the power consumption of +shared resources. + +ARM TC2 has two clusters of two and three cpus respectively. Each cluster has a +shared L2 cache. TC2 has on-chip energy counters per cluster. Running a +benchmark workload on just one cpu in a cluster means that power is consumed in +the cluster (higher level group) and a single cpu (lowest level group). Adding +another benchmark task to another cpu increases the power consumption by the +amount consumed by the additional cpu. Hence, it is possible to extrapolate the +cluster busy power. + +For platforms that don't have energy counters or equivalent instrumentation +built-in, it may be possible to use an external DAQ to acquire similar data. + +If the benchmark includes some performance score (for example sysbench cpu +benchmark), this can be used to record the compute capacity. + +Measuring idle power requires insight into the idle state implementation on the +particular platform. Specifically, if the platform has coupled idle-states (or +package states). To measure non-coupled per-cpu idle-states it is necessary to +keep one cpu busy to keep any shared resources alive to isolate the idle power +of the cpu from idle/busy power of the shared resources. The cpu can be tricked +into different per-cpu idle states by disabling the other states. Based on +various combinations of measurements with specific cpus busy and disabling +idle-states it is possible to extrapolate the idle-state power.
From e496f328c87173fe188e91c6cd60197fb8e7e305 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 13:45:51 +0000 Subject: [PATCH 0061/3220] sched: Make energy awareness a sched feature This patch introduces the ENERGY_AWARE sched feature, which is implemented using jump labels when SCHED_DEBUG is defined. It is statically set false when SCHED_DEBUG is not defined. Hence this doesn't allow energy awareness to be enabled without SCHED_DEBUG. This sched_feature knob will be replaced later with a more appropriate control knob when things have matured a bit. ENERGY_AWARE is based on per-entity load-tracking, hence FAIR_GROUP_SCHED must be enabled. This dependency isn't checked at compile time yet. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 5 +++++ kernel/sched/features.h | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 632163ef7628..efc65c3db6b5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4651,6 +4651,11 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +static inline bool energy_aware(void) +{ + return sched_feat(ENERGY_AWARE); +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 69631fa46c2f..b634151ce286 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -69,3 +69,8 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) +/* + * Energy aware scheduling. Use platform energy model to guide scheduling + * decisions optimizing for energy efficiency.
+ */ +SCHED_FEAT(ENERGY_AWARE, false) From 0b3bda54d245ebcfeefc61f0ff763e76ca2a8784 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 16:08:45 +0000 Subject: [PATCH 0062/3220] sched: Introduce energy data structures The struct sched_group_energy represents the per sched_group related data which is needed for energy aware scheduling. It contains: (1) number of elements of the idle state array (2) pointer to the idle state array which comprises 'power consumption' for each idle state (3) number of elements of the capacity state array (4) pointer to the capacity state array which comprises 'compute capacity and power consumption' tuples for each capacity state The struct sched_group obtains a pointer to a struct sched_group_energy. The function pointer sched_domain_energy_f is introduced into struct sched_domain_topology_level which will allow the arch to pass a particular struct sched_group_energy from the topology shim layer into the scheduler core. The function pointer sched_domain_energy_f has an 'int cpu' parameter since the folding of two adjacent sd levels via sd degenerate doesn't work for all sd levels. I.e. it is not possible for example to use this feature to provide per-cpu energy in sd level DIE on ARM's TC2 platform. It was discussed that the folding of sd levels approach is preferable over the cpu parameter approach, simply because the user (the arch specifying the sd topology table) can introduce less errors. But since it is not working, the 'int cpu' parameter is the only way out. It's possible to use the folding of sd levels approach for sched_domain_flags_f and the cpu parameter approach for the sched_domain_energy_f at the same time though. With the use of the 'int cpu' parameter, an extra check function has to be provided to make sure that all cpus spanned by a sched group are provisioned with the same energy data. 
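As an illustration of how an arch could provision these structures through the topology shim layer (hypothetical function and array names, made-up capacity/power numbers):

	static struct idle_state idle_states_cluster[] = {
		{ .power = 10 },	/* arch default idle / WFI */
		{ .power =  0 },	/* cluster power-down */
	};

	static struct capacity_state cap_states_cluster[] = {
		/* cap, power; list ordered by capacity low->high */
		{ .cap = 150, .power = 25 },
		{ .cap = 300, .power = 55 },
		{ .cap = 430, .power = 90 },
	};

	static struct sched_group_energy energy_cluster = {
		.nr_idle_states	= ARRAY_SIZE(idle_states_cluster),
		.idle_states	= idle_states_cluster,
		.nr_cap_states	= ARRAY_SIZE(cap_states_cluster),
		.cap_states	= cap_states_cluster,
	};

	/* sched_domain_energy_f implementation for one sd level */
	static const struct sched_group_energy * const cpu_cluster_energy(int cpu)
	{
		return &energy_cluster;	/* same data for every cpu in the group */
	}

The arch's sched_domain_topology_level table would then carry cpu_cluster_energy in the new energy member at the corresponding level.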
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Dietmar Eggemann --- include/linux/sched.h | 19 +++++++++++++++++++ kernel/sched/sched.h | 1 + 2 files changed, 20 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index fa39434e3fdd..5bee272a86d8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1021,6 +1021,22 @@ struct sched_domain_attr { extern int sched_domain_level_max; +struct capacity_state { + unsigned long cap; /* compute capacity */ + unsigned long power; /* power consumption at this compute capacity */ +}; + +struct idle_state { + unsigned long power; /* power consumption in this idle state */ +}; + +struct sched_group_energy { + unsigned int nr_idle_states; /* number of idle states */ + struct idle_state *idle_states; /* ptr to idle state array */ + unsigned int nr_cap_states; /* number of capacity states */ + struct capacity_state *cap_states; /* ptr to capacity state array */ +}; + struct sched_group; struct sched_domain { @@ -1119,6 +1135,8 @@ bool cpus_share_cache(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); +typedef +const struct sched_group_energy * const(*sched_domain_energy_f)(int cpu); #define SDTL_OVERLAP 0x01 @@ -1131,6 +1149,7 @@ struct sd_data { struct sched_domain_topology_level { sched_domain_mask_f mask; sched_domain_flags_f sd_flags; + sched_domain_energy_f energy; int flags; int numa_level; struct sd_data data; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2b7ffa5a20ad..1813cba2995d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -863,6 +863,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; + const struct sched_group_energy const *sge; /* * The CPUs this group covers. From 4ce990ec65231af6a148c2c0e6b4745fc1d12368 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 16:20:20 +0000 Subject: [PATCH 0063/3220] sched: Initialize energy data structures The sched_group_energy (sge) pointer of the first sched_group (sg) in the sched_domain (sd) is initialized to point to the appropriate (in terms of sd level and cpu) sge data defined in the arch and so to the correct part of the Energy Model (EM). Energy-aware scheduling allows that a system has only EM data up to a certain sd level (so called highest energy aware balancing sd level). A check in init_sched_energy() enforces that all sd's below this sd level contain EM data. The 'int cpu' parameter of sched_domain_energy_f requires that check_sched_energy_data() makes sure that all cpus spanned by a sg are provisioned with the same EM data. This patch has also been tested with feature FORCE_SD_OVERLAP enabled. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 65 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7c6e2b326349..f1b1c9eaeb10 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6304,6 +6304,66 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } +/* + * Check that the per-cpu provided sd energy data is consistent for all cpus + * within the mask. 
+ */ +static inline void check_sched_energy_data(int cpu, sched_domain_energy_f fn, + const struct cpumask *cpumask) +{ + const struct sched_group_energy * const sge = fn(cpu); + struct cpumask mask; + int i; + + if (cpumask_weight(cpumask) <= 1) + return; + + cpumask_xor(&mask, cpumask, get_cpu_mask(cpu)); + + for_each_cpu(i, &mask) { + const struct sched_group_energy * const e = fn(i); + int y; + + BUG_ON(e->nr_idle_states != sge->nr_idle_states); + + for (y = 0; y < (e->nr_idle_states); y++) { + BUG_ON(e->idle_states[y].power != + sge->idle_states[y].power); + } + + BUG_ON(e->nr_cap_states != sge->nr_cap_states); + + for (y = 0; y < (e->nr_cap_states); y++) { + BUG_ON(e->cap_states[y].cap != sge->cap_states[y].cap); + BUG_ON(e->cap_states[y].power != + sge->cap_states[y].power); + } + } +} + +static void init_sched_energy(int cpu, struct sched_domain *sd, + sched_domain_energy_f fn) +{ + if (!(fn && fn(cpu))) + return; + + if (cpu != group_balance_cpu(sd->groups)) + return; + + if (sd->child && !sd->child->groups->sge) { + pr_err("BUG: EAS setup broken for CPU%d\n", cpu); +#ifdef CONFIG_SCHED_DEBUG + pr_err(" energy data on %s but not on %s domain\n", + sd->name, sd->child->name); +#endif + return; + } + + check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups)); + + sd->groups->sge = fn(cpu); +} + /* * Initializers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains() @@ -7010,10 +7070,13 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { + struct sched_domain_topology_level *tl = sched_domain_topology; + if (!cpumask_test_cpu(i, cpu_map)) continue; - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) { + init_sched_energy(i, sd, tl->energy); claim_allocations(i, sd); init_sched_groups_capacity(i, sd); } From f0f739d887a4f144ab4937e619288d4cede2cc91 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 13:50:46 +0000 Subject: [PATCH 0064/3220] sched: Introduce SD_SHARE_CAP_STATES sched_domain flag cpufreq is currently keeping it a secret which cpus are sharing clock source. The scheduler needs to know about clock domains as well to become more energy aware. The SD_SHARE_CAP_STATES domain flag indicates whether cpus belonging to the sched_domain share capacity states (P-states). There is no connection with cpufreq (yet). The flag must be set by the arch specific topology code. 
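Once an arch sets this flag, the scheduler core can answer "do these two
cpus share P-states?" by looking up the highest sched_domain carrying the
flag. A minimal sketch, not part of the patch (a later patch in this
series caches exactly this lookup in a per-cpu sd_scs pointer):

static bool share_cap_states(int this_cpu, int that_cpu)
{
	struct sched_domain *sd;
	bool ret = false;

	rcu_read_lock();
	sd = highest_flag_domain(this_cpu, SD_SHARE_CAP_STATES);
	if (sd)
		ret = cpumask_test_cpu(that_cpu, sched_domain_span(sd));
	rcu_read_unlock();

	return ret;
}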
cc: Russell King
cc: Ingo Molnar
cc: Peter Zijlstra
Signed-off-by: Morten Rasmussen
---
 arch/arm/kernel/topology.c | 3 ++-
 include/linux/sched.h      | 1 +
 kernel/sched/core.c        | 10 +++++++---
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 614554765e44..38e7be162b79 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -277,7 +277,8 @@ void store_cpu_topology(unsigned int cpuid)
 
 static inline int cpu_corepower_flags(void)
 {
-	return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
+	return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \
+	       SD_SHARE_CAP_STATES;
 }
 
 static struct sched_domain_topology_level arm_topology[] = {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5bee272a86d8..09e2d43406eb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -989,6 +989,7 @@ extern void wake_up_q(struct wake_q_head *head);
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
 #define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
 #define SD_NUMA			0x4000	/* cross-node balancing */
+#define SD_SHARE_CAP_STATES	0x8000	/* Domain members share capacity state */
 
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f1b1c9eaeb10..2863d223c07e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5775,7 +5775,8 @@ static int sd_degenerate(struct sched_domain *sd)
 			 SD_BALANCE_EXEC |
 			 SD_SHARE_CPUCAPACITY |
 			 SD_SHARE_PKG_RESOURCES |
-			 SD_SHARE_POWERDOMAIN)) {
+			 SD_SHARE_POWERDOMAIN |
+			 SD_SHARE_CAP_STATES)) {
 		if (sd->groups != sd->groups->next)
 			return 0;
 	}
@@ -5807,7 +5808,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 				SD_SHARE_CPUCAPACITY |
 				SD_SHARE_PKG_RESOURCES |
 				SD_PREFER_SIBLING |
-				SD_SHARE_POWERDOMAIN);
+				SD_SHARE_POWERDOMAIN |
+				SD_SHARE_CAP_STATES);
 		if (nr_node_ids == 1)
 			pflags &= ~SD_SERIALIZE;
 	}
@@ -6472,6 +6474,7 @@ static int sched_domains_curr_level;
  * SD_SHARE_PKG_RESOURCES - describes shared caches
  * SD_NUMA                - describes NUMA topologies
  * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ * SD_SHARE_CAP_STATES    - describes shared capacity states
  *
  * Odd one out:
  * SD_ASYM_PACKING        - describes SMT quirks
@@ -6481,7 +6484,8 @@ static int sched_domains_curr_level;
 			 SD_SHARE_PKG_RESOURCES |	\
 			 SD_NUMA |			\
 			 SD_ASYM_PACKING |		\
-			 SD_SHARE_POWERDOMAIN)
+			 SD_SHARE_POWERDOMAIN |		\
+			 SD_SHARE_CAP_STATES)
 
 static struct sched_domain *
 sd_init(struct sched_domain_topology_level *tl, int cpu)

From c7aeeb88c7370af2fc4e2e8bd156e001f7a96d11 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Fri, 10 Jul 2015 13:57:19 +0100
Subject: [PATCH 0065/3220] arm: Cpu invariant scheduler load-tracking and
 capacity support

Provides the scheduler with a cpu scaling correction factor for more
accurate load-tracking and cpu capacity handling.

The Energy Model (EM) (in fact the capacity value of the last element
of the capacity states vector of the core (MC) level sched_group_energy
structure) is used instead of the arm arch specific cpu_efficiency and
dtb property 'clock-frequency' values as the source for this cpu
scaling factor.

The cpu capacity value depends on the micro-architecture and the
maximum frequency of the cpu.

The maximum frequency part should not be confused with the frequency
invariant scheduler load-tracking support which deals with frequency
related scaling due to DVFS functionality.
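As a worked example, using the A57/A53 numbers from the DT bindings
documentation added later in this series (illustrative only): the last
capacity state of the A57 core-level EM is {cap = 1024, power = 616} and
that of the A53 is {cap = 447, power = 93}, so

	cpu_scale(A57) = cap_states[nr_cap_states - 1].cap = 1024
	cpu_scale(A53) = cap_states[nr_cap_states - 1].cap =  447

i.e. an A53 running flat out accrues roughly 447/1024 of the utilization
an A57 would accrue over the same wall-clock time.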
Signed-off-by: Juri Lelli
Signed-off-by: Dietmar Eggemann
---
 arch/arm/kernel/topology.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 38e7be162b79..da1c611a3b5e 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -153,6 +153,8 @@ static void __init parse_dt_topology(void)
 
 }
 
+static const struct sched_group_energy * const cpu_core_energy(int cpu);
+
 /*
  * Look for a customed capacity of a CPU in the cpu_capacity table during the
  * boot. The update of all CPUs is in O(n^2) for heteregeneous system but the
  * function returns directly for SMP system.
  */
 static void update_cpu_capacity(unsigned int cpu)
 {
-	if (!cpu_capacity(cpu))
-		return;
+	unsigned long capacity = SCHED_CAPACITY_SCALE;
 
-	set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);
+	if (cpu_core_energy(cpu)) {
+		int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1;
+		capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap;
+	}
+
+	set_capacity_scale(cpu, capacity);
 
 	pr_info("CPU%u: update cpu_capacity %lu\n",
 		cpu, arch_scale_cpu_capacity(NULL, cpu));

From 7c791bf110c2c1884f02f0c4ceaff886fad2a907 Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Thu, 30 Apr 2015 11:53:48 +0100
Subject: [PATCH 0066/3220] arm64: Cpu invariant scheduler load-tracking and
 capacity support

Provides the scheduler with a cpu scaling correction factor for more
accurate load-tracking and cpu capacity handling.

The Energy Model (EM) (in fact the capacity value of the last element
of the capacity states vector of the core (MC) level sched_group_energy
structure) is used as the source for this cpu scaling factor.

The cpu capacity value depends on the micro-architecture and the
maximum frequency of the cpu.

The maximum frequency part should not be confused with the frequency
invariant scheduler load-tracking support which deals with frequency
related scaling due to DVFS functionality.
Signed-off-by: Juri Lelli Signed-off-by: Dietmar Eggemann --- arch/arm64/include/asm/topology.h | 4 ++- arch/arm64/kernel/topology.c | 42 +++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 72c47c3fa7b3..9370a336c934 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -22,11 +22,13 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +struct sched_domain; #ifdef CONFIG_CPU_FREQ #define arch_scale_freq_capacity cpufreq_scale_freq_capacity -struct sched_domain; extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); #endif +#define arch_scale_cpu_capacity scale_cpu_capacity +extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu); #include diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 694f6deedbab..fb99a6735fd4 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -23,6 +23,18 @@ #include #include +static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; + +unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + return per_cpu(cpu_scale, cpu); +} + +static void set_capacity_scale(unsigned int cpu, unsigned long capacity) +{ + per_cpu(cpu_scale, cpu) = capacity; +} + static int __init get_cpu_for_node(struct device_node *node) { struct device_node *cpu_node; @@ -211,6 +223,35 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return &cpu_topology[cpu].core_sibling; } +static inline int cpu_corepower_flags(void) +{ + return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \ + SD_SHARE_CAP_STATES; +} + +static struct sched_domain_topology_level arm64_topology[] = { +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +static void update_cpu_capacity(unsigned int cpu) +{ + unsigned long capacity = SCHED_CAPACITY_SCALE; + + if (cpu_core_energy(cpu)) { + int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1; + capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap; + } + + set_capacity_scale(cpu, capacity); + + pr_info("CPU%d: update cpu_capacity %lu\n", + cpu, arch_scale_cpu_capacity(NULL, cpu)); +} + static void update_siblings_masks(unsigned int cpuid) { struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; @@ -272,6 +313,7 @@ void store_cpu_topology(unsigned int cpuid) topology_populated: update_siblings_masks(cpuid); + update_cpu_capacity(cpuid); } static void __init reset_cpu_topology(void) From 3e55d2f2fcaf44ac19eb143d895924ff402e375c Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 14:11:28 +0000 Subject: [PATCH 0067/3220] sched: Compute cpu capacity available at current frequency capacity_orig_of() returns the max available compute capacity of a cpu. For scale-invariant utilization tracking and energy-aware scheduling decisions it is useful to know the compute capacity available at the current OPP of a cpu. 
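As a worked example with hypothetical numbers: for a cpu with a
capacity_orig of 1024 currently running at half its maximum frequency,
arch_scale_freq_capacity() reports 512, so the capacity available at the
current OPP is

	capacity_curr = capacity_orig * arch_scale_freq_capacity(NULL, cpu)
				>> SCHED_CAPACITY_SHIFT
		      = 1024 * 512 >> 10
		      = 512

which is what the helper introduced below computes.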
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index efc65c3db6b5..ea966d9193cd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4651,6 +4651,17 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +static unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); From b6c0399e0f340887f369c6870ba6a865afe0b0ec Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 11 Dec 2014 15:25:29 +0000 Subject: [PATCH 0068/3220] sched: Relocated cpu_util() and change return type Move cpu_util() to an earlier position in fair.c and change return type to unsigned long as negative usage doesn't make much sense. All other load and capacity related functions use unsigned long including the caller of cpu_util(). cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 70 ++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ea966d9193cd..318141042a3e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4662,6 +4662,40 @@ static unsigned long capacity_curr_of(int cpu) >> SCHED_CAPACITY_SHIFT; } +/* + * cpu_util returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). + */ +static unsigned long cpu_util(int cpu) +{ + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); + + return (util >= capacity) ? 
capacity : util; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); @@ -4788,8 +4822,6 @@ static inline bool task_fits_max(struct task_struct *p, int cpu) return __task_fits(p, cpu, 0); } -static int cpu_util(int cpu); - static inline bool task_fits_spare(struct task_struct *p, int cpu) { return __task_fits(p, cpu, cpu_util(cpu)); @@ -4988,40 +5020,6 @@ done: return target; } -/* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - */ -static int cpu_util(int cpu) -{ - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); - - return (util >= capacity) ? capacity : util; -} - /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, From 5ec8ccabfef3989f4514f22c034a15aeab691110 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 2 Jan 2015 17:08:52 +0000 Subject: [PATCH 0069/3220] sched: Highest energy aware balancing sched_domain level pointer Add another member to the family of per-cpu sched_domain shortcut pointers. This one, sd_ea, points to the highest level at which energy model is provided. At this level and all levels below all sched_groups have energy model data attached. Partial energy model information is possible but restricted to providing energy model data for lower level sched_domains (sd_ea and below) and leaving load-balancing on levels above to non-energy-aware load-balancing. For example, it is possible to apply energy-aware scheduling within each socket on a multi-socket system and let normal scheduling handle load-balancing between sockets. 
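A consumer dereferences the new shortcut under RCU and can then walk the
sched_groups at that level, each of which is guaranteed to carry energy
data. A minimal sketch, not part of the patch (the energy_diff() helper
added later in this series follows this pattern):

	struct sched_domain *sd;
	struct sched_group *sg;

	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_ea, cpu));
	if (sd) {
		sg = sd->groups;
		do {
			/* sg->sge is valid for every group at this level */
		} while (sg = sg->next, sg != sd->groups);
	}
	rcu_read_unlock();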
cc: Ingo Molnar
cc: Peter Zijlstra
Signed-off-by: Morten Rasmussen
---
 kernel/sched/core.c  | 11 ++++++++++-
 kernel/sched/sched.h | 1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2863d223c07e..45e54b0e3669 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5993,11 +5993,12 @@ DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain *, sd_busy);
 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+DEFINE_PER_CPU(struct sched_domain *, sd_ea);
 
 static void update_top_cache_domain(int cpu)
 {
 	struct sched_domain *sd;
-	struct sched_domain *busy_sd = NULL;
+	struct sched_domain *busy_sd = NULL, *ea_sd = NULL;
 	int id = cpu;
 	int size = 1;
 
@@ -6018,6 +6019,14 @@ static void update_top_cache_domain(int cpu)
 
 	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
 	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+
+	for_each_domain(cpu, sd) {
+		if (sd->groups->sge)
+			ea_sd = sd;
+		else
+			break;
+	}
+	rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1813cba2995d..a77516a15ba0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -839,6 +839,7 @@ DECLARE_PER_CPU(int, sd_llc_id);
 DECLARE_PER_CPU(struct sched_domain *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain *, sd_busy);
 DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+DECLARE_PER_CPU(struct sched_domain *, sd_ea);
 
 struct sched_group_capacity {
 	atomic_t ref;

From c1770a521398102b6f2447502035887297b17ea0 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Thu, 18 Dec 2014 14:47:18 +0000
Subject: [PATCH 0070/3220] sched: Calculate energy consumption of sched_group

For energy-aware load-balancing decisions it is necessary to know the
energy consumption estimates of groups of cpus. This patch introduces a
basic function, sched_group_energy(), which estimates the energy
consumption of the cpus in the group and any resources shared by the
members of the group.

NOTE: The function has five levels of indentation and breaks the 80
character limit. Refactoring is necessary.

cc: Ingo Molnar
cc: Peter Zijlstra
Signed-off-by: Morten Rasmussen
---
 kernel/sched/core.c  | 4 ++
 kernel/sched/fair.c  | 156 +++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h | 1 +
 3 files changed, 161 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 45e54b0e3669..bc5bb26ccf3b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5994,6 +5994,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain *, sd_busy);
 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
 DEFINE_PER_CPU(struct sched_domain *, sd_ea);
+DEFINE_PER_CPU(struct sched_domain *, sd_scs);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -6027,6 +6028,9 @@ static void update_top_cache_domain(int cpu)
 		break;
 	}
 	rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
+
+	sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
+	rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 318141042a3e..49f7567c6b07 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4701,6 +4701,162 @@ static inline bool energy_aware(void)
 	return sched_feat(ENERGY_AWARE);
 }
 
+/*
+ * cpu_norm_util() returns the cpu util relative to a specific capacity,
+ * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for
+ * energy calculations.
Using the scale-invariant util returned by + * cpu_util() and approximating scale-invariant util by: + * + * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time + * + * the normalized util can be found using the specific capacity. + * + * capacity = capacity_orig * curr_freq/max_freq + * + * norm_util = running_time/time ~ util/capacity + */ +static unsigned long cpu_norm_util(int cpu, unsigned long capacity) +{ + int util = cpu_util(cpu); + + if (util >= capacity) + return SCHED_CAPACITY_SCALE; + + return (util << SCHED_CAPACITY_SHIFT)/capacity; +} + +static unsigned long group_max_util(struct sched_group *sg) +{ + int i; + unsigned long max_util = 0; + + for_each_cpu(i, sched_group_cpus(sg)) + max_util = max(max_util, cpu_util(i)); + + return max_util; +} + +/* + * group_norm_util() returns the approximated group util relative to it's + * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in + * energy calculations. Since task executions may or may not overlap in time in + * the group the true normalized util is between max(cpu_norm_util(i)) and + * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The + * latter is used as the estimate as it leads to a more pessimistic energy + * estimate (more busy). + */ +static unsigned long group_norm_util(struct sched_group *sg, int cap_idx) +{ + int i; + unsigned long util_sum = 0; + unsigned long capacity = sg->sge->cap_states[cap_idx].cap; + + for_each_cpu(i, sched_group_cpus(sg)) + util_sum += cpu_norm_util(i, capacity); + + if (util_sum > SCHED_CAPACITY_SCALE) + return SCHED_CAPACITY_SCALE; + return util_sum; +} + +static int find_new_capacity(struct sched_group *sg, + const struct sched_group_energy const *sge) +{ + int idx; + unsigned long util = group_max_util(sg); + + for (idx = 0; idx < sge->nr_cap_states; idx++) { + if (sge->cap_states[idx].cap >= util) + return idx; + } + + return idx; +} + +/* + * sched_group_energy(): Computes the absolute energy consumption of cpus + * belonging to the sched_group including shared resources shared only by + * members of the group. Iterates over all cpus in the hierarchy below the + * sched_group starting from the bottom working it's way up before going to + * the next cpu until all cpus are covered at all levels. The current + * implementation is likely to gather the same util statistics multiple times. + * This can probably be done in a faster but more complex way. + * Note: sched_group_energy() may fail when racing with sched_domain updates. + */ +static int sched_group_energy(struct sched_group *sg_top) +{ + struct sched_domain *sd; + int cpu, total_energy = 0; + struct cpumask visit_cpus; + struct sched_group *sg; + + WARN_ON(!sg_top->sge); + + cpumask_copy(&visit_cpus, sched_group_cpus(sg_top)); + + while (!cpumask_empty(&visit_cpus)) { + struct sched_group *sg_shared_cap = NULL; + + cpu = cpumask_first(&visit_cpus); + + /* + * Is the group utilization affected by cpus outside this + * sched_group? + */ + sd = rcu_dereference(per_cpu(sd_scs, cpu)); + + if (!sd) + /* + * We most probably raced with hotplug; returning a + * wrong energy estimation is better than entering an + * infinite loop. + */ + return -EINVAL; + + if (sd->parent) + sg_shared_cap = sd->parent->groups; + + for_each_domain(cpu, sd) { + sg = sd->groups; + + /* Has this sched_domain already been visited? 
*/ + if (sd->child && group_first_cpu(sg) != cpu) + break; + + do { + struct sched_group *sg_cap_util; + unsigned long group_util; + int sg_busy_energy, sg_idle_energy, cap_idx; + + if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) + sg_cap_util = sg_shared_cap; + else + sg_cap_util = sg; + + cap_idx = find_new_capacity(sg_cap_util, sg->sge); + group_util = group_norm_util(sg, cap_idx); + sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) + >> SCHED_CAPACITY_SHIFT; + sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power) + >> SCHED_CAPACITY_SHIFT; + + total_energy += sg_busy_energy + sg_idle_energy; + + if (!sd->child) + cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg)); + + if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(sg_top))) + goto next_cpu; + + } while (sg = sg->next, sg != sd->groups); + } +next_cpu: + continue; + } + + return total_energy; +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a77516a15ba0..a618db2936d4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -840,6 +840,7 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); DECLARE_PER_CPU(struct sched_domain *, sd_ea); +DECLARE_PER_CPU(struct sched_domain *, sd_scs); struct sched_group_capacity { atomic_t ref; From df2030c8412e6acdb3f7ec62ed195cc7967c30e2 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 2 Jan 2015 14:21:56 +0000 Subject: [PATCH 0071/3220] sched: Extend sched_group_energy to test load-balancing decisions Extended sched_group_energy() to support energy prediction with usage (tasks) added/removed from a specific cpu or migrated between a pair of cpus. Useful for load-balancing decision making. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 90 +++++++++++++++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 27 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 49f7567c6b07..f3de37414a66 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4688,12 +4688,21 @@ static unsigned long capacity_curr_of(int cpu) * capacity_orig) as it useful for predicting the capacity required after task * migrations (scheduler-driven DVFS). */ -static unsigned long cpu_util(int cpu) +static unsigned long __cpu_util(int cpu, int delta) { unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); - return (util >= capacity) ? capacity : util; + delta += util; + if (delta < 0) + return 0; + + return (delta >= capacity) ? capacity : delta; +} + +static unsigned long cpu_util(int cpu) +{ + return __cpu_util(cpu, 0); } static inline bool energy_aware(void) @@ -4701,8 +4710,18 @@ static inline bool energy_aware(void) return sched_feat(ENERGY_AWARE); } +struct energy_env { + struct sched_group *sg_top; + struct sched_group *sg_cap; + int cap_idx; + int util_delta; + int src_cpu; + int dst_cpu; + int energy; +}; + /* - * cpu_norm_util() returns the cpu util relative to a specific capacity, + * __cpu_norm_util() returns the cpu util relative to a specific capacity, * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for * energy calculations. 
Using the scale-invariant util returned by * cpu_util() and approximating scale-invariant util by: @@ -4715,9 +4734,9 @@ static inline bool energy_aware(void) * * norm_util = running_time/time ~ util/capacity */ -static unsigned long cpu_norm_util(int cpu, unsigned long capacity) +static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta) { - int util = cpu_util(cpu); + int util = __cpu_util(cpu, delta); if (util >= capacity) return SCHED_CAPACITY_SCALE; @@ -4725,13 +4744,25 @@ static unsigned long cpu_norm_util(int cpu, unsigned long capacity) return (util << SCHED_CAPACITY_SHIFT)/capacity; } -static unsigned long group_max_util(struct sched_group *sg) +static int calc_util_delta(struct energy_env *eenv, int cpu) { - int i; + if (cpu == eenv->src_cpu) + return -eenv->util_delta; + if (cpu == eenv->dst_cpu) + return eenv->util_delta; + return 0; +} + +static +unsigned long group_max_util(struct energy_env *eenv) +{ + int i, delta; unsigned long max_util = 0; - for_each_cpu(i, sched_group_cpus(sg)) - max_util = max(max_util, cpu_util(i)); + for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) { + delta = calc_util_delta(eenv, i); + max_util = max(max_util, __cpu_util(i, delta)); + } return max_util; } @@ -4745,31 +4776,36 @@ static unsigned long group_max_util(struct sched_group *sg) * latter is used as the estimate as it leads to a more pessimistic energy * estimate (more busy). */ -static unsigned long group_norm_util(struct sched_group *sg, int cap_idx) +static unsigned +long group_norm_util(struct energy_env *eenv, struct sched_group *sg) { - int i; + int i, delta; unsigned long util_sum = 0; - unsigned long capacity = sg->sge->cap_states[cap_idx].cap; + unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap; - for_each_cpu(i, sched_group_cpus(sg)) - util_sum += cpu_norm_util(i, capacity); + for_each_cpu(i, sched_group_cpus(sg)) { + delta = calc_util_delta(eenv, i); + util_sum += __cpu_norm_util(i, capacity, delta); + } if (util_sum > SCHED_CAPACITY_SCALE) return SCHED_CAPACITY_SCALE; return util_sum; } -static int find_new_capacity(struct sched_group *sg, +static int find_new_capacity(struct energy_env *eenv, const struct sched_group_energy const *sge) { int idx; - unsigned long util = group_max_util(sg); + unsigned long util = group_max_util(eenv); for (idx = 0; idx < sge->nr_cap_states; idx++) { if (sge->cap_states[idx].cap >= util) - return idx; + break; } + eenv->cap_idx = idx; + return idx; } @@ -4783,16 +4819,16 @@ static int find_new_capacity(struct sched_group *sg, * This can probably be done in a faster but more complex way. * Note: sched_group_energy() may fail when racing with sched_domain updates. 
*/ -static int sched_group_energy(struct sched_group *sg_top) +static int sched_group_energy(struct energy_env *eenv) { struct sched_domain *sd; int cpu, total_energy = 0; struct cpumask visit_cpus; struct sched_group *sg; - WARN_ON(!sg_top->sge); + WARN_ON(!eenv->sg_top->sge); - cpumask_copy(&visit_cpus, sched_group_cpus(sg_top)); + cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top)); while (!cpumask_empty(&visit_cpus)) { struct sched_group *sg_shared_cap = NULL; @@ -4824,17 +4860,16 @@ static int sched_group_energy(struct sched_group *sg_top) break; do { - struct sched_group *sg_cap_util; unsigned long group_util; int sg_busy_energy, sg_idle_energy, cap_idx; if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) - sg_cap_util = sg_shared_cap; + eenv->sg_cap = sg_shared_cap; else - sg_cap_util = sg; + eenv->sg_cap = sg; - cap_idx = find_new_capacity(sg_cap_util, sg->sge); - group_util = group_norm_util(sg, cap_idx); + cap_idx = find_new_capacity(eenv, sg->sge); + group_util = group_norm_util(eenv, sg); sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) >> SCHED_CAPACITY_SHIFT; sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power) @@ -4845,7 +4880,7 @@ static int sched_group_energy(struct sched_group *sg_top) if (!sd->child) cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg)); - if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(sg_top))) + if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top))) goto next_cpu; } while (sg = sg->next, sg != sd->groups); @@ -4854,7 +4889,8 @@ next_cpu: continue; } - return total_energy; + eenv->energy = total_energy; + return 0; } /* From 2c6a8a48a701ca5d767f249eacb776f80858cbaa Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 6 Jan 2015 17:34:05 +0000 Subject: [PATCH 0072/3220] sched: Estimate energy impact of scheduling decisions Adds a generic energy-aware helper function, energy_diff(), that calculates energy impact of adding, removing, and migrating utilization in the system. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f3de37414a66..043715612227 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4893,6 +4893,58 @@ next_cpu: return 0; } +static inline bool cpu_in_sg(struct sched_group *sg, int cpu) +{ + return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); +} + +/* + * energy_diff(): Estimate the energy impact of changing the utilization + * distribution. eenv specifies the change: utilisation amount, source, and + * destination cpu. Source or destination cpu may be -1 in which case the + * utilization is removed from or added to the system (e.g. task wake-up). If + * both are specified, the utilization is migrated. + */ +static int energy_diff(struct energy_env *eenv) +{ + struct sched_domain *sd; + struct sched_group *sg; + int sd_cpu = -1, energy_before = 0, energy_after = 0; + + struct energy_env eenv_before = { + .util_delta = 0, + .src_cpu = eenv->src_cpu, + .dst_cpu = eenv->dst_cpu, + }; + + if (eenv->src_cpu == eenv->dst_cpu) + return 0; + + sd_cpu = (eenv->src_cpu != -1) ? 
eenv->src_cpu : eenv->dst_cpu;
+	sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
+
+	if (!sd)
+		return 0; /* Error */
+
+	sg = sd->groups;
+
+	do {
+		if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
+			eenv_before.sg_top = eenv->sg_top = sg;
+
+			if (sched_group_energy(&eenv_before))
+				return 0; /* Invalid result abort */
+			energy_before += eenv_before.energy;
+
+			if (sched_group_energy(eenv))
+				return 0; /* Invalid result abort */
+			energy_after += eenv->energy;
+		}
+	} while (sg = sg->next, sg != sd->groups);
+
+	return energy_after-energy_before;
+}
+
 /*
  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
  * A waker of many should wake a different task than the one last awakened

From 1b5ec5d8ab918a62446f91ddfdb3eba88ea8c8dd Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Sat, 9 May 2015 16:49:57 +0100
Subject: [PATCH 0073/3220] sched: Add over-utilization/tipping point indicator

Energy-aware scheduling is only meant to be active while the system is
_not_ over-utilized. That is, there are spare cycles available to shift
tasks around based on their actual utilization to get a more
energy-efficient task distribution without depriving any tasks. When
above the tipping point task placement is done the traditional way based
on load_avg, spreading the tasks across as many cpus as possible based
on priority scaled load to preserve smp_nice. Below the tipping point we
want to use util_avg instead. We need to define a criterion for when we
make the switch.

The util_avg for each cpu converges towards 100% (1024) regardless of
how many additional tasks we may put on it. If we define over-utilized
as:

	sum_{cpus}(rq.cfs.avg.util_avg) + margin > sum_{cpus}(rq.capacity)

some individual cpus may be over-utilized running multiple tasks even
when the above condition is false. That should be okay as long as we try
to spread the tasks out to avoid per-cpu over-utilization as much as
possible and if all tasks have the _same_ priority. If the latter isn't
true, we have to consider priority to preserve smp_nice.

For example, we could have n_cpus nice=-10 util_avg=55% tasks and
n_cpus/2 nice=0 util_avg=60% tasks. Balancing based on util_avg we are
likely to end up with nice=-10 tasks sharing cpus and nice=0 tasks
getting their own as we have 1.5*n_cpus tasks in total and 55%+55% is
less over-utilized than 55%+60% for those cpus that have to be shared.
The system utilization is only 85% of the system capacity, but we are
breaking smp_nice.

To be sure not to break smp_nice, we have defined over-utilization
conservatively as when any cpu in the system is fully utilized at its
highest frequency instead:

	cpu_rq(any).cfs.avg.util_avg + margin > cpu_rq(any).capacity

IOW, as soon as one cpu is (nearly) 100% utilized, we switch to load_avg
to factor in priority to preserve smp_nice.

With this definition, we can skip periodic load-balance as no cpu has an
always-running task when the system is not over-utilized. All tasks will
be periodic and we can balance them at wake-up. This conservative
condition does however mean that some scenarios that could benefit from
energy-aware decisions even if one cpu is fully utilized would not get
those benefits.

For systems where some cpus might have reduced capacity (RT-pressure
and/or big.LITTLE), we want periodic load-balance checks as soon as just
a single cpu is fully utilized as it might be one of those with reduced
capacity and in that case we want to migrate it.
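The diff below declares cpu_overutilized() but its definition is not part
of this excerpt. A minimal sketch of the per-cpu test described above,
assuming a capacity_margin of 1280 (util is compared against ~80% of
capacity; the constant and the helper body are assumptions here):

static unsigned int capacity_margin = 1280; /* ~20% headroom, assumed */

static bool cpu_overutilized(int cpu)
{
	/* (nearly) fully utilized at the highest OPP? */
	return (capacity_of(cpu) * 1024) <
	       (cpu_util(cpu) * capacity_margin);
}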
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 31 +++++++++++++++++++++++++------ kernel/sched/sched.h | 3 +++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 043715612227..3cfb4be3d88f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4144,6 +4144,8 @@ static inline void hrtick_update(struct rq *rq) } #endif +static bool cpu_overutilized(int cpu); + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -4154,6 +4156,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int task_new = !(flags & ENQUEUE_WAKEUP); for_each_sched_entity(se) { if (se->on_rq) @@ -4185,9 +4188,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { add_nr_running(rq, 1); - + if (!task_new && !rq->rd->overutilized && + cpu_overutilized(rq->cpu)) + rq->rd->overutilized = true; + } hrtick_update(rq); } @@ -6651,11 +6657,12 @@ group_type group_classify(struct sched_group *group, * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. * @overload: Indicate more than one runnable task for any CPU. + * @overutilized: Indicate overutilization for any CPU. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs, - bool *overload) + bool *overload, bool *overutilized) { unsigned long load; int i; @@ -6685,6 +6692,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_weighted_load += weighted_cpuload(i); if (idle_cpu(i)) sgs->idle_cpus++; + + if (cpu_overutilized(i)) + *overutilized = true; } /* Adjust by relative CPU capacity of the group */ @@ -6790,7 +6800,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; - bool overload = false; + bool overload = false, overutilized = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -6812,7 +6822,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd } update_sg_lb_stats(env, sg, load_idx, local_group, sgs, - &overload); + &overload, &overutilized); if (local_group) goto next_group; @@ -6856,8 +6866,14 @@ next_group: /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; - } + /* Update over-utilization (tipping point, U >= 0) indicator */ + if (env->dst_rq->rd->overutilized != overutilized) + env->dst_rq->rd->overutilized = overutilized; + } else { + if (!env->dst_rq->rd->overutilized && overutilized) + env->dst_rq->rd->overutilized = true; + } } /** @@ -8250,6 +8266,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + + if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) + rq->rd->overutilized = true; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a618db2936d4..5323d4fc1109 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -528,6 +528,9 @@ struct root_domain { /* Indicate more than one runnable task for any CPU */ bool overload; + /* Indicate one or more cpus over-utilized 
(tipping point) */ + bool overutilized; + /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). From 19a5ebe13dded57ab2b04defd2e379579da05197 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 27 Jan 2015 13:48:07 +0000 Subject: [PATCH 0074/3220] sched, cpuidle: Track cpuidle state index in the scheduler The idle-state of each cpu is currently pointed to by rq->idle_state but there isn't any information in the struct cpuidle_state that can used to look up the idle-state energy model data stored in struct sched_group_energy. For this purpose is necessary to store the idle state index as well. Ideally, the idle-state data should be unified. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- drivers/cpuidle/cpuidle.c | 4 ++-- include/linux/cpuidle.h | 2 +- kernel/sched/idle.c | 3 ++- kernel/sched/sched.h | 21 +++++++++++++++++++++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 17a6dc0e2111..b9a6cceb7bac 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -192,7 +192,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, } /* Take note of the planned idle state. */ - sched_idle_set_state(target_state); + sched_idle_set_state(target_state, index); trace_cpu_idle_rcuidle(index, dev->cpu); time_start = ktime_get(); @@ -205,7 +205,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); /* The cpu is no longer idle or about to enter idle. */ - sched_idle_set_state(NULL); + sched_idle_set_state(NULL, -1); if (broadcast) { if (WARN_ON_ONCE(!irqs_disabled())) diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 786ad32631a6..6eae1576499e 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -204,7 +204,7 @@ static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv, #endif /* kernel/sched/idle.c */ -extern void sched_idle_set_state(struct cpuidle_state *idle_state); +extern void sched_idle_set_state(struct cpuidle_state *idle_state, int index); extern void default_idle_call(void); #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 4a2ef5a02fd3..cbc130efbc5b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -19,9 +19,10 @@ * sched_idle_set_state - Record idle state for the current CPU. * @idle_state: State to record. 
*/ -void sched_idle_set_state(struct cpuidle_state *idle_state) +void sched_idle_set_state(struct cpuidle_state *idle_state, int index) { idle_set_state(this_rq(), idle_state); + idle_set_state_idx(this_rq(), index); } static int __read_mostly cpu_idle_force_poll; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5323d4fc1109..2301b0476b90 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -693,6 +693,7 @@ struct rq { #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; + int idle_state_idx; #endif }; @@ -1285,6 +1286,17 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) WARN_ON(!rcu_read_lock_held()); return rq->idle_state; } + +static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx) +{ + rq->idle_state_idx = idle_state_idx; +} + +static inline int idle_get_state_idx(struct rq *rq) +{ + WARN_ON(!rcu_read_lock_held()); + return rq->idle_state_idx; +} #else static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) @@ -1295,6 +1307,15 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) { return NULL; } + +static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx) +{ +} + +static inline int idle_get_state_idx(struct rq *rq) +{ + return -1; +} #endif extern void sysrq_sched_debug_show(void); From ec055b978e851118a2b9d42f61ecad619cd1ce77 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 27 Jan 2015 14:04:17 +0000 Subject: [PATCH 0075/3220] sched: Determine the current sched_group idle-state To estimate the energy consumption of a sched_group in sched_group_energy() it is necessary to know which idle-state the group is in when it is idle. For now, it is assumed that this is the current idle-state (though it might be wrong). Based on the individual cpu idle-states group_idle_state() finds the group idle-state. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3cfb4be3d88f..c11bc7392916 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4815,6 +4815,20 @@ static int find_new_capacity(struct energy_env *eenv, return idx; } +static int group_idle_state(struct sched_group *sg) +{ + int i, state = INT_MAX; + + /* Find the shallowest idle state in the sched group. 
*/
+	for_each_cpu(i, sched_group_cpus(sg))
+		state = min(state, idle_get_state_idx(cpu_rq(i)));
+
+	/* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
+	state++;
+
+	return state;
+}
+
 /*
  * sched_group_energy(): Computes the absolute energy consumption of cpus
  * belonging to the sched_group including shared resources shared only by
@@ -4867,7 +4881,8 @@ static int sched_group_energy(struct energy_env *eenv)
 
 		do {
 			unsigned long group_util;
-			int sg_busy_energy, sg_idle_energy, cap_idx;
+			int sg_busy_energy, sg_idle_energy;
+			int cap_idx, idle_idx;
 
 			if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
 				eenv->sg_cap = sg_shared_cap;
@@ -4875,11 +4890,13 @@ static int sched_group_energy(struct energy_env *eenv)
 				eenv->sg_cap = sg;
 
 			cap_idx = find_new_capacity(eenv, sg->sge);
+			idle_idx = group_idle_state(sg);
 			group_util = group_norm_util(eenv, sg);
 			sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
-								>> SCHED_CAPACITY_SHIFT;
-			sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power)
-								>> SCHED_CAPACITY_SHIFT;
+						>> SCHED_CAPACITY_SHIFT;
+			sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
+						* sg->sge->idle_states[idle_idx].power)
+						>> SCHED_CAPACITY_SHIFT;
 
 			total_energy += sg_busy_energy + sg_idle_energy;

From e38982ba3289974385bd8fead9bf767d8cdde386 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Sat, 9 May 2015 20:03:19 +0100
Subject: [PATCH 0076/3220] sched: Energy-aware wake-up task placement

Let available compute capacity and estimated energy impact select the
wake-up target cpu when energy-aware scheduling is enabled and the
system is not over-utilized (i.e. below the tipping point).

energy_aware_wake_cpu() attempts to find a group of cpus with sufficient
compute capacity to accommodate the task and then a cpu with enough
spare capacity to handle the task within that group. Preference is given
to cpus with enough spare capacity at the current OPP. Finally, the
energy impact of the new target and the previous task cpu is compared to
select the wake-up target cpu.

cc: Ingo Molnar
cc: Peter Zijlstra
Signed-off-by: Morten Rasmussen
---
 kernel/sched/fair.c | 89 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 86 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c11bc7392916..682b4ae9ebd7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5287,6 +5287,86 @@ done:
 	return target;
 }
 
+static int energy_aware_wake_cpu(struct task_struct *p, int target)
+{
+	struct sched_domain *sd;
+	struct sched_group *sg, *sg_target;
+	int target_max_cap = INT_MAX;
+	int target_cpu = task_cpu(p);
+	int i;
+
+	sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
+
+	if (!sd)
+		return target;
+
+	sg = sd->groups;
+	sg_target = sg;
+
+	/*
+	 * Find group with sufficient capacity. We only get here if no cpu is
+	 * overutilized. We may end up overutilizing a cpu by adding the task,
+	 * but that should not be any worse than select_idle_sibling().
+	 * load_balance() should sort it out later as we get above the tipping
+	 * point.
+	 */
+	do {
+		/* Assuming all cpus are the same in group */
+		int max_cap_cpu = group_first_cpu(sg);
+
+		/*
+		 * Assume smaller max capacity means more energy-efficient.
+		 * Ideally we should query the energy model for the right
+		 * answer but it easily ends up in an exhaustive search.
+ */ + if (capacity_of(max_cap_cpu) < target_max_cap && + task_fits_max(p, max_cap_cpu)) { + sg_target = sg; + target_max_cap = capacity_of(max_cap_cpu); + } + } while (sg = sg->next, sg != sd->groups); + + /* Find cpu with sufficient capacity */ + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + int new_util = cpu_util(i) + task_util(p); + + if (new_util > capacity_orig_of(i)) + continue; + + if (new_util < capacity_curr_of(i)) { + target_cpu = i; + if (cpu_rq(i)->nr_running) + break; + } + + /* cpu has capacity at higher OPP, keep it as fallback */ + if (target_cpu == task_cpu(p)) + target_cpu = i; + } + + if (target_cpu != task_cpu(p)) { + struct energy_env eenv = { + .util_delta = task_util(p), + .src_cpu = task_cpu(p), + .dst_cpu = target_cpu, + }; + + /* Not enough spare capacity on previous cpu */ + if (cpu_overutilized(task_cpu(p))) + return target_cpu; + + if (energy_diff(&eenv) >= 0) + return task_cpu(p); + } + + return target_cpu; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5309,8 +5389,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && task_fits_max(p, cpu) && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = (!wake_wide(p) && task_fits_max(p, cpu) && + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || + energy_aware(); rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -5340,7 +5421,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } if (!sd) { - if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) + new_cpu = energy_aware_wake_cpu(p, prev_cpu); + else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, new_cpu); } else while (sd) { From 52b7b8a37c3da55885a9b7e60b2c4db6cf2dbeff Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sun, 10 May 2015 15:17:32 +0100 Subject: [PATCH 0077/3220] sched: Consider a not over-utilized energy-aware system as balanced In case the system operates below the tipping point indicator, introduced in ("sched: Add over-utilization/tipping point indicator"), bail out in find_busiest_group after the dst and src group statistics have been checked. There is simply no need to move usage around because all involved cpus still have spare cycles available. For an energy-aware system below its tipping point, we rely on the task placement of the wakeup path. This works well for short running tasks. The existence of long running tasks on one of the involved cpus lets the system operate over its tipping point. To be able to move such a task (whose load can't be used to average the load among the cpus) from a src cpu with lower capacity than the dst_cpu, an additional rule has to be implemented in need_active_balance. Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 682b4ae9ebd7..ae4f1ee9cf0c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7194,6 +7194,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * this level. 
*/ update_sd_lb_stats(env, &sds); + + if (energy_aware() && !env->dst_rq->rd->overutilized) + goto out_balanced; + local = &sds.local_stat; busiest = &sds.busiest_stat; From b71188bd2c52b6af022d687347eb4c7c29901580 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 3 Feb 2015 13:54:11 +0000 Subject: [PATCH 0078/3220] sched: Disable energy-unfriendly nohz kicks With energy-aware scheduling enabled nohz_kick_needed() generates many nohz idle-balance kicks which lead to nothing when multiple tasks get packed on a single cpu to save energy. This causes unnecessary wake-ups and hence wastes energy. Make these conditions depend on !energy_aware() for now until the energy-aware nohz story gets sorted out. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ae4f1ee9cf0c..7327221c991e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8259,12 +8259,13 @@ static inline bool nohz_kick_needed(struct rq *rq) if (time_before(now, nohz.next_balance)) return false; - if (rq->nr_running >= 2) + if (rq->nr_running >= 2 && + (!energy_aware() || cpu_overutilized(cpu))) return true; rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { + if (sd && !energy_aware()) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); From 1b35232614f555cbeed30a357ecf77a37632c347 Mon Sep 17 00:00:00 2001 From: Robin Randhawa Date: Mon, 29 Jun 2015 17:56:20 +0100 Subject: [PATCH 0079/3220] Documentation: DT bindings for energy model cost data required by EAS EAS (energy aware scheduling) provides the scheduler with an alternative objective - energy efficiency - as opposed to it's current performance oriented objectives. EAS relies on a simple platform energy cost model to guide scheduling decisions. The model only considers the CPU subsystem. This patch adds documentation describing DT bindings that should be used to supply the scheduler with an energy cost model. Signed-off-by: Robin Randhawa --- .../bindings/scheduler/sched-energy-costs.txt | 360 ++++++++++++++++++ 1 file changed, 360 insertions(+) create mode 100644 Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt diff --git a/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt new file mode 100644 index 000000000000..11216f09e596 --- /dev/null +++ b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt @@ -0,0 +1,360 @@ +=========================================================== +Energy cost bindings for Energy Aware Scheduling +=========================================================== + +=========================================================== +1 - Introduction +=========================================================== + +This note specifies bindings required for energy-aware scheduling +(EAS)[1]. Historically, the scheduler's primary objective has been +performance. EAS aims to provide an alternative objective - energy +efficiency. EAS relies on a simple platform energy cost model to +guide scheduling decisions. The model only considers the CPU +subsystem. + +This note is aligned with the definition of the layout of physical +CPUs in the system as described in the ARM topology binding +description [2]. 
The concept is applicable to any system so long as
+the cost model data is provided for those processing elements in
+that system's topology that EAS is required to service.
+
+Processing elements refer to hardware threads, CPUs and clusters of
+related CPUs in increasing order of hierarchy.
+
+EAS requires two key cost metrics - busy costs and idle costs. Busy
+costs comprise a list of compute capacities for the processing
+element in question and the corresponding power consumption at that
+capacity. Idle costs comprise a list of power consumption values
+for each idle state [C-state] that the processing element supports.
+For a detailed description of these metrics, their derivation and
+their use see [3].
+
+These cost metrics are required for processing elements in all
+scheduling domain levels that EAS is required to service.
+
+===========================================================
+2 - energy-costs node
+===========================================================
+
+Energy costs for the processing elements in scheduling domains that
+EAS is required to service are defined in the energy-costs node
+which acts as a container for the actual per processing element cost
+nodes. A single energy-costs node is required for a given system.
+
+- energy-costs node
+
+	Usage: Required
+
+	Description: The energy-costs node is a container node and
+	its sub-nodes describe costs for each processing element at
+	all scheduling domain levels that EAS is required to
+	service.
+
+	Node name must be "energy-costs".
+
+	The energy-costs node's parent node must be the cpus node.
+
+	The energy-costs node's child nodes can be:
+
+	- one or more cost nodes.
+
+	Any other configuration is considered invalid.
+
+The energy-costs node can only contain a single type of child node
+whose bindings are described in paragraph 4.
+
+===========================================================
+3 - energy-costs node child nodes naming convention
+===========================================================
+
+energy-costs child nodes must follow a naming convention where the
+node name must be "thread-costN", "core-costN", "cluster-costN"
+depending on whether the costs in the node are for a thread, core or
+cluster. N (where N = {0, 1, ...}) is the node number and has no
+bearing on the OS' logical thread, core or cluster index.
+
+===========================================================
+4 - cost node bindings
+===========================================================
+
+Bindings for cost nodes are defined as follows:
+
+- cluster-cost node
+
+	Description: must be declared within an energy-costs node. A
+	system can contain multiple clusters and each cluster
+	serviced by EAS must have a corresponding cluster-cost
+	node.
+
+	The cluster-cost node name must be "cluster-costN" as
+	described in 3 above.
+
+	A cluster-cost node must be a leaf node with no children.
+
+	Properties for cluster-cost nodes are described in paragraph
+	5 below.
+
+	Any other configuration is considered invalid.
+
+- core-cost node
+
+	Description: must be declared within an energy-costs node. A
+	system can contain multiple cores and each core serviced by
+	EAS must have a corresponding core-cost node.
+
+	The core-cost node name must be "core-costN" as described in
+	3 above.
+
+	A core-cost node must be a leaf node with no children.
+
+	Properties for core-cost nodes are described in paragraph
+	5 below.
+
+	Any other configuration is considered invalid.
+
+- thread-cost node
+
+	Description: must be declared within an energy-costs node. A
+	system can contain cores with multiple hardware threads and
+	each thread serviced by EAS must have a corresponding
+	thread-cost node.
+
+	The thread-cost node name must be "thread-costN" as described
+	in 3 above.
+
+	A thread-cost node must be a leaf node with no children.
+
+	Properties for thread-cost nodes are described in paragraph
+	5 below.
+
+	Any other configuration is considered invalid.
+
+===========================================================
+5 - Cost node properties
+===========================================================
+
+All cost node types must have only the following properties:
+
+- busy-cost-data
+
+	Usage: required
+	Value type: An array of 2-item tuples. Each item is of type
+	u32.
+	Definition: The first item in the tuple is the capacity
+	value as described in [3]. The second item in the tuple is
+	the energy cost value as described in [3].
+
+- idle-cost-data
+
+	Usage: required
+	Value type: An array of 1-item tuples. The item is of type
+	u32.
+	Definition: The item in the tuple is the energy cost value
+	as described in [3].
+
+===========================================================
+6 - Extensions to the cpu node
+===========================================================
+
+The cpu node is extended with a property that establishes the
+connection between the processing element represented by the cpu
+node and the cost-nodes associated with this processing element.
+
+The connection is expressed in line with the topological hierarchy
+that this processing element belongs to, starting with the level in
+the hierarchy that this processing element itself belongs to through
+to the highest level that EAS is required to service. The
+connection cannot be sparse and must be contiguous from the
+processing element's level through to the highest desired level. The
+highest desired level must be the same for all processing elements.
+
+Example: Given that a cpu node may represent a thread that is a part
+of a core, this property may contain multiple elements which
+associate the thread with cost nodes describing the costs for the
+thread itself, the core the thread belongs to, the cluster the core
+belongs to and so on. The elements must be ordered from the lowest
+level nodes to the highest desired level that EAS must service. The
+highest desired level must be the same for all cpu nodes. The
+elements must not be sparse: there must be elements for the current
+thread, the next level of hierarchy (core) and so on without any
+'holes'.
+
+Example: Given that a cpu node may represent a core that is a part
+of a cluster of related cpus, this property may contain multiple
+elements which associate the core with cost nodes describing the
+costs for the core itself, the cluster the core belongs to and so
+on. The elements must be ordered from the lowest level nodes to the
+highest desired level that EAS must service. The highest desired
+level must be the same for all cpu nodes. The elements must not be
+sparse: there must be elements for the current core, the next
+level of hierarchy (cluster) and so on without any 'holes'.
+
+If the system comprises hierarchical clusters of clusters, this
+property will contain multiple associations with the relevant number
+of cluster elements in hierarchical order.
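+
+As an illustration of the ordering rules above, a cpu node that
+represents one hardware thread of a multi-threaded core inside a
+cluster would carry three phandles, ordered thread, then core, then
+cluster. This is a hypothetical fragment for illustration only - the
+labels and cost node names are invented, and the property itself is
+formally defined next:
+
+	T0_0: cpu@0 {
+		...
+		sched-energy-costs = <&THREAD_COST_0 &CORE_COST_0
+				      &CLUSTER_COST_0>;
+	};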
+
+Property added to the cpu node:
+
+- sched-energy-costs
+
+	Usage: required
+	Value type: List of phandles
+	Definition: a list of phandles to specific cost nodes in the
+	energy-costs parent node that correspond to the processing
+	element represented by this cpu node in hierarchical order
+	of topology.
+
+	The order of phandles in the list is significant. The first
+	phandle is to the current processing element's own cost
+	node. Subsequent phandles are to higher hierarchical level
+	cost nodes up until the maximum level that EAS is to
+	service.
+
+	All cpu nodes must have the same highest level cost node.
+
+	The phandle list must not be sparsely populated with handles
+	to non-contiguous hierarchical levels. See commentary above
+	for clarity.
+
+	Any other configuration is invalid.
+
+===========================================================
+7 - Example dts
+===========================================================
+
+Example 1 (ARM 64-bit, 6-cpu system, two clusters of cpus, one
+cluster of 2 Cortex-A57 cpus, one cluster of 4 Cortex-A53 cpus):
+
+cpus {
+	#address-cells = <2>;
+	#size-cells = <0>;
+	.
+	.
+	.
+	A57_0: cpu@0 {
+		compatible = "arm,cortex-a57","arm,armv8";
+		reg = <0x0 0x0>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A57_L2>;
+		clocks = <&scpi_dvfs 0>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>;
+	};
+
+	A57_1: cpu@1 {
+		compatible = "arm,cortex-a57","arm,armv8";
+		reg = <0x0 0x1>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A57_L2>;
+		clocks = <&scpi_dvfs 0>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>;
+	};
+
+	A53_0: cpu@100 {
+		compatible = "arm,cortex-a53","arm,armv8";
+		reg = <0x0 0x100>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A53_L2>;
+		clocks = <&scpi_dvfs 1>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+	};
+
+	A53_1: cpu@101 {
+		compatible = "arm,cortex-a53","arm,armv8";
+		reg = <0x0 0x101>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A53_L2>;
+		clocks = <&scpi_dvfs 1>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+	};
+
+	A53_2: cpu@102 {
+		compatible = "arm,cortex-a53","arm,armv8";
+		reg = <0x0 0x102>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A53_L2>;
+		clocks = <&scpi_dvfs 1>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+	};
+
+	A53_3: cpu@103 {
+		compatible = "arm,cortex-a53","arm,armv8";
+		reg = <0x0 0x103>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A53_L2>;
+		clocks = <&scpi_dvfs 1>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+	};
+
+	energy-costs {
+		CPU_COST_0: core-cost0 {
+			busy-cost-data = <
+				417   168
+				579   251
+				744   359
+				883   479
+				1024  616
+			>;
+			idle-cost-data = <
+				15
+				0
+			>;
+		};
+		CPU_COST_1: core-cost1 {
+			busy-cost-data = <
+				235 33
+				302 46
+				368 61
+				406 76
+				447 93
+			>;
+			idle-cost-data = <
+				6
+				0
+			>;
+		};
+		CLUSTER_COST_0: cluster-cost0 {
+			busy-cost-data = <
+				417   24
+				579   32
+				744   43
+				883   49
+				1024  64
+			>;
+			idle-cost-data = <
+				65
+				24
+			>;
+		};
+		CLUSTER_COST_1: cluster-cost1 {
+			busy-cost-data = <
+				235 26
+				303 30
+				368 39
+				406 47
+				447 57
+			>;
+			idle-cost-data = <
+				56
+				17
+			>;
+		};
+	};
+};
+
+===============================================================================
+[1] https://lkml.org/lkml/2015/5/12/728
+[2] Documentation/devicetree/bindings/topology.txt
+[3] Documentation/scheduler/sched-energy.txt

From d2b3db032e34938c970b148e71cc401fc4c541e8 Mon Sep 17 00:00:00 2001
From: Robin Randhawa
Date: Mon, 29 Jun 2015 18:01:58 +0100
Subject: [PATCH 0080/3220] sched: Support for extracting EAS energy costs from
 DT

This patch implements support for extracting energy cost data from DT.
The data should conform to the DT bindings for energy cost data needed
by EAS (energy aware scheduling).

Signed-off-by: Robin Randhawa
---
 include/linux/sched_energy.h |  36 ++++++++++
 kernel/sched/Makefile        |   2 +-
 kernel/sched/energy.c        | 124 +++++++++++++++++++++++++++++++++++
 3 files changed, 161 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched_energy.h
 create mode 100644 kernel/sched/energy.c

diff --git a/include/linux/sched_energy.h b/include/linux/sched_energy.h
new file mode 100644
index 000000000000..a3f1627ac609
--- /dev/null
+++ b/include/linux/sched_energy.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_SCHED_ENERGY_H
+#define _LINUX_SCHED_ENERGY_H
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+/*
+ * There doesn't seem to be an NR_CPUS style max number of sched domain
+ * levels so here's an arbitrary constant one for the moment.
+ *
+ * The levels alluded to here correspond to entries in struct
+ * sched_domain_topology_level that are meant to be populated by arch
+ * specific code (topology.c).
+ */
+#define NR_SD_LEVELS 8
+
+#define SD_LEVEL0 0
+#define SD_LEVEL1 1
+#define SD_LEVEL2 2
+#define SD_LEVEL3 3
+#define SD_LEVEL4 4
+#define SD_LEVEL5 5
+#define SD_LEVEL6 6
+#define SD_LEVEL7 7
+
+/*
+ * Convenience macro for iterating through said sd levels.
+ */
+#define for_each_possible_sd_level(level)	    \
+	for (level = 0; level < NR_SD_LEVELS; level++)
+
+extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+
+void init_sched_energy_costs(void);
+
+#endif
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 67687973ce80..a541b5ce1dcc 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -12,7 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
 obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
+obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o energy.o
 obj-y += wait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c
new file mode 100644
index 000000000000..b0656b7a93e3
--- /dev/null
+++ b/kernel/sched/energy.c
@@ -0,0 +1,124 @@
+/*
+ * Obtain energy cost data from DT and populate relevant scheduler data
+ * structures.
+ *
+ * Copyright (C) 2015 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#define pr_fmt(fmt) "sched-energy: " fmt
+
+#define DEBUG
+
+#include <linux/gfp.h>
+#include <linux/of.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched_energy.h>
+#include <linux/stddef.h>
+
+struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+
+static void free_resources(void)
+{
+	int cpu, sd_level;
+	struct sched_group_energy *sge;
+
+	for_each_possible_cpu(cpu) {
+		for_each_possible_sd_level(sd_level) {
+			sge = sge_array[cpu][sd_level];
+			if (sge) {
+				kfree(sge->cap_states);
+				kfree(sge->idle_states);
+				kfree(sge);
+			}
+		}
+	}
+}
+
+void init_sched_energy_costs(void)
+{
+	struct device_node *cn, *cp;
+	struct capacity_state *cap_states;
+	struct idle_state *idle_states;
+	struct sched_group_energy *sge;
+	const struct property *prop;
+	int sd_level, i, nstates, cpu;
+	const __be32 *val;
+
+	for_each_possible_cpu(cpu) {
+		cn = of_get_cpu_node(cpu, NULL);
+		if (!cn) {
+			pr_warn("CPU device node missing for CPU %d\n", cpu);
+			return;
+		}
+
+		if (!of_find_property(cn, "sched-energy-costs", NULL)) {
+			pr_warn("CPU device node has no sched-energy-costs\n");
+			return;
+		}
+
+		for_each_possible_sd_level(sd_level) {
+			cp = of_parse_phandle(cn, "sched-energy-costs", sd_level);
+			if (!cp)
+				break;
+
+			prop = of_find_property(cp, "busy-cost-data", NULL);
+			if (!prop || !prop->value) {
+				pr_warn("No busy-cost data, skipping sched_energy init\n");
+				goto out;
+			}
+
+			sge = kcalloc(1, sizeof(struct sched_group_energy),
+				      GFP_NOWAIT);
+
+			nstates = (prop->length / sizeof(u32)) / 2;
+			cap_states = kcalloc(nstates,
+					     sizeof(struct capacity_state),
+					     GFP_NOWAIT);
+
+			for (i = 0, val = prop->value; i < nstates; i++) {
+				cap_states[i].cap = be32_to_cpup(val++);
+				cap_states[i].power = be32_to_cpup(val++);
+			}
+
+			sge->nr_cap_states = nstates;
+			sge->cap_states = cap_states;
+
+			prop = of_find_property(cp, "idle-cost-data", NULL);
+			if (!prop || !prop->value) {
+				pr_warn("No idle-cost data, skipping sched_energy init\n");
+				goto out;
+			}
+
+			nstates = (prop->length / sizeof(u32));
+			idle_states = kcalloc(nstates,
+					      sizeof(struct idle_state),
+					      GFP_NOWAIT);
+
+			for (i = 0, val = prop->value; i < nstates; i++)
+				idle_states[i].power = be32_to_cpup(val++);
+
+			sge->nr_idle_states = nstates;
+			sge->idle_states = idle_states;
+
+			sge_array[cpu][sd_level] = sge;
+		}
+	}
+
+	pr_info("Sched-energy-costs installed from DT\n");
+	return;
+
+out:
+	free_resources();
+}

From 8296e78e5eaa9bfebf251cbd4bddd97322fa13a9 Mon Sep 17 00:00:00 2001
From: Robin Randhawa
Date: Tue, 9 Jun 2015 15:10:00 +0100
Subject: [PATCH 0081/3220] arm64, topology: Updates to use DT bindings for EAS
 costing data

With the bindings and the associated accessors to extract data from the
bindings in place, remove the static hard-coded data from topology.c and
use the accessors instead.
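
For context: the arm64_topology table passed to set_sched_topology()
below is not visible in the quoted hunks. As a rough editorial sketch of
what such a table looks like once wired to the accessors this patch adds
(the mask/flags helpers and the SD_INIT_NAME usage here are assumptions,
not quoted from the series):

static struct sched_domain_topology_level arm64_topology[] = {
#ifdef CONFIG_SCHED_MC
	/* SD_LEVEL0 cost data is consumed through cpu_core_energy() */
	{ cpu_coregroup_mask, cpu_core_flags, cpu_core_energy, SD_INIT_NAME(MC) },
#endif
	/* SD_LEVEL1 cost data is consumed through cpu_cluster_energy() */
	{ cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
	{ NULL, },
};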
Signed-off-by: Robin Randhawa --- arch/arm64/kernel/topology.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index fb99a6735fd4..b5b43af6a7dc 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -218,6 +220,33 @@ out: struct cpu_topology cpu_topology[NR_CPUS]; EXPORT_SYMBOL_GPL(cpu_topology); +/* sd energy functions */ +static inline +const struct sched_group_energy * const cpu_cluster_energy(int cpu) +{ + struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL1]; + + if (!sge) { + pr_warn("Invalid sched_group_energy for Cluster%d\n", cpu); + return NULL; + } + + return sge; +} + +static inline +const struct sched_group_energy * const cpu_core_energy(int cpu) +{ + struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL0]; + + if (!sge) { + pr_warn("Invalid sched_group_energy for CPU%d\n", cpu); + return NULL; + } + + return sge; +} + const struct cpumask *cpu_coregroup_mask(int cpu) { return &cpu_topology[cpu].core_sibling; @@ -344,4 +373,8 @@ void __init init_cpu_topology(void) */ if (of_have_populated_dt() && parse_dt_topology()) reset_cpu_topology(); + else + set_sched_topology(arm64_topology); + + init_sched_energy_costs(); } From b03f1ba3d5bb9db0ca495c8c33ef88588daaae50 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 22 Sep 2015 16:47:48 +0100 Subject: [PATCH 0082/3220] cpufreq: Max freq invariant scheduler load-tracking and cpu capacity support Implements cpufreq_scale_max_freq_capacity() to provide the scheduler with a maximum frequency scaling correction factor for more accurate load-tracking and cpu capacity handling by being able to deal with frequency capping. This scaling factor describes the influence of running a cpu with a current maximum frequency lower than the absolute possible maximum frequency on load tracking and cpu capacity. The factor is: current_max_freq(cpu) << SCHED_CAPACITY_SHIFT / max_freq(cpu) In fact, max_freq_scale should be a struct cpufreq_policy data member. But this would require that the scheduler hot path (__update_load_avg()) would have to grab the cpufreq lock. This can be avoided by using per-cpu data initialized to SCHED_CAPACITY_SCALE for max_freq_scale. Signed-off-by: Dietmar Eggemann --- drivers/cpufreq/cpufreq.c | 19 +++++++++++++++++++ include/linux/cpufreq.h | 1 + 2 files changed, 20 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 46a7d62c8174..96f1a8860819 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -352,12 +352,14 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) *********************************************************************/ static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; +static DEFINE_PER_CPU(unsigned long, max_freq_scale) = SCHED_CAPACITY_SCALE; static void scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) { unsigned long cur = freqs ? 
freqs->new : policy->cur; unsigned long scale = (cur << SCHED_CAPACITY_SHIFT) / policy->max; + struct cpufreq_cpuinfo *cpuinfo = &policy->cpuinfo; int cpu; pr_debug("cpus %*pbl cur/cur max freq %lu/%u kHz freq scale %lu\n", @@ -365,6 +367,18 @@ scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) for_each_cpu(cpu, policy->cpus) per_cpu(freq_scale, cpu) = scale; + + if (freqs) + return; + + scale = (policy->max << SCHED_CAPACITY_SHIFT) / cpuinfo->max_freq; + + pr_debug("cpus %*pbl cur max/max freq %u/%u kHz max freq scale %lu\n", + cpumask_pr_args(policy->cpus), policy->max, cpuinfo->max_freq, + scale); + + for_each_cpu(cpu, policy->cpus) + per_cpu(max_freq_scale, cpu) = scale; } unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) @@ -372,6 +386,11 @@ unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) return per_cpu(freq_scale, cpu); } +unsigned long cpufreq_scale_max_freq_capacity(int cpu) +{ + return per_cpu(max_freq_scale, cpu); +} + static void __cpufreq_notify_transition(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, unsigned int state) { diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 11c65f536b6c..9b1d61e1f1d7 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -619,4 +619,5 @@ int cpufreq_generic_init(struct cpufreq_policy *policy, struct sched_domain; unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); +unsigned long cpufreq_scale_max_freq_capacity(int cpu); #endif /* _LINUX_CPUFREQ_H */ From 05a773bfc98b786cf504af1bce8bfe4a84a6a586 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sat, 26 Sep 2015 18:19:54 +0100 Subject: [PATCH 0083/3220] sched: Update max cpu capacity in case of max frequency constraints Wakeup balancing uses cpu capacity awareness and needs to know the system-wide maximum cpu capacity. Patch "sched: Store system-wide maximum cpu capacity in root domain" finds the system-wide maximum cpu capacity during scheduler domain hierarchy setup. This is sufficient as long as maximum frequency invariance is not enabled. If it is enabled, the system-wide maximum cpu capacity can change between scheduler domain hierarchy setups due to frequency capping. The cpu capacity is changed in update_cpu_capacity() which is called in load balance on the lowest scheduler domain hierarchy level. To be able to know if a change in cpu capacity for a certain cpu also has an effect on the system-wide maximum cpu capacity it is normally necessary to iterate over all cpus. This would be way too costly. That's why this patch follows a different approach. The unsigned long max_cpu_capacity value in struct root_domain is replaced with a struct max_cpu_capacity, containing value (the max_cpu_capacity) and cpu (the cpu index of the cpu providing the maximum cpu_capacity). Changes to the system-wide maximum cpu capacity and the cpu index are made if: 1 System-wide maximum cpu capacity < cpu capacity 2 System-wide maximum cpu capacity > cpu capacity and cpu index == cpu There are no changes to the system-wide maximum cpu capacity in all other cases. Atomic read and write access to the pair (max_cpu_capacity.val, max_cpu_capacity.cpu) is enforced by max_cpu_capacity.lock. The access to max_cpu_capacity.val in task_fits_max() is still performed without taking the max_cpu_capacity.lock. The code to set max cpu capacity in build_sched_domains() has been removed because the whole functionality is now provided by update_cpu_capacity() instead. 
This approach can introduce errors temporarily, e.g. in case the cpu currently providing the max cpu capacity has its cpu capacity lowered due to frequency capping and calls update_cpu_capacity() before any cpu which might provide the max cpu now. There is also an outstanding question: Should the cpu capacity of a cpu going idle be set to a very small value? Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 8 ++------ kernel/sched/fair.c | 32 +++++++++++++++++++++++++++++++- kernel/sched/sched.h | 10 +++++++++- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bc5bb26ccf3b..3680e76b6beb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5888,6 +5888,8 @@ static int init_rootdomain(struct root_domain *rd) if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; + + init_max_cpu_capacity(&rd->max_cpu_capacity); return 0; free_rto_mask: @@ -7105,15 +7107,9 @@ static int build_sched_domains(const struct cpumask *cpu_map, rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); - - if (rq->cpu_capacity_orig > rq->rd->max_cpu_capacity) - rq->rd->max_cpu_capacity = rq->cpu_capacity_orig; } rcu_read_unlock(); - if (rq) - pr_info("max cpu_capacity %lu\n", rq->rd->max_cpu_capacity); - ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7327221c991e..91dfc8aa8aff 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5078,7 +5078,7 @@ static inline bool __task_fits(struct task_struct *p, int cpu, int util) static inline bool task_fits_max(struct task_struct *p, int cpu) { unsigned long capacity = capacity_of(cpu); - unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity; + unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val; if (capacity == max_capacity) return true; @@ -6564,13 +6564,43 @@ static unsigned long scale_rt_capacity(int cpu) return 1; } +void init_max_cpu_capacity(struct max_cpu_capacity *mcc) +{ + raw_spin_lock_init(&mcc->lock); + mcc->val = 0; + mcc->cpu = -1; +} + static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; + struct max_cpu_capacity *mcc; + unsigned long max_capacity; + int max_cap_cpu; + unsigned long flags; cpu_rq(cpu)->cpu_capacity_orig = capacity; + mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; + + raw_spin_lock_irqsave(&mcc->lock, flags); + max_capacity = mcc->val; + max_cap_cpu = mcc->cpu; + + if ((max_capacity > capacity && max_cap_cpu == cpu) || + (max_capacity < capacity)) { + mcc->val = capacity; + mcc->cpu = cpu; +#ifdef CONFIG_SCHED_DEBUG + raw_spin_unlock_irqrestore(&mcc->lock, flags); + pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); + goto skip_unlock; +#endif + } + raw_spin_unlock_irqrestore(&mcc->lock, flags); + +skip_unlock: __attribute__ ((unused)); capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2301b0476b90..aac581932eff 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -510,6 +510,12 @@ struct dl_rq { #ifdef CONFIG_SMP +struct max_cpu_capacity { + raw_spinlock_t lock; + unsigned long val; + int cpu; +}; + /* * We add the notion of a root-domain which will be used to define per-domain * variables. 
Each exclusive cpuset essentially defines an island domain by
@@ -548,7 +554,7 @@ struct root_domain {
 	struct cpupri cpupri;
 
 	/* Maximum cpu capacity in the system. */
-	unsigned long max_cpu_capacity;
+	struct max_cpu_capacity max_cpu_capacity;
 };
 
 extern struct root_domain def_root_domain;
@@ -1340,6 +1346,8 @@ unsigned long to_ratio(u64 period, u64 runtime);
 
 extern void init_entity_runnable_average(struct sched_entity *se);
 
+extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc);
+
 static inline void add_nr_running(struct rq *rq, unsigned count)
 {
 	unsigned prev_nr = rq->nr_running;

From 5414a72fc30f2069e3943ceb4e39b0ba9cd9c7e3 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Wed, 23 Sep 2015 17:59:55 +0100
Subject: [PATCH 0084/3220] arm: Enable max freq invariant scheduler
 load-tracking and cpu capacity support

Maximum Frequency Invariance has to be part of Cpu Invariance because
Frequency Invariance deals only with differences in load-tracking
introduced by Dynamic Frequency Scaling and not with limiting the
possible range of cpu frequency.

By placing Maximum Frequency Invariance into Cpu Invariance,
load-tracking is scaled via arch_scale_cpu_capacity() in
__update_load_avg() and cpu capacity is scaled via
arch_scale_cpu_capacity() in update_cpu_capacity().

To be able to save the extra multiplication in the scheduler hotpath
(__update_load_avg()) we could:

1 Inform cpufreq about base cpu capacity at boot and let it handle
  scale_cpu_capacity() as well.

2 Use the cpufreq policy callback which would update a per-cpu current
  cpu_scale and this value would be returned by scale_cpu_capacity().

3 Use per-cpu current max_freq_scale and current cpu_scale with the
  current patch.

Signed-off-by: Dietmar Eggemann
---
 arch/arm/kernel/topology.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index da1c611a3b5e..0308342def8c 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -44,7 +44,13 @@ static DEFINE_PER_CPU(unsigned long, cpu_scale);
 
 unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
+#ifdef CONFIG_CPU_FREQ
+	unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
+
+	return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;
+#else
 	return per_cpu(cpu_scale, cpu);
+#endif
 }
 
 static void set_capacity_scale(unsigned int cpu, unsigned long capacity)

From b732dd602c61bab535b95237552e530ca8de8b93 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Fri, 25 Sep 2015 17:34:15 +0100
Subject: [PATCH 0085/3220] arm64: Enable max freq invariant scheduler
 load-tracking and capacity support

Maximum Frequency Invariance has to be part of Cpu Invariance because
Frequency Invariance deals only with differences in load-tracking
introduced by Dynamic Frequency Scaling and not with limiting the
possible range of cpu frequency.

By placing Maximum Frequency Invariance into Cpu Invariance,
load-tracking is scaled via arch_scale_cpu_capacity() in
__update_load_avg() and cpu capacity is scaled via
arch_scale_cpu_capacity() in update_cpu_capacity().

To be able to save the extra multiplication in the scheduler hotpath
(__update_load_avg()) we could:

1 Inform cpufreq about base cpu capacity at boot and let it handle
  scale_cpu_capacity() as well.

2 Use the cpufreq policy callback which would update a per-cpu current
  cpu_scale and this value would be returned by scale_cpu_capacity().

3 Use per-cpu current max_freq_scale and current cpu_scale with the
  current patch.
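
As a worked editorial example of the max frequency invariance
arithmetic used by both of these patches (the frequencies and the
resulting values are made up; this is a standalone userspace sketch,
not kernel code):

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)

int main(void)
{
	unsigned long cpuinfo_max_freq = 2000000;	/* 2.0 GHz, in kHz */
	unsigned long policy_max = 1500000;		/* capped to 1.5 GHz */
	unsigned long cpu_scale = SCHED_CAPACITY_SCALE;	/* biggest cpu type */

	/* max_freq_scale, as computed by scale_freq_capacity() */
	unsigned long max_freq_scale =
		(policy_max << SCHED_CAPACITY_SHIFT) / cpuinfo_max_freq;

	/* capacity seen by the scheduler, as in scale_cpu_capacity() */
	unsigned long capacity =
		cpu_scale * max_freq_scale >> SCHED_CAPACITY_SHIFT;

	/* prints "768 768": a 25% frequency cap costs 25% of capacity */
	printf("%lu %lu\n", max_freq_scale, capacity);
	return 0;
}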
Including <linux/cpufreq.h> in topology.h like for the arm arch doesn't
work because of CONFIG_COMPAT=y (Kernel support for 32-bit EL0). That's
why cpufreq_scale_max_freq_capacity() has to be declared extern in
topology.h.

Signed-off-by: Dietmar Eggemann
---
 arch/arm64/include/asm/topology.h | 1 +
 arch/arm64/kernel/topology.c      | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 9370a336c934..bbd362cd1ed1 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -26,6 +26,7 @@ struct sched_domain;
 #ifdef CONFIG_CPU_FREQ
 #define arch_scale_freq_capacity cpufreq_scale_freq_capacity
 extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
+extern unsigned long cpufreq_scale_max_freq_capacity(int cpu);
 #endif
 #define arch_scale_cpu_capacity scale_cpu_capacity
 extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index b5b43af6a7dc..5b2c67a510d8 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -29,7 +29,13 @@ static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
 
 unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
+#ifdef CONFIG_CPU_FREQ
+	unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
+
+	return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;
+#else
 	return per_cpu(cpu_scale, cpu);
+#endif
 }
 
 static void set_capacity_scale(unsigned int cpu, unsigned long capacity)

From e14f1518239461ec91e3cb99fba6809c7d5fa4ad Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Wed, 13 Jan 2016 15:49:44 +0000
Subject: [PATCH 0086/3220] sched: Do eas idle balance regardless of the rq avg
 idle value

EAS relies on idle balance to migrate a misfit task towards a cpu with
higher capacity. When such a cpu becomes idle, idle balance should
happen even if the rq avg idle is smaller than the sched migration cost
(default 500us).

The rq avg idle is updated during the wakeup of a task in case the rq
has a non-null idle_stamp. This value stays unchanged and valid until
the next task wakes up on this cpu after an idle period. So rq avg idle
could be smaller than the sched migration cost, preventing the idle
balance from happening. In this case we would be at the mercy of wakeup,
periodic or nohz-idle load balancing to put another task on this cpu.

To break this dependency on rq avg idle, make the EAS idle balance
independent of the requirement that rq avg idle be larger than the
sched migration cost.
Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 91dfc8aa8aff..7d58b1fea71d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7772,8 +7772,9 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { + if (!energy_aware() && + (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) From 563ddb604e01393625aa56601188b7eeb20888da Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 25 Feb 2016 12:43:49 +0000 Subject: [PATCH 0087/3220] sched: Add per-cpu max capacity to sched_group_capacity struct sched_group_capacity currently represents the compute capacity sum of all cpus in the sched_group. Unless it is divided by the group_weight to get the average capacity per cpu it hides differences in cpu capacity for mixed capacity systems (e.g. high RT/IRQ utilization or ARM big.LITTLE). But even the average may not be sufficient if the group covers cpus of different capacities. Instead, by extending struct sched_group_capacity to indicate max per-cpu capacity in the group a suitable group for a given task utilization can easily be found such that cpus with reduced capacity can be avoided for tasks with high utilization (not implemented by this patch). Signed-off-by: Morten Rasmussen --- kernel/sched/core.c | 3 ++- kernel/sched/fair.c | 17 ++++++++++++----- kernel/sched/sched.h | 3 ++- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3680e76b6beb..23e82805ec47 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5714,7 +5714,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_CONT " %*pbl", cpumask_pr_args(sched_group_cpus(group))); if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %d)", + printk(KERN_CONT " (cpu_capacity = %lu)", group->sgc->capacity); } @@ -6193,6 +6193,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. 
*/ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; /* * Make sure the first group of this domain contains the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7d58b1fea71d..9762288ee47b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6609,13 +6609,14 @@ skip_unlock: __attribute__ ((unused)); cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity; + unsigned long capacity, max_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -6628,6 +6629,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } capacity = 0; + max_capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -6652,11 +6654,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ if (unlikely(!rq->sd)) { capacity += capacity_of(cpu); - continue; + } else { + sgc = rq->sd->groups->sgc; + capacity += sgc->capacity; } - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; + max_capacity = max(capacity, max_capacity); } } else { /* @@ -6666,12 +6669,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity += group->sgc->capacity; + struct sched_group_capacity *sgc = group->sgc; + + capacity += sgc->capacity; + max_capacity = max(sgc->max_capacity, max_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = max_capacity; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index aac581932eff..967d70d49af9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -858,7 +858,8 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity * for a single CPU. */ - unsigned int capacity; + unsigned long capacity; + unsigned long max_capacity; /* Max per-cpu capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* From f2a8923298fe9a39cf4c9ad48bbcb0781483d1f9 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 25 Feb 2016 12:47:54 +0000 Subject: [PATCH 0088/3220] sched: Add group_misfit_task load-balance type To maximize throughput in systems with reduced capacity cpus (e.g. high RT/IRQ load and/or ARM big.LITTLE) load-balancing has to consider task and cpu utilization as well as per-cpu compute capacity when load-balancing in addition to the current average load based load-balancing policy. Tasks that are scheduled on a reduced capacity cpu need to be identified and migrated to a higher capacity cpu if possible. To implement this additional policy an additional group_type (load-balance scenario) is added: group_misfit_task. This represents scenarios where a sched_group has tasks that are not suitable for its per-cpu capacity. group_misfit_task is only considered if the system is not overloaded in any other way (group_imbalanced or group_overloaded). Identifying misfit tasks requires the rq lock to be held. To avoid taking remote rq locks to examine source sched_groups for misfit tasks, each cpu is responsible for tracking misfit tasks themselves and update the rq->misfit_task flag. This means checking task utilization when tasks are scheduled and on sched_tick. 
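
One detail worth noting before the diff: the position of
group_misfit_task inside enum group_type is load-bearing, because the
load-balance code compares group_type values numerically. A minimal
standalone illustration of that ordering rule (editorial sketch, not
kernel code):

#include <assert.h>

/* Same ordering as the patch: severity increases with the value. */
enum group_type {
	group_other = 0,
	group_misfit_task,
	group_imbalanced,
	group_overloaded,
};

/*
 * update_sd_pick_busiest() keeps the numerically larger
 * classification, so a misfit group only wins when no group is
 * imbalanced or overloaded - the "only considered if the system is
 * not overloaded in any other way" rule from the text above.
 */
static enum group_type pick(enum group_type busiest, enum group_type sg)
{
	return sg > busiest ? sg : busiest;
}

int main(void)
{
	assert(pick(group_other, group_misfit_task) == group_misfit_task);
	assert(pick(group_overloaded, group_misfit_task) == group_overloaded);
	return 0;
}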
Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 29 ++++++++++++++++++++++------- kernel/sched/sched.h | 1 + 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9762288ee47b..b16e1b7878f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5739,6 +5739,8 @@ again: if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; simple: cfs_rq = &rq->cfs; @@ -5760,9 +5762,12 @@ simple: if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; idle: + rq->misfit_task = 0; /* * This is OK, because current is on_cpu, which avoids it being picked * for load-balance and preemption/IRQs are still disabled avoiding @@ -5975,6 +5980,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +enum group_type { + group_other = 0, + group_misfit_task, + group_imbalanced, + group_overloaded, +}; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -6446,12 +6458,6 @@ static unsigned long task_h_load(struct task_struct *p) /********** Helpers for find_busiest_group ************************/ -enum group_type { - group_other = 0, - group_imbalanced, - group_overloaded, -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -6467,6 +6473,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + int group_misfit_task; /* A cpu has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -6783,6 +6790,9 @@ group_type group_classify(struct sched_group *group, if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_misfit_task) + return group_misfit_task; + return group_other; } @@ -6830,8 +6840,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (idle_cpu(i)) sgs->idle_cpus++; - if (cpu_overutilized(i)) + if (cpu_overutilized(i)) { *overutilized = true; + if (!sgs->group_misfit_task && rq->misfit_task) + sgs->group_misfit_task = capacity_of(i); + } } /* Adjust by relative CPU capacity of the group */ @@ -8412,6 +8425,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) rq->rd->overutilized = true; + + rq->misfit_task = !task_fits_max(curr, rq->cpu); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 967d70d49af9..fcbb3e07accc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -584,6 +584,7 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; + unsigned int misfit_task; #ifdef CONFIG_NO_HZ_COMMON u64 nohz_stamp; unsigned long nohz_flags; From 0d2b1cdfa11fd62d3358eef332e78b95b6a3e9ec Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 25 Feb 2016 12:51:35 +0000 Subject: [PATCH 0089/3220] sched: Consider misfit tasks when load-balancing With the new group_misfit_task load-balancing scenario additional policy conditions are needed when load-balancing. Misfit task balancing only makes sense between source group with lower capacity than the target group. If capacities are the same, fallback to normal group_other balancing. The aim is to balance tasks such that no task has its throughput hindered by compute capacity if a cpu with more capacity is available. 
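
The capacity comparison used for this is margin-based rather than
exact: a group only counts as having smaller per-cpu capacity if it
trails the reference group by more than capacity_margin -
SCHED_LOAD_SCALE (1280 - 1024 = 256) capacity units. A worked editorial
example with illustrative big.LITTLE capacity values (not taken from a
real platform; standalone sketch, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL
static unsigned long capacity_margin = 1280;	/* ~20% margin */

/* Mirrors group_smaller_cpu_capacity() from the diff below. */
static bool smaller_cpu_capacity(unsigned long sg_max, unsigned long ref_max)
{
	return sg_max + capacity_margin - SCHED_LOAD_SCALE < ref_max;
}

int main(void)
{
	/* LITTLE (447) vs big (1024): 447 + 256 < 1024, so "smaller" */
	printf("%d\n", smaller_cpu_capacity(447, 1024));	/* 1 */
	/* near-equal groups: 900 + 256 >= 1024, so not "smaller" */
	printf("%d\n", smaller_cpu_capacity(900, 1024));	/* 0 */
	return 0;
}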
Load-balancing is generally based on average load in the sched_groups,
but for misfitting tasks it is necessary to introduce exceptions to
migrate tasks against usual metrics and optimize throughput.

This patch ensures the following load-balance for mixed capacity systems
(e.g. ARM big.LITTLE) for always-running tasks:

1. Place a task on each cpu starting in order from cpus with highest
capacity to lowest until all cpus are in use (i.e. one task on each
cpu).

2. Once all cpus are in use balance according to compute capacity such
that load per capacity is approximately the same regardless of the
compute capacity (i.e. big cpus get more tasks than little cpus).

Necessary changes are introduced in find_busiest_group(),
calculate_imbalance(), and find_busiest_queue(). This includes passing
the group_type on to find_busiest_queue() through struct lb_env, which
currently considers only the imbalance and not the imbalance situation
(group_type).

To avoid taking remote rq locks to examine source sched_groups for
misfit tasks, each cpu is responsible for tracking misfit tasks itself
and updating the rq->misfit_task flag. This means checking task
utilization when tasks are scheduled and on sched_tick.

Signed-off-by: Morten Rasmussen
---
 kernel/sched/fair.c | 71 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 69 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b16e1b7878f6..5b8a9c68f3fb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6016,6 +6016,7 @@ struct lb_env {
 	unsigned int		loop_max;
 
 	enum fbq_type		fbq_type;
+	enum group_type		busiest_group_type;
 	struct list_head	tasks;
 };
 
@@ -6780,6 +6781,18 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
 	return false;
 }
 
+
+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-cpu capacity than sched_group ref.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+	return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE <
+							ref->sgc->max_capacity;
+}
+
 static inline enum
 group_type group_classify(struct sched_group *group,
 			  struct sg_lb_stats *sgs)
@@ -6886,9 +6899,25 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	if (sgs->group_type < busiest->group_type)
 		return false;
 
+	/*
+	 * Candidate sg doesn't face any serious load-balance problems
+	 * so don't pick it if the local sg is already filled up.
+	 */
+	if (sgs->group_type == group_other &&
+	    !group_has_capacity(env, &sds->local_stat))
+		return false;
+
 	if (sgs->avg_load <= busiest->avg_load)
 		return false;
 
+	/*
+	 * Candidate sg has no more than one task per cpu and has higher
+	 * per-cpu capacity. No reason to pull tasks to less capable cpus.
+	 */
+	if (sgs->sum_nr_running <= sgs->group_weight &&
+	    group_smaller_cpu_capacity(sds->local, sg))
+		return false;
+
 	/* This is the busiest node in its class. */
 	if (!(env->sd->flags & SD_ASYM_PACKING))
 		return true;
@@ -6994,6 +7023,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 			sgs->group_type = group_classify(sg, sgs);
 		}
 
+		/*
+		 * Ignore task groups with misfit tasks if local group has no
+		 * capacity or if per-cpu capacity isn't higher.
+ */ + if (sgs->group_type == group_misfit_task && + (!group_has_capacity(env, &sds->local_stat) || + !group_smaller_cpu_capacity(sg, sds->local))) + sgs->group_type = group_other; + if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; @@ -7170,6 +7208,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { + /* Misfitting tasks should be migrated in any case */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = busiest->group_misfit_task; + return; + } + + /* + * Busiest group is overloaded, local is not, use the spare + * cycles to maximize throughput + */ + if (busiest->group_type == group_overloaded && + local->group_type <= group_misfit_task) { + env->imbalance = busiest->load_per_task; + return; + } + env->imbalance = 0; return fix_small_imbalance(env, sds); } @@ -7203,6 +7257,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; + /* Boost imbalance to allow misfit task to be balanced. */ + if (busiest->group_type == group_misfit_task) + env->imbalance = max_t(long, env->imbalance, + busiest->group_misfit_task); + /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have @@ -7276,6 +7335,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->group_no_capacity) goto force_balance; + /* Misfitting tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) { + goto force_balance; + } + /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -7299,7 +7363,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * might end up to just move the imbalance on another group */ if ((busiest->group_type != group_overloaded) && - (local->idle_cpus <= (busiest->idle_cpus + 1))) + (local->idle_cpus <= (busiest->idle_cpus + 1)) && + !group_smaller_cpu_capacity(sds.busiest, sds.local)) goto out_balanced; } else { /* @@ -7312,6 +7377,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) } force_balance: + env->busiest_group_type = busiest->group_type; /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds); return sds.busiest; @@ -7370,7 +7436,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ if (rq->nr_running == 1 && wl > env->imbalance && - !check_cpu_capacity(rq, env->sd)) + !check_cpu_capacity(rq, env->sd) && + env->busiest_group_type != group_misfit_task) continue; /* From c3b2e765af4af80ada13e5e34a26a28e3081dd0e Mon Sep 17 00:00:00 2001 From: Michael Turquette Date: Tue, 30 Jun 2015 12:45:27 +0100 Subject: [PATCH 0090/3220] cpufreq: introduce cpufreq_driver_is_slow Some architectures and platforms perform CPU frequency transitions through a non-blocking method, while some might block or sleep. Even when frequency transitions do not block or sleep they may be very slow. This distinction is important when trying to change frequency from a non-interruptible context in a scheduler hot path. Describe this distinction with a cpufreq driver flag, CPUFREQ_DRIVER_FAST. The default is to not have this flag set, thus erring on the side of caution. cpufreq_driver_is_slow() is also introduced in this patch. 
Setting the above flag will allow this function to return false. [smuckle@linaro.org: change flag/API to include drivers that are too slow for scheduler hot paths, in addition to those that block/sleep] Cc: Rafael J. Wysocki Cc: Viresh Kumar Signed-off-by: Michael Turquette Signed-off-by: Steve Muckle --- drivers/cpufreq/cpufreq.c | 6 ++++++ include/linux/cpufreq.h | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 96f1a8860819..5ca8e9517aae 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -154,6 +154,12 @@ bool have_governor_per_policy(void) } EXPORT_SYMBOL_GPL(have_governor_per_policy); +bool cpufreq_driver_is_slow(void) +{ + return !(cpufreq_driver->flags & CPUFREQ_DRIVER_FAST); +} +EXPORT_SYMBOL_GPL(cpufreq_driver_is_slow); + struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy) { if (have_governor_per_policy()) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9b1d61e1f1d7..b5ef79aae048 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -160,6 +160,7 @@ u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy); int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); int cpufreq_update_policy(unsigned int cpu); bool have_governor_per_policy(void); +bool cpufreq_driver_is_slow(void); struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); #else static inline unsigned int cpufreq_get(unsigned int cpu) @@ -317,6 +318,14 @@ struct cpufreq_driver { */ #define CPUFREQ_NEED_INITIAL_FREQ_CHECK (1 << 5) +/* + * Indicates that it is safe to call cpufreq_driver_target from + * non-interruptable context in scheduler hot paths. Drivers must + * opt-in to this flag, as the safe default is that they might sleep + * or be too slow for hot path use. + */ +#define CPUFREQ_DRIVER_FAST (1 << 6) + int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); From a967a45c71ae8978b2c0d06ee0de23f47737cfd5 Mon Sep 17 00:00:00 2001 From: Michael Turquette Date: Tue, 30 Jun 2015 12:45:48 +0100 Subject: [PATCH 0091/3220] sched: scheduler-driven cpu frequency selection Scheduler-driven CPU frequency selection hopes to exploit both per-task and global information in the scheduler to improve frequency selection policy, achieving lower power consumption, improved responsiveness/performance, and less reliance on heuristics and tunables. For further discussion on the motivation of this integration see [0]. This patch implements a shim layer between the Linux scheduler and the cpufreq subsystem. The interface accepts capacity requests from the CFS, RT and deadline sched classes. The requests from each sched class are summed on each CPU with a margin applied to the CFS and RT capacity requests to provide some headroom. Deadline requests are expected to be precise enough given their nature to not require headroom. The maximum total capacity request for a CPU in a frequency domain drives the requested frequency for that domain. Policy is determined by both the sched classes and this shim layer. Note that this algorithm is event-driven. There is no polling loop to check cpu idle time nor any other method which is unsynchronized with the scheduler, aside from a throttling mechanism to ensure frequency changes are not attempted faster than the hardware can accommodate them. 
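
For reference, a driver opts in to the fast path by setting the new flag
in its cpufreq_driver. The skeleton below is hypothetical - the
"example" names, frequency table and callbacks are invented for
illustration, and no driver in this series actually sets the flag:

#include <linux/cpufreq.h>

static struct cpufreq_frequency_table example_freq_table[] = {
	{ .frequency = 500000 },		/* made-up OPPs, in kHz */
	{ .frequency = 1000000 },
	{ .frequency = CPUFREQ_TABLE_END },
};

/* Pokes a clock register and never sleeps: hot-path safe. */
static int example_set_rate_fast(struct cpufreq_policy *policy,
				 unsigned int index)
{
	/* ... write example_freq_table[index].frequency to hardware ... */
	return 0;
}

static int example_cpufreq_init(struct cpufreq_policy *policy)
{
	return cpufreq_generic_init(policy, example_freq_table, 1000);
}

static struct cpufreq_driver example_fast_driver = {
	.name		= "example-fast",
	/*
	 * CPUFREQ_DRIVER_FAST makes cpufreq_driver_is_slow() return
	 * false, so a scheduler-driven governor may call the driver
	 * directly instead of deferring to a worker thread.
	 */
	.flags		= CPUFREQ_NEED_INITIAL_FREQ_CHECK | CPUFREQ_DRIVER_FAST,
	.init		= example_cpufreq_init,
	.verify		= cpufreq_generic_frequency_table_verify,
	.target_index	= example_set_rate_fast,
};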
Thanks to Juri Lelli for contributing design ideas, code and test results, and to Ricky Liang for initialization and static key inc/dec fixes. [0] http://article.gmane.org/gmane.linux.kernel/1499836 [smuckle@linaro.org: various additions and fixes, revised commit text] CC: Ricky Liang Signed-off-by: Michael Turquette Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- drivers/cpufreq/Kconfig | 21 ++ include/linux/cpufreq.h | 3 + include/linux/sched.h | 8 + kernel/sched/Makefile | 1 + kernel/sched/cpufreq_sched.c | 358 +++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 51 +++++ 7 files changed, 443 insertions(+), 1 deletion(-) create mode 100644 kernel/sched/cpufreq_sched.c diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 659879a56dba..af523c539af0 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -102,6 +102,15 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE Be aware that not all cpufreq drivers support the conservative governor. If unsure have a look at the help section of the driver. Fallback governor will be the performance governor. + +config CPU_FREQ_DEFAULT_GOV_SCHED + bool "sched" + select CPU_FREQ_GOV_SCHED + help + Use the CPUfreq governor 'sched' as default. This scales + cpu frequency using CPU utilization estimates from the + scheduler. + endchoice config CPU_FREQ_GOV_PERFORMANCE @@ -183,6 +192,18 @@ config CPU_FREQ_GOV_CONSERVATIVE If in doubt, say N. +config CPU_FREQ_GOV_SCHED + bool "'sched' cpufreq governor" + depends on CPU_FREQ + select CPU_FREQ_GOV_COMMON + help + 'sched' - this governor scales cpu frequency from the + scheduler as a function of cpu capacity utilization. It does + not evaluate utilization on a periodic basis (as ondemand + does) but instead is event-driven by the scheduler. + + If in doubt, say N. 
+ comment "CPU frequency scaling drivers" config CPUFREQ_DT diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index b5ef79aae048..b5433349938a 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -496,6 +496,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE) extern struct cpufreq_governor cpufreq_gov_conservative; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED) +extern struct cpufreq_governor cpufreq_gov_sched; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_sched) #endif /********************************************************************* diff --git a/include/linux/sched.h b/include/linux/sched.h index 09e2d43406eb..1460c6a4f65f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -927,6 +927,14 @@ enum cpu_idle_type { #define SCHED_CAPACITY_SHIFT 10 #define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) +struct sched_capacity_reqs { + unsigned long cfs; + unsigned long rt; + unsigned long dl; + + unsigned long total; +}; + /* * Wake-queues are lists of tasks with a pending wakeup, whose * callers have already marked the task as woken internally, diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index a541b5ce1dcc..0eabc9db4c3d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c new file mode 100644 index 000000000000..58bca8d2ca65 --- /dev/null +++ b/kernel/sched/cpufreq_sched.c @@ -0,0 +1,358 @@ +/* + * Copyright (C) 2015 Michael Turquette + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sched.h" + +#define THROTTLE_NSEC 50000000 /* 50ms default */ + +struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE; +static bool __read_mostly cpufreq_driver_slow; + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED +static struct cpufreq_governor cpufreq_gov_sched; +#endif + +static DEFINE_PER_CPU(unsigned long, enabled); +DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); + +/** + * gov_data - per-policy data internal to the governor + * @throttle: next throttling period expiry. Derived from throttle_nsec + * @throttle_nsec: throttle period length in nanoseconds + * @task: worker thread for dvfs transition that may block/sleep + * @irq_work: callback used to wake up worker thread + * @requested_freq: last frequency requested by the sched governor + * + * struct gov_data is the per-policy cpufreq_sched-specific data structure. A + * per-policy instance of it is created when the cpufreq_sched governor receives + * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data + * member of struct cpufreq_policy. + * + * Readers of this data must call down_read(policy->rwsem). Writers must + * call down_write(policy->rwsem). 
+ */ +struct gov_data { + ktime_t throttle; + unsigned int throttle_nsec; + struct task_struct *task; + struct irq_work irq_work; + unsigned int requested_freq; +}; + +static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, + unsigned int freq) +{ + struct gov_data *gd = policy->governor_data; + + /* avoid race with cpufreq_sched_stop */ + if (!down_write_trylock(&policy->rwsem)) + return; + + __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); + + gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec); + up_write(&policy->rwsem); +} + +static bool finish_last_request(struct gov_data *gd) +{ + ktime_t now = ktime_get(); + + if (ktime_after(now, gd->throttle)) + return false; + + while (1) { + int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now)); + + usec_left /= NSEC_PER_USEC; + usleep_range(usec_left, usec_left + 100); + now = ktime_get(); + if (ktime_after(now, gd->throttle)) + return true; + } +} + +/* + * we pass in struct cpufreq_policy. This is safe because changing out the + * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP), + * which tears down all of the data structures and __cpufreq_governor(policy, + * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the + * new policy pointer + */ +static int cpufreq_sched_thread(void *data) +{ + struct sched_param param; + struct cpufreq_policy *policy; + struct gov_data *gd; + unsigned int new_request = 0; + unsigned int last_request = 0; + int ret; + + policy = (struct cpufreq_policy *) data; + gd = policy->governor_data; + + param.sched_priority = 50; + ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, ¶m); + if (ret) { + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + do_exit(-EINVAL); + } else { + pr_debug("%s: kthread (%d) set to SCHED_FIFO\n", + __func__, gd->task->pid); + } + + do { + set_current_state(TASK_INTERRUPTIBLE); + new_request = gd->requested_freq; + if (new_request == last_request) { + schedule(); + } else { + /* + * if the frequency thread sleeps while waiting to be + * unthrottled, start over to check for a newer request + */ + if (finish_last_request(gd)) + continue; + last_request = new_request; + cpufreq_sched_try_driver_target(policy, new_request); + } + } while (!kthread_should_stop()); + + return 0; +} + +static void cpufreq_sched_irq_work(struct irq_work *irq_work) +{ + struct gov_data *gd; + + gd = container_of(irq_work, struct gov_data, irq_work); + if (!gd) + return; + + wake_up_process(gd->task); +} + +static void update_fdomain_capacity_request(int cpu) +{ + unsigned int freq_new, index_new, cpu_tmp; + struct cpufreq_policy *policy; + struct gov_data *gd; + unsigned long capacity = 0; + + /* + * Avoid grabbing the policy if possible. A test is still + * required after locking the CPU's policy to avoid racing + * with the governor changing. 
+ */ + if (!per_cpu(enabled, cpu)) + return; + + policy = cpufreq_cpu_get(cpu); + if (IS_ERR_OR_NULL(policy)) + return; + + if (policy->governor != &cpufreq_gov_sched || + !policy->governor_data) + goto out; + + gd = policy->governor_data; + + /* find max capacity requested by cpus in this policy */ + for_each_cpu(cpu_tmp, policy->cpus) { + struct sched_capacity_reqs *scr; + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp); + capacity = max(capacity, scr->total); + } + + /* Convert the new maximum capacity request into a cpu frequency */ + freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; + if (cpufreq_frequency_table_target(policy, policy->freq_table, + freq_new, CPUFREQ_RELATION_L, + &index_new)) + goto out; + freq_new = policy->freq_table[index_new].frequency; + + if (freq_new == gd->requested_freq) + goto out; + + gd->requested_freq = freq_new; + + /* + * Throttling is not yet supported on platforms with fast cpufreq + * drivers. + */ + if (cpufreq_driver_slow) + irq_work_queue_on(&gd->irq_work, cpu); + else + cpufreq_sched_try_driver_target(policy, freq_new); + +out: + cpufreq_cpu_put(policy); +} + +void update_cpu_capacity_request(int cpu, bool request) +{ + unsigned long new_capacity; + struct sched_capacity_reqs *scr; + + /* The rq lock serializes access to the CPU's sched_capacity_reqs. */ + lockdep_assert_held(&cpu_rq(cpu)->lock); + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + + new_capacity = scr->cfs + scr->rt; + new_capacity = new_capacity * capacity_margin + / SCHED_CAPACITY_SCALE; + new_capacity += scr->dl; + + if (new_capacity == scr->total) + return; + + scr->total = new_capacity; + if (request) + update_fdomain_capacity_request(cpu); +} + +static inline void set_sched_freq(void) +{ + static_key_slow_inc(&__sched_freq); +} + +static inline void clear_sched_freq(void) +{ + static_key_slow_dec(&__sched_freq); +} + +static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) +{ + struct gov_data *gd; + int cpu; + + for_each_cpu(cpu, policy->cpus) + memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0, + sizeof(struct sched_capacity_reqs)); + + gd = kzalloc(sizeof(*gd), GFP_KERNEL); + if (!gd) + return -ENOMEM; + + gd->throttle_nsec = policy->cpuinfo.transition_latency ? 
+ policy->cpuinfo.transition_latency : + THROTTLE_NSEC; + pr_debug("%s: throttle threshold = %u [ns]\n", + __func__, gd->throttle_nsec); + + if (cpufreq_driver_is_slow()) { + cpufreq_driver_slow = true; + gd->task = kthread_create(cpufreq_sched_thread, policy, + "kschedfreq:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR_OR_NULL(gd->task)) { + pr_err("%s: failed to create kschedfreq thread\n", + __func__); + goto err; + } + get_task_struct(gd->task); + kthread_bind_mask(gd->task, policy->related_cpus); + wake_up_process(gd->task); + init_irq_work(&gd->irq_work, cpufreq_sched_irq_work); + } + + policy->governor_data = gd; + set_sched_freq(); + + return 0; + +err: + kfree(gd); + return -ENOMEM; +} + +static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) +{ + struct gov_data *gd = policy->governor_data; + + clear_sched_freq(); + if (cpufreq_driver_slow) { + kthread_stop(gd->task); + put_task_struct(gd->task); + } + + policy->governor_data = NULL; + + kfree(gd); + return 0; +} + +static int cpufreq_sched_start(struct cpufreq_policy *policy) +{ + int cpu; + + for_each_cpu(cpu, policy->cpus) + per_cpu(enabled, cpu) = 1; + + return 0; +} + +static int cpufreq_sched_stop(struct cpufreq_policy *policy) +{ + int cpu; + + for_each_cpu(cpu, policy->cpus) + per_cpu(enabled, cpu) = 0; + + return 0; +} + +static int cpufreq_sched_setup(struct cpufreq_policy *policy, + unsigned int event) +{ + switch (event) { + case CPUFREQ_GOV_POLICY_INIT: + return cpufreq_sched_policy_init(policy); + case CPUFREQ_GOV_POLICY_EXIT: + return cpufreq_sched_policy_exit(policy); + case CPUFREQ_GOV_START: + return cpufreq_sched_start(policy); + case CPUFREQ_GOV_STOP: + return cpufreq_sched_stop(policy); + case CPUFREQ_GOV_LIMITS: + break; + } + return 0; +} + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED +static +#endif +struct cpufreq_governor cpufreq_gov_sched = { + .name = "sched", + .governor = cpufreq_sched_setup, + .owner = THIS_MODULE, +}; + +static int __init cpufreq_sched_init(void) +{ + int cpu; + + for_each_cpu(cpu, cpu_possible_mask) + per_cpu(enabled, cpu) = 0; + return cpufreq_register_governor(&cpufreq_gov_sched); +} + +/* Try to make this the default governor */ +fs_initcall(cpufreq_sched_init); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b8a9c68f3fb..3eb5beb590a8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5064,7 +5064,7 @@ static inline unsigned long task_util(struct task_struct *p) return p->se.avg.util_avg; } -static unsigned int capacity_margin = 1280; /* ~20% margin */ +unsigned int capacity_margin = 1280; /* ~20% margin */ static inline bool __task_fits(struct task_struct *p, int cpu, int util) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fcbb3e07accc..ac28024b2d7a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1455,6 +1455,57 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) } #endif +#ifdef CONFIG_CPU_FREQ_GOV_SCHED +extern unsigned int capacity_margin; +extern struct static_key __sched_freq; + +static inline bool sched_freq(void) +{ + return static_key_false(&__sched_freq); +} + +DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); +void update_cpu_capacity_request(int cpu, bool request); + +static inline void set_cfs_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).cfs != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).cfs = capacity; + update_cpu_capacity_request(cpu, request); + } +} + +static 
inline void set_rt_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity; + update_cpu_capacity_request(cpu, request); + } +} + +static inline void set_dl_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).dl != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).dl = capacity; + update_cpu_capacity_request(cpu, request); + } +} +#else +static inline bool sched_freq(void) { return false; } +static inline void set_cfs_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +static inline void set_rt_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +static inline void set_dl_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); From ea429ccef14f4ef33e0a1c94a0369a435ae83807 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 19 Aug 2015 19:47:12 +0100 Subject: [PATCH 0092/3220] sched/fair: add triggers for OPP change requests Each time a task is {en,de}queued we might need to adapt the current frequency to the new usage. Add triggers on {en,de}queue_task_fair() for this purpose. Only trigger a freq request if we are effectively waking up or going to sleep. Filter out load balancing related calls to reduce the number of triggers. [smuckle@linaro.org: resolve merge conflicts, define task_new, use renamed static key sched_freq] cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/fair.c | 46 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3eb5beb590a8..844788da00be 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4144,6 +4144,21 @@ static inline void hrtick_update(struct rq *rq) } #endif +static unsigned long capacity_orig_of(int cpu); +static int cpu_util(int cpu); + +static void update_capacity_of(int cpu) +{ + unsigned long req_cap; + + if (!sched_freq()) + return; + + /* Convert scale-invariant capacity to cpu. */ + req_cap = cpu_util(cpu) * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); + set_cfs_cpu_capacity(cpu, true, req_cap); +} + static bool cpu_overutilized(int cpu); /* @@ -4193,6 +4208,20 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; + + /* + * We want to potentially trigger a freq switch + * request only for tasks that are waking up; this is + * because we get here also during load balancing, but + * in these cases it seems wise to trigger as single + * request after load balancing is done. + * + * XXX: how about fork()? Do we need a special + * flag/something to tell if we are here after a + * fork() (wakeup_task_new)? 
+ */ + if (!task_new) + update_capacity_of(cpu_of(rq)); } hrtick_update(rq); } @@ -4251,9 +4280,24 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { sub_nr_running(rq, 1); + /* + * We want to potentially trigger a freq switch + * request only for tasks that are going to sleep; + * this is because we get here also during load + * balancing, but in these cases it seems wise to + * trigger as single request after load balancing is + * done. + */ + if (task_sleep) { + if (rq->cfs.nr_running) + update_capacity_of(cpu_of(rq)); + else if (sched_freq()) + set_cfs_cpu_capacity(cpu_of(rq), false, 0); + } + } hrtick_update(rq); } From 7ff814dd71331fc41d513f422da81ed65e56be47 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 26 Jun 2015 12:14:23 +0100 Subject: [PATCH 0093/3220] sched/{core,fair}: trigger OPP change request on fork() Patch "sched/fair: add triggers for OPP change requests" introduced OPP change triggers for enqueue_task_fair(), but the trigger was operating only for wakeups. Fact is that it makes sense to consider wakeup_new also (i.e., fork()), as we don't know anything about a newly created task and thus we most certainly want to jump to max OPP to not harm performance too much. However, it is not currently possible (or at least it wasn't evident to me how to do so :/) to tell new wakeups from other (non wakeup) operations. This patch introduces an additional flag in sched.h that is only set at fork() time and it is then consumed in enqueue_task_fair() for our purpose. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 9 +++------ kernel/sched/sched.h | 1 + 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 23e82805ec47..5c8d0dbaae5d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2387,7 +2387,7 @@ void wake_up_new_task(struct task_struct *p) #endif rq = __task_rq_lock(p); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 844788da00be..3937f79240df 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4171,7 +4171,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int task_new = !(flags & ENQUEUE_WAKEUP); + int task_new = flags & ENQUEUE_WAKEUP_NEW; + int task_wakeup = flags & ENQUEUE_WAKEUP; for_each_sched_entity(se) { if (se->on_rq) @@ -4215,12 +4216,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * because we get here also during load balancing, but * in these cases it seems wise to trigger as single * request after load balancing is done. - * - * XXX: how about fork()? Do we need a special - * flag/something to tell if we are here after a - * fork() (wakeup_task_new)? 
*/ - if (!task_new) + if (task_new || task_wakeup) update_capacity_of(cpu_of(rq)); } hrtick_update(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ac28024b2d7a..0052ada93ae8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1181,6 +1181,7 @@ static const u32 prio_to_wmult[40] = { #endif #define ENQUEUE_REPLENISH 0x08 #define ENQUEUE_RESTORE 0x10 +#define ENQUEUE_WAKEUP_NEW 0x20 #define DEQUEUE_SLEEP 0x01 #define DEQUEUE_SAVE 0x02 From f99e3fe6eb9cba8f645f8d00819d8007ee787e2c Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 25 Jun 2015 14:37:27 +0100 Subject: [PATCH 0094/3220] sched/fair: cpufreq_sched triggers for load balancing As we don't trigger freq changes from {en,de}queue_task_fair() during load balancing, we need to do explicitly so on load balancing paths. [smuckle@linaro.org: move update_capacity_of calls so rq lock is held] cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/fair.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3937f79240df..950046b1b8eb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6384,6 +6384,10 @@ static void attach_one_task(struct rq *rq, struct task_struct *p) { raw_spin_lock(&rq->lock); attach_task(rq, p); + /* + * We want to potentially raise target_cpu's OPP. + */ + update_capacity_of(cpu_of(rq)); raw_spin_unlock(&rq->lock); } @@ -6405,6 +6409,11 @@ static void attach_tasks(struct lb_env *env) attach_task(env->dst_rq, p); } + /* + * We want to potentially raise env.dst_cpu's OPP. + */ + update_capacity_of(env->dst_cpu); + raw_spin_unlock(&env->dst_rq->lock); } @@ -7667,6 +7676,11 @@ more_balance: * ld_moved - cumulative load moved across iterations */ cur_ld_moved = detach_tasks(&env); + /* + * We want to potentially lower env.src_cpu's OPP. + */ + if (cur_ld_moved) + update_capacity_of(env.src_cpu); /* * We've detached some tasks from busiest_rq. Every @@ -8037,8 +8051,13 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); + /* + * We want to potentially lower env.src_cpu's OPP. + */ + update_capacity_of(env.src_cpu); + } else schedstat_inc(sd, alb_failed); } From 6b6c1924539526c89384067fe70b9fe856c7b626 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Thu, 25 Jun 2015 14:12:33 +0100 Subject: [PATCH 0095/3220] sched/fair: jump to max OPP when crossing UP threshold Since the true utilization of a long running task is not detectable while it is running and might be bigger than the current cpu capacity, create the maximum cpu capacity head room by requesting the maximum cpu capacity once the cpu usage plus the capacity margin exceeds the current capacity. This is also done to try to harm the performance of a task the least. Original fair-class only version authored by Juri Lelli . 
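
For illustration (numbers are made up; capacity_margin defaults to 1280,
i.e. a ~20% margin): a CPU with cpu_util() = 400 and no RT/DL capacity
requests yields a margin-scaled request of 400 * 1280 / 1024 = 500, so if
the current capacity is, say, 480, the tick handler requests the maximum
capacity and the task can keep building up its utilization unthrottled.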
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/core.c | 41 ++++++++++++++++++++++++++ kernel/sched/fair.c | 66 ------------------------------------------ kernel/sched/sched.h | 68 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 66 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5c8d0dbaae5d..ac3d9af477f4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2854,6 +2854,45 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } +#ifdef CONFIG_CPU_FREQ_GOV_SCHED +static unsigned long sum_capacity_reqs(unsigned long cfs_cap, + struct sched_capacity_reqs *scr) +{ + unsigned long total = cfs_cap + scr->rt; + + total = total * capacity_margin; + total /= SCHED_CAPACITY_SCALE; + total += scr->dl; + return total; +} + +static void sched_freq_tick(int cpu) +{ + struct sched_capacity_reqs *scr; + unsigned long capacity_orig, capacity_curr; + + if (!sched_freq()) + return; + + capacity_orig = capacity_orig_of(cpu); + capacity_curr = capacity_curr_of(cpu); + if (capacity_curr == capacity_orig) + return; + + /* + * To make free room for a task that is building up its "real" + * utilization and to harm its performance the least, request + * a jump to max OPP as soon as the margin of free capacity is + * impacted (specified by capacity_margin). + */ + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + if (capacity_curr < sum_capacity_reqs(cpu_util(cpu), scr)) + set_cfs_cpu_capacity(cpu, true, capacity_max); +} +#else +static inline void sched_freq_tick(int cpu) { } +#endif + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -2880,6 +2919,8 @@ void scheduler_tick(void) trigger_load_balance(rq); #endif rq_last_tick_reset(rq); + + sched_freq_tick(cpu); } #ifdef CONFIG_NO_HZ_FULL diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 950046b1b8eb..bb1d0e3d7835 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4144,9 +4144,6 @@ static inline void hrtick_update(struct rq *rq) } #endif -static unsigned long capacity_orig_of(int cpu); -static int cpu_util(int cpu); - static void update_capacity_of(int cpu) { unsigned long req_cap; @@ -4521,15 +4518,6 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static unsigned long capacity_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity; -} - -static unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} static unsigned long cpu_avg_load_per_task(int cpu) { @@ -4698,60 +4686,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif -/* - * Returns the current capacity of cpu after applying both - * cpu and freq scaling. - */ -static unsigned long capacity_curr_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig * - arch_scale_freq_capacity(NULL, cpu) - >> SCHED_CAPACITY_SHIFT; -} - -/* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. 
It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - */ -static unsigned long __cpu_util(int cpu, int delta) -{ - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); - - delta += util; - if (delta < 0) - return 0; - - return (delta >= capacity) ? capacity : delta; -} - -static unsigned long cpu_util(int cpu) -{ - return __cpu_util(cpu, 0); -} - static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0052ada93ae8..6aaff682adb8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1456,7 +1456,75 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) } #endif +#ifdef CONFIG_SMP +static inline unsigned long capacity_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity; +} + +static inline unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} + +/* + * cpu_util returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. 
We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). + */ +static inline unsigned long __cpu_util(int cpu, int delta) +{ + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); + + delta += util; + if (delta < 0) + return 0; + + return (delta >= capacity) ? capacity : delta; +} + +static inline unsigned long cpu_util(int cpu) +{ + return __cpu_util(cpu, 0); +} + +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +static inline unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + +#endif + #ifdef CONFIG_CPU_FREQ_GOV_SCHED +#define capacity_max SCHED_CAPACITY_SCALE extern unsigned int capacity_margin; extern struct static_key __sched_freq; From 9d44dc7fe29c4b925ddbbd6a2551929da28dfbfb Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Wed, 25 Nov 2015 15:59:25 -0800 Subject: [PATCH 0096/3220] sched/cpufreq_sched: add trace events Trace events will aid in debugging, profiling and tuning. Signed-off-by: Steve Muckle --- include/trace/events/cpufreq_sched.h | 87 ++++++++++++++++++++++++++++ kernel/sched/cpufreq_sched.c | 9 +++ 2 files changed, 96 insertions(+) create mode 100644 include/trace/events/cpufreq_sched.h diff --git a/include/trace/events/cpufreq_sched.h b/include/trace/events/cpufreq_sched.h new file mode 100644 index 000000000000..a46cd088e969 --- /dev/null +++ b/include/trace/events/cpufreq_sched.h @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2015 Steve Muckle + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cpufreq_sched + +#if !defined(_TRACE_CPUFREQ_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CPUFREQ_SCHED_H + +#include +#include + +TRACE_EVENT(cpufreq_sched_throttled, + TP_PROTO(unsigned int rem), + TP_ARGS(rem), + TP_STRUCT__entry( + __field( unsigned int, rem) + ), + TP_fast_assign( + __entry->rem = rem; + ), + TP_printk("throttled - %d usec remaining", __entry->rem) +); + +TRACE_EVENT(cpufreq_sched_request_opp, + TP_PROTO(int cpu, + unsigned long capacity, + unsigned int freq_new, + unsigned int requested_freq), + TP_ARGS(cpu, capacity, freq_new, requested_freq), + TP_STRUCT__entry( + __field( int, cpu) + __field( unsigned long, capacity) + __field( unsigned int, freq_new) + __field( unsigned int, requested_freq) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->capacity = capacity; + __entry->freq_new = freq_new; + __entry->requested_freq = requested_freq; + ), + TP_printk("cpu %d cap change, cluster cap request %ld => OPP %d " + "(cur %d)", + __entry->cpu, __entry->capacity, __entry->freq_new, + __entry->requested_freq) +); + +TRACE_EVENT(cpufreq_sched_update_capacity, + TP_PROTO(int cpu, + bool request, + struct sched_capacity_reqs *scr, + unsigned long new_capacity), + TP_ARGS(cpu, request, scr, new_capacity), + TP_STRUCT__entry( + __field( int, cpu) + __field( bool, request) + __field( unsigned long, cfs) + __field( unsigned long, rt) + __field( unsigned long, dl) + __field( unsigned long, total) + __field( unsigned long, new_total) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->request = request; + __entry->cfs = scr->cfs; + __entry->rt = scr->rt; + __entry->dl = scr->dl; + __entry->total = scr->total; + __entry->new_total = new_capacity; + ), + TP_printk("cpu=%d set_cap=%d cfs=%ld rt=%ld dl=%ld old_tot=%ld " + "new_tot=%ld", + __entry->cpu, __entry->request, __entry->cfs, __entry->rt, + __entry->dl, __entry->total, __entry->new_total) +); + +#endif /* _TRACE_CPUFREQ_SCHED_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index 58bca8d2ca65..5afe56a82491 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -14,6 +14,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + #include "sched.h" #define THROTTLE_NSEC 50000000 /* 50ms default */ @@ -78,6 +81,7 @@ static bool finish_last_request(struct gov_data *gd) int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now)); usec_left /= NSEC_PER_USEC; + trace_cpufreq_sched_throttled(usec_left); usleep_range(usec_left, usec_left + 100); now = ktime_get(); if (ktime_after(now, gd->throttle)) @@ -186,6 +190,9 @@ static void update_fdomain_capacity_request(int cpu) goto out; freq_new = policy->freq_table[index_new].frequency; + trace_cpufreq_sched_request_opp(cpu, capacity, freq_new, + gd->requested_freq); + if (freq_new == gd->requested_freq) goto out; @@ -222,6 +229,8 @@ void update_cpu_capacity_request(int cpu, bool request) if (new_capacity == scr->total) return; + trace_cpufreq_sched_update_capacity(cpu, request, scr, new_capacity); + scr->total = new_capacity; if (request) update_fdomain_capacity_request(cpu); From cd248fa758ba2358d4dbf612090e6bbc075c46b1 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 20 Oct 2015 10:46:26 +0200 Subject: [PATCH 0097/3220] sched: remove call of sched_avg_update from sched_rt_avg_update rt_avg is only used to scale the available CPU's capacity for CFS tasks. 
As the update of this scaling is done during periodic load balance, we only have to ensure that sched_avg_update has been called before any periodic load balancing. This requirement is already fulfilled by __update_cpu_load so the call in sched_rt_avg_update, which is part of the hotpath, is useless. Signed-off-by: Vincent Guittot Signed-off-by: Steve Muckle --- kernel/sched/sched.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6aaff682adb8..770e119ca692 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1578,7 +1578,6 @@ static inline void set_dl_cpu_capacity(int cpu, bool request, static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); - sched_avg_update(rq); } #else static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } From fab5cc59bf31aac4bcb222c7d1013a033cb8ef70 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 3 Nov 2015 10:39:01 +0100 Subject: [PATCH 0098/3220] sched: deadline: use deadline bandwidth in scale_rt_capacity Instead of monitoring the exec time of deadline tasks to evaluate the CPU capacity consumed by deadline scheduler class, we can directly calculate it thanks to the sum of utilization of deadline tasks on the CPU. We can remove deadline tasks from rt_avg metric and directly use the average bandwidth of deadline scheduler in scale_rt_capacity. Based in part on a similar patch from Luca Abeni . Signed-off-by: Vincent Guittot Signed-off-by: Steve Muckle --- kernel/sched/deadline.c | 33 +++++++++++++++++++++++++++++++-- kernel/sched/fair.c | 8 ++++++++ kernel/sched/sched.h | 2 ++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8b0a15e285f9..9d9eb50d4059 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,24 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +static void add_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + u64 se_bw = dl_se->dl_bw; + + dl_rq->avg_bw += se_bw; +} + +static void clear_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + u64 se_bw = dl_se->dl_bw; + + dl_rq->avg_bw -= se_bw; + if (dl_rq->avg_bw < 0) { + WARN_ON(1); + dl_rq->avg_bw = 0; + } +} + static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; @@ -494,6 +512,9 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + if (dl_se->dl_new) + add_average_bw(dl_se, dl_rq); + /* * The arrival of a new instance needs special treatment, i.e., * the actual scheduling parameters have to be "renewed". @@ -741,8 +762,6 @@ static void update_curr_dl(struct rq *rq) curr->se.exec_start = rq_clock_task(rq); cpuacct_charge(curr, delta_exec); - sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; if (dl_runtime_exceeded(dl_se)) { dl_se->dl_throttled = 1; @@ -1241,6 +1260,8 @@ static void task_fork_dl(struct task_struct *p) static void task_dead_dl(struct task_struct *p) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + struct dl_rq *dl_rq = dl_rq_of_se(&p->dl); + struct rq *rq = rq_of_dl_rq(dl_rq); /* * Since we are TASK_DEAD we won't slip out of the domain! 
@@ -1249,6 +1270,8 @@ static void task_dead_dl(struct task_struct *p)
 	/* XXX we should retain the bw until 0-lag */
 	dl_b->total_bw -= p->dl.dl_bw;
 	raw_spin_unlock_irq(&dl_b->lock);
+
+	clear_average_bw(&p->dl, &rq->dl);
 }
 
 static void set_curr_task_dl(struct rq *rq)
@@ -1556,7 +1579,9 @@ retry:
 	}
 
 	deactivate_task(rq, next_task, 0);
+	clear_average_bw(&next_task->dl, &rq->dl);
 	set_task_cpu(next_task, later_rq->cpu);
+	add_average_bw(&next_task->dl, &later_rq->dl);
 	activate_task(later_rq, next_task, 0);
 	ret = 1;
 
@@ -1644,7 +1669,9 @@ static void pull_dl_task(struct rq *this_rq)
 			resched = true;
 
 			deactivate_task(src_rq, p, 0);
+			clear_average_bw(&p->dl, &src_rq->dl);
 			set_task_cpu(p, this_cpu);
+			add_average_bw(&p->dl, &this_rq->dl);
 			activate_task(this_rq, p, 0);
 			dmin = p->dl.deadline;
 
@@ -1750,6 +1777,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	if (!start_dl_timer(p))
 		__dl_clear_params(p);
 
+	clear_average_bw(&p->dl, &rq->dl);
+
 	/*
 	 * Since this might be the only -deadline task on the rq,
 	 * this is the right place to try to pull some other one
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bb1d0e3d7835..651e160cdd67 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6550,6 +6550,14 @@ static unsigned long scale_rt_capacity(int cpu)
 
 	used = div_u64(avg, total);
 
+	/*
+	 * deadline bandwidth is defined at system level so we must
+	 * weight this bandwidth with the max capacity of the system.
+	 * As a reminder, avg_bw is 20bits width and
+	 * scale_cpu_capacity is 10 bits width
+	 */
+	used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
+
 	if (likely(used < SCHED_CAPACITY_SCALE))
 		return SCHED_CAPACITY_SCALE - used;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 770e119ca692..983c132af11b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -506,6 +506,8 @@ struct dl_rq {
 #else
 	struct dl_bw dl_bw;
 #endif
+	/* This is the "average utilization" for this runqueue */
+	s64 avg_bw;
 };
 
 #ifdef CONFIG_SMP

From 6e1e1ed8720652305b77b6d01f80276ef076aff0 Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Mon, 26 Oct 2015 18:14:50 +0100
Subject: [PATCH 0099/3220] sched: rt scheduler sets capacity requirement

RT tasks don't provide any running constraints like deadline ones
except their running priority. The only currently usable input to
estimate the capacity needed by RT tasks is the rt_avg metric. We use
it to estimate the CPU capacity needed for the RT scheduler class.

In order to monitor the evolution of RT task load, we must periodically
check it during the tick. Then, we use the estimated capacity of the
last activity to estimate the next one, which cannot be that accurate
but is a good starting point without any impact on the wake up path of
RT tasks.

Signed-off-by: Vincent Guittot
Signed-off-by: Steve Muckle
---
 kernel/sched/rt.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8ec86abe0ea1..9694204660b7 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1426,6 +1426,41 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
 #endif
 }
 
+#ifdef CONFIG_SMP
+static void sched_rt_update_capacity_req(struct rq *rq)
+{
+	u64 total, used, age_stamp, avg;
+	s64 delta;
+
+	if (!sched_freq())
+		return;
+
+	sched_avg_update(rq);
+	/*
+	 * Since we're reading these variables without serialization make sure
+	 * we read them once before doing sanity checks on them.
+	 */
+	age_stamp = READ_ONCE(rq->age_stamp);
+	avg = READ_ONCE(rq->rt_avg);
+	delta = rq_clock(rq) - age_stamp;
+
+	if (unlikely(delta < 0))
+		delta = 0;
+
+	total = sched_avg_period() + delta;
+
+	used = div_u64(avg, total);
+	if (unlikely(used > SCHED_CAPACITY_SCALE))
+		used = SCHED_CAPACITY_SCALE;
+
+	set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used));
+}
+#else
+static inline void sched_rt_update_capacity_req(struct rq *rq)
+{ }
+
+#endif
+
 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 						   struct rt_rq *rt_rq)
 {
@@ -1494,8 +1529,17 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 	if (prev->sched_class == &rt_sched_class)
 		update_curr_rt(rq);
 
-	if (!rt_rq->rt_queued)
+	if (!rt_rq->rt_queued) {
+		/*
+		 * The next task to be picked on this rq will have a lower
+		 * priority than rt tasks so we can spend some time to update
+		 * the capacity used by rt tasks based on the last activity.
+		 * This value will then be used as an estimate of the next
+		 * activity.
+		 */
+		sched_rt_update_capacity_req(rq);
 		return NULL;
+	}
 
 	put_prev_task(rq, prev);
 
@@ -2212,6 +2256,9 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 
 	update_curr_rt(rq);
 
+	if (rq->rt.rt_nr_running)
+		sched_rt_update_capacity_req(rq);
+
 	watchdog(rq, p);
 
 	/*

From 3eb29105cfe14719e0b61c782efd4b3b509d786d Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Fri, 11 Dec 2015 11:55:51 +0000
Subject: [PATCH 0100/3220] fixup! sched: scheduler-driven cpu frequency selection

Signed-off-by: Juri Lelli
---
 kernel/sched/cpufreq_sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
index 5afe56a82491..e1d208e101ed 100644
--- a/kernel/sched/cpufreq_sched.c
+++ b/kernel/sched/cpufreq_sched.c
@@ -119,9 +119,9 @@ static int cpufreq_sched_thread(void *data)
 	}
 
 	do {
-		set_current_state(TASK_INTERRUPTIBLE);
 		new_request = gd->requested_freq;
 		if (new_request == last_request) {
+			set_current_state(TASK_INTERRUPTIBLE);
 			schedule();
 		} else {
 			/*

From 08d1cfd7c9193272e6caa100cbc5ef875cb5543c Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Fri, 11 Dec 2015 11:58:05 +0000
Subject: [PATCH 0101/3220] fixup! sched/fair: jump to max OPP when crossing UP threshold

Signed-off-by: Juri Lelli
---
 kernel/sched/core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ac3d9af477f4..151ed9c62ad2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2910,6 +2910,7 @@ void scheduler_tick(void)
 	curr->sched_class->task_tick(rq, curr, 0);
 	update_cpu_load_active(rq);
 	calc_global_load_tick(rq);
+	sched_freq_tick(cpu);
 	raw_spin_unlock(&rq->lock);
 
 	perf_event_task_tick();
@@ -2919,8 +2920,6 @@ void scheduler_tick(void)
 	trigger_load_balance(rq);
 #endif
 	rq_last_tick_reset(rq);
-
-	sched_freq_tick(cpu);
 }
 
 #ifdef CONFIG_NO_HZ_FULL

From 8aaf022dd7e5390b3e95fe92a8f49b1de949a687 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Tue, 30 Jun 2015 12:03:26 +0100
Subject: [PATCH 0102/3220] sched/tune: add detailed documentation

The topic of a single simple power-performance tunable that is wholly
scheduler centric and has well defined and predictable properties has
come up on several occasions in the past. With techniques such as
scheduler-driven DVFS, we now have a good framework for implementing
such a tunable.

This patch provides a detailed description of the motivations and
design decisions behind the implementation of SchedTune.
cc: Jonathan Corbet
cc: linux-doc@vger.kernel.org
Signed-off-by: Patrick Bellasi
---
 Documentation/scheduler/sched-tune.txt | 366 +++++++++++++++++++++++++
 1 file changed, 366 insertions(+)
 create mode 100644 Documentation/scheduler/sched-tune.txt

diff --git a/Documentation/scheduler/sched-tune.txt b/Documentation/scheduler/sched-tune.txt
new file mode 100644
index 000000000000..9bd2231c01b1
--- /dev/null
+++ b/Documentation/scheduler/sched-tune.txt
@@ -0,0 +1,366 @@
+             Central, scheduler-driven, power-performance control
+                               (EXPERIMENTAL)
+
+Abstract
+========
+
+The topic of a single simple power-performance tunable that is wholly
+scheduler centric and has well defined and predictable properties has come up
+on several occasions in the past [1,2]. With techniques such as scheduler
+driven DVFS [3], we now have a good framework for implementing such a tunable.
+This document describes the overall ideas behind its design and
+implementation.
+
+
+Table of Contents
+=================
+
+1. Motivation
+2. Introduction
+3. Signal Boosting Strategy
+4. OPP selection using boosted CPU utilization
+5. Per task group boosting
+6. Questions and Answers
+   - What about "auto" mode?
+   - What about boosting on a congested system?
+   - How are CPUs boosted when we have tasks with multiple boost values?
+7. References
+
+
+1. Motivation
+=============
+
+Sched-DVFS [3] is a new event-driven cpufreq governor which allows the
+scheduler to select the optimal DVFS operating point (OPP) for running a task
+allocated to a CPU. The introduction of sched-DVFS enables running workloads
+at the most energy efficient OPPs.
+
+However, sometimes it may be desired to intentionally boost the performance of
+a workload even if that could imply a reasonable increase in energy
+consumption. For example, in order to reduce the response time of a task, we
+may want to run the task at a higher OPP than the one that is actually
+required by its CPU bandwidth demand.
+
+This last requirement is especially important if we consider that one of the
+main goals of the sched-DVFS component is to replace all currently available
+CPUFreq policies. Since sched-DVFS is event based, as opposed to the sampling
+driven governors we currently have, it is already more responsive at selecting
+the optimal OPP to run tasks allocated to a CPU. However, just tracking the
+actual task load demand may not be enough from a performance standpoint. For
+example, it is not possible to get behaviors similar to those provided by the
+"performance" and "interactive" CPUFreq governors.
+
+This document describes an implementation of a tunable, stacked on top of
+sched-DVFS, which extends its functionality to support task performance
+boosting.
+
+By "performance boosting" we mean the reduction of the time required to
+complete a task activation, i.e. the time elapsed from a task wakeup to its
+next deactivation (e.g. because it goes back to sleep or it terminates). For
+example, if we consider a simple periodic task which executes the same
+workload for 5[s] every 20[s] while running at a certain OPP, a boosted
+execution of that task must complete each of its activations in less than
+5[s].
+
+A previous attempt [5] to introduce such a boosting feature has not been
+successful mainly because of the complexity of the proposed solution. The
+approach described in this document exposes a single simple interface to
+user-space.
+This single tunable knob allows the tuning of system-wide scheduler
+behaviours ranging from energy efficiency at one end through to incremental
+performance boosting at the other end. This first tunable affects all tasks.
+However, a more advanced extension of the concept is also provided which uses
+CGroups to boost the performance of only selected tasks while using the
+energy efficient default for all others.
+
+The rest of this document introduces in more detail the proposed solution,
+which has been named SchedTune.
+
+
+2. Introduction
+===============
+
+SchedTune exposes a simple user-space interface with a single power-performance
+tunable:
+
+  /proc/sys/kernel/sched_cfs_boost
+
+This permits expressing a boost value as an integer in the range [0..100].
+
+A value of 0 (default) configures the CFS scheduler for maximum energy
+efficiency. This means that sched-DVFS runs the tasks at the minimum OPP
+required to satisfy their workload demand.
+A value of 100 configures the scheduler for maximum performance, which
+translates to the selection of the maximum OPP on that CPU.
+
+Values between 0 and 100 can be used to suit other scenarios, for example to
+satisfy interactive response requirements or to react to other system events
+(battery level, etc).
+
+A CGroup based extension is also provided, which permits further user-space
+defined task classification to tune the scheduler for different goals
+depending on the specific nature of the task, e.g. background vs interactive
+vs low-priority.
+
+The overall design of the SchedTune module is built on top of "Per-Entity
+Load Tracking" (PELT) signals and sched-DVFS by introducing a bias on the
+Operating Performance Point (OPP) selection.
+Each time a task is allocated on a CPU, sched-DVFS has the opportunity to tune
+the operating frequency of that CPU to better match the workload demand. The
+selection of the actual OPP being activated is influenced by the global boost
+value, or the boost value for the task CGroup when in use.
+
+This simple biasing approach leverages existing frameworks, which means
+minimal modifications to the scheduler, and yet it makes it possible to
+achieve a range of different behaviours, all from a single simple tunable
+knob. The only new concept introduced is that of signal boosting.
+
+
+3. Signal Boosting Strategy
+===========================
+
+The whole PELT machinery works based on the value of a few load tracking
+signals which basically track the CPU bandwidth requirements for tasks and
+the capacity of CPUs. The basic idea behind the SchedTune knob is to
+artificially inflate some of these load tracking signals to make a task or RQ
+appear more demanding than it actually is.
+
+Which signals have to be inflated depends on the specific "consumer". However,
+independently of the specific (signal, consumer) pair, it is important to
+define a simple and possibly consistent strategy for the concept of boosting
+a signal.
+
+A boosting strategy defines how the "abstract" user-space defined
+sched_cfs_boost value is translated into an internal "margin" value to be
+added to a signal to get its inflated value:
+
+  margin         := boosting_strategy(sched_cfs_boost, signal)
+  boosted_signal := signal + margin
+
+Different boosting strategies were identified and analyzed before selecting
+the one found to be most effective.
+
+Signal Proportional Compensation (SPC)
+--------------------------------------
+
+In this boosting strategy the sched_cfs_boost value is used to compute a
+margin which is proportional to the complement of the original signal.
+When a signal has a maximum possible value, its complement is defined as
+the delta between the actual value and its possible maximum.
+
+Since the tunable implementation uses signals which have SCHED_LOAD_SCALE as
+the maximum possible value, the margin becomes:
+
+	margin := sched_cfs_boost * (SCHED_LOAD_SCALE - signal)
+
+Using this boosting strategy:
+- a 100% sched_cfs_boost means that the signal is scaled to the maximum value
+- each value in the range of sched_cfs_boost effectively inflates the signal
+  in question by a quantity which is proportional to the maximum value.
+
+For example, by applying the SPC boosting strategy to the selection of the OPP
+to run a task it is possible to achieve these behaviors:
+
+- 0% boosting:   run the task at the minimum OPP required by its workload
+- 100% boosting: run the task at the maximum OPP available for the CPU
+- 50% boosting:  run at the half-way OPP between minimum and maximum
+
+Which means that, at 50% boosting, a task will be scheduled to run at half of
+the maximum theoretically achievable performance on the specific target
+platform.
+
+A graphical representation of an SPC boosted signal is represented in the
+following figure where:
+ a) "-" represents the original signal
+ b) "b" represents a  50% boosted signal
+ c) "p" represents a 100% boosted signal
+
+
+   ^
+   |  SCHED_LOAD_SCALE
+   +-----------------------------------------------------------------+
+   |pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
+   |
+   |                                             boosted_signal
+   |                                          bbbbbbbbbbbbbbbbbbbbbbbb
+   |
+   |                                            original signal
+   |                  bbbbbbbbbbbbbbbbbbbbbbbb+----------------------+
+   |                                          |
+   |bbbbbbbbbbbbbbbbbb                        |
+   |                                          |
+   |                                          |
+   |                                          |
+   |                  +-----------------------+
+   |                  |
+   |                  |
+   |                  |
+   |------------------+
+   |
+   |
+   +----------------------------------------------------------------------->
+
+The plot above shows a ramped load signal (labelled 'original signal') and
+its boosted equivalent. For each step of the original signal the boosted
+signal corresponding to a 50% boost is midway from the original signal and
+the upper bound. Boosting by 100% generates a boosted signal which is always
+saturated to the upper bound.
+
+
+4. OPP selection using boosted CPU utilization
+==============================================
+
+It is worth calling out that the implementation does not introduce any new
+load signals. Instead, it provides an API to tune existing signals. This
+tuning is done on demand and only in scheduler code paths where it is
+sensible to do so. The new API calls are defined to return either the default
+signal or a boosted one, depending on the value of sched_cfs_boost. This is a
+clean and non-invasive modification of the existing code paths.
+
+The signal representing a CPU's utilization is boosted according to the
+previously described SPC boosting strategy. To sched-DVFS, this allows a CPU
+(i.e. a CFS run-queue) to appear more utilized than it actually is.
+
+Thus, with sched_cfs_boost enabled we have the following main functions to
+get the current utilization of a CPU:
+
+  cpu_util()
+  boosted_cpu_util()
+
+The new boosted_cpu_util() is similar to cpu_util(), but it returns a boosted
+utilization signal which is a function of the sched_cfs_boost value.
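+
+As a minimal, stand-alone illustration of this transformation (plain C, not
+the actual kernel implementation; SCHED_LOAD_SCALE is assumed to be 1024):
+
+  #define SCHED_LOAD_SCALE 1024
+
+  /* SPC: margin proportional to the complement of the signal */
+  unsigned long boosted_util(unsigned long util, unsigned int boost_pct)
+  {
+          unsigned long margin = (SCHED_LOAD_SCALE - util) * boost_pct / 100;
+
+          return util + margin;
+  }
+
+For example, boosted_util(512, 50) returns 768, i.e. midway between the
+original utilization and the upper bound, while boosted_util(512, 100)
+saturates to SCHED_LOAD_SCALE. boosted_cpu_util() applies this kind of
+transformation to cpu_util().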
+
+boosted_cpu_util() is used in the CFS scheduler code paths where sched-DVFS
+needs to decide the OPP to run a CPU at.
+For example, this allows selecting the highest OPP for a CPU which has
+the boost value set to 100%.
+
+
+5. Per task group boosting
+==========================
+
+The availability of a single knob which is used to boost all tasks in the
+system is certainly a simple solution, but it quite likely doesn't fit many
+utilization scenarios, especially in the mobile device space.
+
+For example, on battery powered devices there usually are many background
+services which are long running and need energy efficient scheduling. On the
+other hand, some applications are more performance sensitive and require an
+interactive response and/or maximum performance, regardless of the energy
+cost. To better service such scenarios, the SchedTune implementation has an
+extension that provides a more fine grained boosting interface.
+
+A new CGroup controller, namely "schedtune", can be enabled; it allows
+defining and configuring task groups with different boost values.
+Tasks that require special performance can be put into separate CGroups.
+The value of the boost associated with the tasks in this group can be
+specified using a single knob exposed by the CGroup controller:
+
+  schedtune.boost
+
+This knob allows the definition of a boost value that is to be used for
+SPC boosting of all tasks attached to this group.
+
+The current schedtune controller implementation is really simple and has
+these main characteristics:
+
+  1) It is only possible to create hierarchies one level deep
+
+     The root control group defines the system-wide boost value to be applied
+     by default to all tasks. Its direct subgroups are named "boost groups"
+     and they define the boost value for specific sets of tasks.
+     Further nested subgroups are not allowed since they do not have a
+     sensible meaning from a user-space standpoint.
+
+  2) It is possible to define only a limited number of "boost groups"
+
+     This number is defined at compile time and by default configured to 16.
+     This is a design decision motivated by two main reasons:
+     a) In a real system we do not expect utilization scenarios with more
+        than a few boost groups. For example, a reasonable collection of
+        groups could be just "background", "interactive" and "performance".
+     b) It simplifies the implementation considerably, especially for the
+        code which has to compute the per CPU boosting once there are
+        multiple RUNNABLE tasks with different boost values.
+
+Such a simple design should allow servicing the main utilization scenarios
+identified so far. It provides a simple interface which can be used to manage
+the power-performance of all tasks or only selected tasks.
+Moreover, this interface can be easily integrated by user-space run-times
+(e.g. Android, ChromeOS) to implement a QoS solution for task boosting based
+on task classification, which has been a long standing requirement.
+
+Setup and usage
+---------------
+
+0. Use a kernel with CGROUP_SCHEDTUNE support enabled
+
+1. Check that the "schedtune" CGroup controller is available:
+
+   root@linaro-nano:~# cat /proc/cgroups
+   #subsys_name	hierarchy	num_cgroups	enabled
+   cpuset   	0		1		1
+   cpu      	0		1		1
+   schedtune	0		1		1
+
+2. Mount a tmpfs to create the CGroups mount point (Optional)
+
+   root@linaro-nano:~# sudo mount -t tmpfs cgroups /sys/fs/cgroup
+
+3. Mount the "schedtune" controller
+
+   root@linaro-nano:~# mkdir /sys/fs/cgroup/stune
+   root@linaro-nano:~# sudo mount -t cgroup -o schedtune stune /sys/fs/cgroup/stune
+
+4. Setup the system-wide boost value (Optional)
+
+   If not configured, the root control group has a 0% boost value, which
+   basically disables boosting for all tasks in the system, thus running in
+   an energy-efficient mode.
+
+   root@linaro-nano:~# echo $SYSBOOST > /sys/fs/cgroup/stune/schedtune.boost
+
+5. Create task groups and configure their specific boost value (Optional)
+
+   For example, here we create a "performance" boost group configured to
+   boost all its tasks to 100%
+
+   root@linaro-nano:~# mkdir /sys/fs/cgroup/stune/performance
+   root@linaro-nano:~# echo 100 > /sys/fs/cgroup/stune/performance/schedtune.boost
+
+6. Move tasks into the boost group
+
+   For example, the following moves the tasks with PID $TASKPID (and all its
+   threads) into the "performance" boost group.
+
+   root@linaro-nano:~# echo $TASKPID > /sys/fs/cgroup/stune/performance/cgroup.procs
+
+This simple configuration allows only the threads of the $TASKPID task to run,
+when needed, at the highest OPP on the most capable CPU of the system.
+
+
+6. Questions and Answers
+========================
+
+What about "auto" mode?
+-----------------------
+
+The 'auto' mode as described in [5] can be implemented by interfacing
+SchedTune with some suitable user-space element. This element could use the
+exposed system-wide or cgroup based interface.
+
+How are multiple groups of tasks with different boost values managed?
+---------------------------------------------------------------------
+
+The current SchedTune implementation keeps track of the boosted RUNNABLE
+tasks on a CPU. Once sched-DVFS selects the OPP to run a CPU at, the CPU
+utilization is boosted with a value which is the maximum of the boost values
+of the currently RUNNABLE tasks in its RQ.
+
+This allows sched-DVFS to boost a CPU only while there are boosted tasks
+ready to run and to switch back to the energy efficient mode as soon as the
+last boosted task is dequeued.
+
+
+7. References
+=============
+[1] http://lwn.net/Articles/552889
+[2] http://lkml.org/lkml/2012/5/18/91
+[3] http://lkml.org/lkml/2015/6/26/620

From 344f4ec4293b75d83282eb13ede965995e5f312f Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Mon, 22 Jun 2015 18:11:44 +0100
Subject: [PATCH 0103/3220] sched/tune: add sysctl interface to define a boost value

The current (CFS) scheduler implementation does not allow "boosting"
task performance by running tasks at a higher OPP compared to the
minimum required to meet their workload demands.

To support task performance boosting, the scheduler should provide a
"knob" which allows tuning how much the system is going to be optimised
for energy efficiency vs performance.

This patch is the first of a series which provides a simple interface to
define a tuning knob. One system-wide "boost" tunable is exposed via:

  /proc/sys/kernel/sched_cfs_boost

which can be configured in the range [0..100], to define a percentage
where:

  - 0% boost requires operating in "standard" mode by scheduling
    tasks at the minimum capacities required by the workload demand
  - 100% boost requires pushing task performance to the maximum,
    "regardless" of the incurred energy consumption

A boost value in between these two boundaries is used to bias the
power/performance trade-off; the higher the boost value, the more the
scheduler is biased toward performance boosting instead of energy
efficiency.
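
For illustration only (the value is arbitrary), a system-wide 25% boost
could then be requested with:

  echo 25 > /proc/sys/kernel/sched_cfs_boost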
cc: Ingo Molnar
cc: Peter Zijlstra

Signed-off-by: Patrick Bellasi
---
 include/linux/sched/sysctl.h | 16 ++++++++++++++++
 init/Kconfig                 | 26 ++++++++++++++++++++++++++
 kernel/sched/Makefile        |  1 +
 kernel/sched/tune.c          | 16 ++++++++++++++++
 kernel/sysctl.c              | 11 +++++++++++
 5 files changed, 70 insertions(+)
 create mode 100644 kernel/sched/tune.c

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index c9e4731cf10b..4479e48c7712 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -77,6 +77,22 @@ extern int sysctl_sched_rt_runtime;
 extern unsigned int sysctl_sched_cfs_bandwidth_slice;
 #endif
 
+#ifdef CONFIG_SCHED_TUNE
+extern unsigned int sysctl_sched_cfs_boost;
+int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
+				   void __user *buffer, size_t *length,
+				   loff_t *ppos);
+static inline unsigned int get_sysctl_sched_cfs_boost(void)
+{
+	return sysctl_sched_cfs_boost;
+}
+#else
+static inline unsigned int get_sysctl_sched_cfs_boost(void)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_SCHED_AUTOGROUP
 extern unsigned int sysctl_sched_autogroup_enabled;
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index 235c7a2c0d20..83d8c02e7a57 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1237,6 +1237,32 @@ config SCHED_AUTOGROUP
 	  desktop applications.  Task group autogeneration is currently based
 	  upon task session.
 
+config SCHED_TUNE
+	bool "Boosting for CFS tasks (EXPERIMENTAL)"
+	help
+	  This option enables the system-wide support for task boosting.
+	  When this support is enabled a new sysctl interface is exposed to
+	  userspace via:
+	     /proc/sys/kernel/sched_cfs_boost
+	  which allows setting a system-wide boost value in the range
+	  [0..100].
+
+	  The current boosting strategy is implemented in such a way that:
+	  - a 0% boost value requires operating in "standard" mode by
+	    scheduling all tasks at the minimum capacities required by their
+	    workload demand
+	  - a 100% boost value requires pushing task performance to the
+	    maximum, "regardless" of the incurred energy consumption
+
+	  A boost value in between these two boundaries is used to bias the
+	  power/performance trade-off; the higher the boost value, the more
+	  the scheduler is biased toward performance boosting instead of
+	  energy efficiency.
+
+	  Since this support exposes a single system-wide knob, the specified
+	  boost value is applied to all (CFS) tasks in the system.
+
+	  If unsure, say N.
+
 config SYSFS_DEPRECATED
 	bool "Enable deprecated sysfs features to support old userspace tools"
 	depends on SYSFS
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 0eabc9db4c3d..c6a85f813dfd 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -18,5 +18,6 @@ obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
+obj-$(CONFIG_SCHED_TUNE) += tune.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
 obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
new file mode 100644
index 000000000000..a93af9c2f267
--- /dev/null
+++ b/kernel/sched/tune.c
@@ -0,0 +1,16 @@
+#include "sched.h"
+
+unsigned int sysctl_sched_cfs_boost __read_mostly;
+
+int
+sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
+			       void __user *buffer, size_t *lenp,
+			       loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	return 0;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index dc6858d6639e..827ba256fdad 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -434,6 +434,17 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &one,
 	},
 #endif
+#ifdef CONFIG_SCHED_TUNE
+	{
+		.procname	= "sched_cfs_boost",
+		.data		= &sysctl_sched_cfs_boost,
+		.maxlen		= sizeof(sysctl_sched_cfs_boost),
+		.mode		= 0644,
+		.proc_handler	= &sysctl_sched_cfs_boost_handler,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",

From 6ed27145013d97e49d7e71a56601033cccee743c Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Mon, 22 Jun 2015 18:32:36 +0100
Subject: [PATCH 0104/3220] sched/fair: add function to convert boost value into "margin"

The basic idea of the boost knob is to "artificially inflate" a signal
to make a task or logical CPU appear more demanding than it actually is.

Independently of the specific signal, a consistent and possibly simple
semantic for the concept of "signal boosting" must define:
1. how we translate the boost percentage into a "margin" value to be
   added to the original signal to inflate
2. what is the meaning of a boost value from a user-space perspective

This patch provides the implementation of a possible boost semantic,
named "Signal Proportional Compensation" (SPC), where the boost
percentage (BP) is used to compute a margin (M) which is proportional
to the complement of the original signal (OS):

  M = BP * (SCHED_LOAD_SCALE - OS)

The computed margin is then added to the OS to obtain the Boosted
Signal (BS):

  BS = OS + M

The proposed boost semantic has these main features:
- each signal gets a boost which is proportional to its delta with
  respect to the maximum available capacity in the system
  (i.e. SCHED_LOAD_SCALE)
- a 100% boosting has a clear meaning from a user-space perspective,
  since it means simply to run (possibly) "all" tasks at the max OPP
- each boosting value means to improve the task performance by a
  quantity which is proportional to the maximum achievable performance
  on that system

Thus this semantic somehow forces a behaviour where 50% boosting means
running half-way between the current and the maximum performance which
a task could achieve on that system.

This patch provides the code to implement a fast integer division to
convert a boost percentage (BP) value into a margin (M).
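
As a quick sanity check of the constants below (with illustrative numbers,
assuming SCHED_LOAD_SCALE = 1024): for BP = 50 and OS = 512 the pre-division
margin is 50 * (1024 - 512) = 25600, and the fast path computes
(25600 * 1311) >> 17 = 256, matching the exact 25600 / 100 = 256.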
NOTE: this code is suitable for all signals operating in range [0..SCHED_LOAD_SCALE] cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 651e160cdd67..1b6a57837433 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5074,6 +5074,44 @@ static bool cpu_overutilized(int cpu) return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); } +#ifdef CONFIG_SCHED_TUNE + +static unsigned long +schedtune_margin(unsigned long signal, unsigned long boost) +{ + unsigned long long margin = 0; + + /* + * Signal proportional compensation (SPC) + * + * The Boost (B) value is used to compute a Margin (M) which is + * proportional to the complement of the original Signal (S): + * M = B * (SCHED_LOAD_SCALE - S) + * The obtained M could be used by the caller to "boost" S. + */ + margin = SCHED_LOAD_SCALE - signal; + margin *= boost; + + /* + * Fast integer division by constant: + * Constant : (C) = 100 + * Precision : 0.1% (P) = 0.1 + * Reference : C * 100 / P (R) = 100000 + * + * Thus: + * Shift bits : ceil(log(R,2)) (S) = 17 + * Mult const : round(2^S/C) (M) = 1311 + * + * + */ + margin *= 1311; + margin >>= 17; + + return margin; +} + +#endif /* CONFIG_SCHED_TUNE */ + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. From 07e22949de7116dfac9098a46109c30776ead625 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 26 Jun 2015 09:55:06 +0100 Subject: [PATCH 0105/3220] sched/fair: add boosted CPU usage The CPU usage signal is used by the scheduler as an estimation of the overall bandwidth currently allocated on a CPU. When SchedDVFS is in use, this signal affects the selection of the operating points (OPP) required to accommodate all the workload allocated in a CPU. A convenient way to boost the performance of tasks running on a CPU, which is also little intrusive, is to boost the CPU usage signal each time it is used to select an OPP. This patch introduces a new function: get_boosted_cpu_usage(cpu) to return a boosted value for the usage of a specified CPU. The margin added to the original usage is: 1. computed based on the "boosting strategy" in use 2. proportional to the system-wide boost value defined by provided user-space interface The boosted signal is used by SchedDVFS (transparently) each time it requires to get an estimation of the capacity required for a CPU. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1b6a57837433..eff548fae43e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4144,6 +4144,8 @@ static inline void hrtick_update(struct rq *rq) } #endif +static inline unsigned long boosted_cpu_util(int cpu); + static void update_capacity_of(int cpu) { unsigned long req_cap; @@ -4152,7 +4154,8 @@ static void update_capacity_of(int cpu) return; /* Convert scale-invariant capacity to cpu. 
*/ - req_cap = cpu_util(cpu) * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); + req_cap = boosted_cpu_util(cpu); + req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, req_cap); } @@ -5110,8 +5113,36 @@ schedtune_margin(unsigned long signal, unsigned long boost) return margin; } +static inline unsigned int +schedtune_cpu_margin(unsigned long util) +{ + unsigned int boost = get_sysctl_sched_cfs_boost(); + + if (boost == 0) + return 0; + + return schedtune_margin(util, boost); +} + +#else /* CONFIG_SCHED_TUNE */ + +static inline unsigned int +schedtune_cpu_margin(unsigned long util) +{ + return 0; +} + #endif /* CONFIG_SCHED_TUNE */ +static inline unsigned long +boosted_cpu_util(int cpu) +{ + unsigned long util = cpu_util(cpu); + unsigned long margin = schedtune_cpu_margin(util); + + return util + margin; +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. From 13001f47c9a610705219700af4636386b647e231 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 23 Jun 2015 09:17:54 +0100 Subject: [PATCH 0106/3220] sched/tune: add initial support for CGroups based boosting To support task performance boosting, the usage of a single knob has the advantage to be a simple solution, both from the implementation and the usability standpoint. However, on a real system it can be difficult to identify a single value for the knob which fits the needs of multiple different tasks. For example, some kernel threads and/or user-space background services should be better managed the "standard" way while we still want to be able to boost the performance of specific workloads. In order to improve the flexibility of the task boosting mechanism this patch is the first of a small series which extends the previous implementation to introduce a "per task group" support. This first patch introduces just the basic CGroups support, a new "schedtune" CGroups controller is added which allows to configure different boost value for different groups of tasks. To keep the implementation simple but still effective for a boosting strategy, the new controller: 1. allows only a two layer hierarchy 2. supports only a limited number of boost groups A two layer hierarchy allows to place each task either: a) in the root control group thus being subject to a system-wide boosting value b) in a child of the root group thus being subject to the specific boost value defined by that "boost group" The limited number of "boost groups" supported is mainly motivated by the observation that in a real system it could be useful to have only few classes of tasks which deserve different treatment. For example, background vs foreground or interactive vs low-priority. As an additional benefit, a limited number of boost groups allows also to have a simpler implementation especially for the code required to compute the boost value for CPUs which have runnable tasks belonging to different boost groups. 
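As a usage sketch only: with the controller introduced below, a boost group is a child directory of the controller root carrying a "schedtune.boost" attribute (the attribute name follows from the cftype added by this patch, while the mount point used here is an assumption):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	/* One child of the root group: a "class of tasks" to be boosted */
	if (mkdir("/sys/fs/cgroup/schedtune/interactive", 0755) &&
	    errno != EEXIST) {
		perror("mkdir");
		return 1;
	}

	/* Set a 90% boost for every task attached to this group */
	int fd = open("/sys/fs/cgroup/schedtune/interactive/schedtune.boost",
		      O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "90", 2) != 2)
		perror("write");
	close(fd);
	return 0;
}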
cc: Tejun Heo cc: Li Zefan cc: Johannes Weiner cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- include/linux/cgroup_subsys.h | 4 + init/Kconfig | 17 +++ kernel/sched/tune.c | 223 ++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 4 + 4 files changed, 248 insertions(+) diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 1a96fdaa33d5..e133705d794a 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -26,6 +26,10 @@ SUBSYS(cpu) SUBSYS(cpuacct) #endif +#if IS_ENABLED(CONFIG_CGROUP_SCHEDTUNE) +SUBSYS(schedtune) +#endif + #if IS_ENABLED(CONFIG_BLK_CGROUP) SUBSYS(io) #endif diff --git a/init/Kconfig b/init/Kconfig index 83d8c02e7a57..5d9097e2b805 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -999,6 +999,23 @@ config CGROUP_CPUACCT Provides a simple Resource Controller for monitoring the total CPU consumed by the tasks in a cgroup. +config CGROUP_SCHEDTUNE + bool "CFS tasks boosting cgroup subsystem (EXPERIMENTAL)" + depends on SCHED_TUNE + help + This option provides the "schedtune" controller which improves the + flexibility of the task boosting mechanism by introducing the support + to define "per task" boost values. + + This new controller: + 1. allows only a two layers hierarchy, where the root defines the + system-wide boost value and its direct childrens define each one a + different "class of tasks" to be boosted with a different value + 2. supports up to 16 different task classes, each one which could be + configured with a different boost value + + Say N if unsure. + config PAGE_COUNTER bool diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index a93af9c2f267..95bc8b87c6d4 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -1,7 +1,230 @@ +#include +#include +#include +#include +#include + #include "sched.h" unsigned int sysctl_sched_cfs_boost __read_mostly; +#ifdef CONFIG_CGROUP_SCHEDTUNE + +/* + * EAS scheduler tunables for task groups. + */ + +/* SchdTune tunables for a group of tasks */ +struct schedtune { + /* SchedTune CGroup subsystem */ + struct cgroup_subsys_state css; + + /* Boost group allocated ID */ + int idx; + + /* Boost value for tasks on that SchedTune CGroup */ + int boost; + +}; + +static inline struct schedtune *css_st(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct schedtune, css) : NULL; +} + +static inline struct schedtune *task_schedtune(struct task_struct *tsk) +{ + return css_st(task_css(tsk, schedtune_cgrp_id)); +} + +static inline struct schedtune *parent_st(struct schedtune *st) +{ + return css_st(st->css.parent); +} + +/* + * SchedTune root control group + * The root control group is used to defined a system-wide boosting tuning, + * which is applied to all tasks in the system. + * Task specific boost tuning could be specified by creating and + * configuring a child control group under the root one. + * By default, system-wide boosting is disabled, i.e. no boosting is applied + * to tasks which are not into a child control group. + */ +static struct schedtune +root_schedtune = { + .boost = 0, +}; + +/* + * Maximum number of boost groups to support + * When per-task boosting is used we still allow only limited number of + * boost groups for two main reasons: + * 1. on a real system we usually have only few classes of workloads which + * make sense to boost with different values (e.g. background vs foreground + * tasks, interactive vs low-priority tasks) + * 2. 
a limited number allows for a simpler and more memory/time efficient + * implementation especially for the computation of the per-CPU boost + * value + */ +#define BOOSTGROUPS_COUNT 4 + +/* Array of configured boostgroups */ +static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { + &root_schedtune, + NULL, +}; + +/* SchedTune boost groups + * Keep track of all the boost groups which impact on CPU, for example when a + * CPU has two RUNNABLE tasks belonging to two different boost groups and thus + * likely with different boost values. + * Since on each system we expect only a limited number of boost groups, here + * we use a simple array to keep track of the metrics required to compute the + * maximum per-CPU boosting value. + */ +struct boost_groups { + /* Maximum boost value for all RUNNABLE tasks on a CPU */ + unsigned boost_max; + struct { + /* The boost for tasks on that boost group */ + unsigned boost; + /* Count of RUNNABLE tasks on that boost group */ + unsigned tasks; + } group[BOOSTGROUPS_COUNT]; +}; + +/* Boost groups affecting each CPU in the system */ +DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); + +static u64 +boost_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->boost; +} + +static int +boost_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 boost) +{ + struct schedtune *st = css_st(css); + + if (boost < 0 || boost > 100) + return -EINVAL; + + st->boost = boost; + if (css == &root_schedtune.css) + sysctl_sched_cfs_boost = boost; + + return 0; +} + +static struct cftype files[] = { + { + .name = "boost", + .read_u64 = boost_read, + .write_u64 = boost_write, + }, + { } /* terminate */ +}; + +static int +schedtune_boostgroup_init(struct schedtune *st) +{ + /* Keep track of allocated boost groups */ + allocated_group[st->idx] = st; + + return 0; +} + +static int +schedtune_init(void) +{ + struct boost_groups *bg; + int cpu; + + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + memset(bg, 0, sizeof(struct boost_groups)); + } + + pr_info(" schedtune configured to support %d boost groups\n", + BOOSTGROUPS_COUNT); + return 0; +} + +static struct cgroup_subsys_state * +schedtune_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct schedtune *st; + int idx; + + if (!parent_css) { + schedtune_init(); + return &root_schedtune.css; + } + + /* Allow only single level hierachies */ + if (parent_css != &root_schedtune.css) { + pr_err("Nested SchedTune boosting groups not allowed\n"); + return ERR_PTR(-ENOMEM); + } + + /* Allow only a limited number of boosting groups */ + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) + if (!allocated_group[idx]) + break; + if (idx == BOOSTGROUPS_COUNT) { + pr_err("Trying to create more than %d SchedTune boosting groups\n", + BOOSTGROUPS_COUNT); + return ERR_PTR(-ENOSPC); + } + + st = kzalloc(sizeof(*st), GFP_KERNEL); + if (!st) + goto out; + + /* Initialize per CPUs boost group support */ + st->idx = idx; + if (schedtune_boostgroup_init(st)) + goto release; + + return &st->css; + +release: + kfree(st); +out: + return ERR_PTR(-ENOMEM); +} + +static void +schedtune_boostgroup_release(struct schedtune *st) +{ + /* Keep track of allocated boost groups */ + allocated_group[st->idx] = NULL; +} + +static void +schedtune_css_free(struct cgroup_subsys_state *css) +{ + struct schedtune *st = css_st(css); + + schedtune_boostgroup_release(st); + kfree(st); +} + +struct cgroup_subsys 
schedtune_cgrp_subsys = { .css_alloc = schedtune_css_alloc, .css_free = schedtune_css_free, .legacy_cftypes = files, .early_init = 1, }; + +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 827ba256fdad..98c0f6ef01f1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -439,7 +439,11 @@ static struct ctl_table kern_table[] = { .procname = "sched_cfs_boost", .data = &sysctl_sched_cfs_boost, .maxlen = sizeof(sysctl_sched_cfs_boost), +#ifdef CONFIG_CGROUP_SCHEDTUNE + .mode = 0444, +#else .mode = 0644, +#endif .proc_handler = &sysctl_sched_cfs_boost_handler, .extra1 = &zero, .extra2 = &one_hundred, From 9cd53fbeb2ad365aac06eebed2baee4984a283b0 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 12:31:35 +0000 Subject: [PATCH 0107/3220] sched/tune: compute and keep track of per CPU boost value When per-task boosting is enabled, we could have multiple RUNNABLE tasks which are concurrently scheduled on the same CPU but each one with a different boost value. For example, we could have a scenario like this: Task SchedTune CGroup Boost Value T1 root 0 T2 low-priority 10 T3 interactive 90 In these conditions we expect a CPU to be configured according to a proper "aggregation" of the required boost values for all the tasks currently scheduled on this CPU. A suitable aggregation function is one which tracks the MAX boost value for all the tasks RUNNABLE on a CPU. This approach makes it possible to always satisfy the most boost-demanding task while at the same time: a) boosting all the concurrently scheduled tasks, thus reducing potential co-scheduling side-effects on demanding tasks b) reducing the number of frequency switches requested of SchedDVFS, thus being more friendly to architectures with slow frequency switching times Every time a task enters/exits the RQ of a CPU the max boost value should be updated considering all the boost groups currently "affecting" that CPU, i.e. which have at least one RUNNABLE task currently allocated on that CPU. This patch introduces the required support to keep track of the boost groups currently affecting CPUs. Thanks to the limited number of boost groups, a small and memory-efficient per-CPU array of boost group values (cpu_boost_groups) is used, which is updated for each CPU entry by schedtune_boostgroup_update(), but only when a schedtune CGroup boost value is updated. However, this is expected to be a rare operation, perhaps done just once at system boot time.
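The aggregation itself is cheap; the following standalone sketch mirrors the per-CPU max tracking described above (BOOSTGROUPS_COUNT and the sample population follow the T1/T2/T3 example, and this is an illustration rather than the patch code):

#include <stdio.h>

#define BOOSTGROUPS_COUNT 4

struct group {
	unsigned int boost;	/* boost value configured for this group */
	unsigned int tasks;	/* RUNNABLE tasks of this group on the CPU */
};

/* Root group (index 0) is always active; other groups count only
 * while they have RUNNABLE tasks on this CPU. */
static unsigned int cpu_boost_max(const struct group *g)
{
	unsigned int max = g[0].boost;
	int idx;

	for (idx = 1; idx < BOOSTGROUPS_COUNT; idx++) {
		if (g[idx].tasks == 0)
			continue;
		if (g[idx].boost > max)
			max = g[idx].boost;
	}
	return max;
}

int main(void)
{
	/* T1 in root (boost 0), T2 in "low-priority" (boost 10, runnable);
	 * "interactive" (boost 90) has no RUNNABLE task, so it is ignored */
	const struct group g[BOOSTGROUPS_COUNT] = {
		{ .boost = 0,  .tasks = 1 },
		{ .boost = 10, .tasks = 1 },
		{ .boost = 90, .tasks = 0 },
		{ 0 },
	};

	printf("boost_max = %u\n", cpu_boost_max(g));	/* prints 10 */
	return 0;
}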
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 77 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 95bc8b87c6d4..f62386893725 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -97,6 +97,67 @@ struct boost_groups { /* Boost groups affecting each CPU in the system */ DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); +static void +schedtune_cpu_update(int cpu) +{ + struct boost_groups *bg; + unsigned boost_max; + int idx; + + bg = &per_cpu(cpu_boost_groups, cpu); + + /* The root boost group is always active */ + boost_max = bg->group[0].boost; + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) { + /* + * A boost group affects a CPU only if it has + * RUNNABLE tasks on that CPU + */ + if (bg->group[idx].tasks == 0) + continue; + boost_max = max(boost_max, bg->group[idx].boost); + } + + bg->boost_max = boost_max; +} + +static int +schedtune_boostgroup_update(int idx, int boost) +{ + struct boost_groups *bg; + int cur_boost_max; + int old_boost; + int cpu; + + /* Update per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + + /* + * Keep track of current boost values to compute the per CPU + * maximum only when it has been affected by the new value of + * the updated boost group + */ + cur_boost_max = bg->boost_max; + old_boost = bg->group[idx].boost; + + /* Update the boost value of this boost group */ + bg->group[idx].boost = boost; + + /* Check if this update increase current max */ + if (boost > cur_boost_max && bg->group[idx].tasks) { + bg->boost_max = boost; + continue; + } + + /* Check if this update has decreased current max */ + if (cur_boost_max == old_boost && old_boost > boost) + schedtune_cpu_update(cpu); + } + + return 0; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -118,6 +179,9 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, if (css == &root_schedtune.css) sysctl_sched_cfs_boost = boost; + /* Update CPU boost */ + schedtune_boostgroup_update(st->idx, st->boost); + return 0; } @@ -133,9 +197,19 @@ static struct cftype files[] = { static int schedtune_boostgroup_init(struct schedtune *st) { + struct boost_groups *bg; + int cpu; + /* Keep track of allocated boost groups */ allocated_group[st->idx] = st; + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + bg->group[st->idx].boost = 0; + bg->group[st->idx].tasks = 0; + } + return 0; } @@ -203,6 +277,9 @@ out: static void schedtune_boostgroup_release(struct schedtune *st) { + /* Reset this boost group */ + schedtune_boostgroup_update(st->idx, 0); + /* Keep track of allocated boost groups */ allocated_group[st->idx] = NULL; } From a515b887ece159dfc45010667c43bf4b834ee325 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 7 Jul 2015 15:33:20 +0100 Subject: [PATCH 0108/3220] sched/{fair,tune}: track RUNNABLE tasks impact on per CPU boost value When per-task boosting is enabled, every time a task enters/exits a CPU its boost value could impact the currently selected OPP for that CPU. Thus, the "aggregated" boost value for that CPU potentially needs to be updated to match the current maximum boost value among all the tasks currently RUNNABLE on that CPU. This patch introduces the required support to keep track of which boost groups are impacting a CPU. 
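For example, building on the T1/T2/T3 scenario above: when T3 (interactive, boost 90) wakes up on a CPU whose current boost_max is 10, the interactive group's RUNNABLE counter goes from 0 to 1, that group starts to affect the CPU, and the aggregation raises boost_max to 90; when T3 is dequeued the counter returns to 0 and boost_max falls back to 10. The bookkeeping works as follows.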
Each time a task is enqueued/dequeued to/from a CPU its boost group is used to increment a per-cpu counter of RUNNABLE tasks on that CPU. Only when the number of runnable tasks for a specific boost group becomes 1 or 0 the corresponding boost group changes its effects on that CPU, specifically: a) boost_group::tasks == 1: this boost group starts to impact the CPU b) boost_group::tasks == 0: this boost group stops to impact the CPU In each of these two conditions the aggregation function: sched_cpu_update(cpu) could be required to run in order to identify the new maximum boost value required for the CPU. The proposed patch minimizes the number of times the aggregation function is executed while still providing the required support to always boost a CPU to the maximum boost value required by all its currently RUNNABLE tasks. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 17 +++++++--- kernel/sched/tune.c | 82 +++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/tune.h | 23 +++++++++++++ 3 files changed, 118 insertions(+), 4 deletions(-) create mode 100644 kernel/sched/tune.h diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eff548fae43e..2dbe1ff0a90b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -34,6 +34,7 @@ #include #include "sched.h" +#include "tune.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -4210,6 +4211,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; + schedtune_enqueue_task(p, cpu_of(rq)); + /* * We want to potentially trigger a freq switch * request only for tasks that are waking up; this is @@ -4279,6 +4282,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { sub_nr_running(rq, 1); + schedtune_dequeue_task(p, cpu_of(rq)); /* * We want to potentially trigger a freq switch @@ -5114,10 +5118,15 @@ schedtune_margin(unsigned long signal, unsigned long boost) } static inline unsigned int -schedtune_cpu_margin(unsigned long util) +schedtune_cpu_margin(unsigned long util, int cpu) { - unsigned int boost = get_sysctl_sched_cfs_boost(); + unsigned int boost; +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_cpu_boost(cpu); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif if (boost == 0) return 0; @@ -5127,7 +5136,7 @@ schedtune_cpu_margin(unsigned long util) #else /* CONFIG_SCHED_TUNE */ static inline unsigned int -schedtune_cpu_margin(unsigned long util) +schedtune_cpu_margin(unsigned long util, int cpu) { return 0; } @@ -5138,7 +5147,7 @@ static inline unsigned long boosted_cpu_util(int cpu) { unsigned long util = cpu_util(cpu); - unsigned long margin = schedtune_cpu_margin(util); + unsigned long margin = schedtune_cpu_margin(util, cpu); return util + margin; } diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index f62386893725..540b945a01ce 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include "sched.h" @@ -158,6 +159,87 @@ schedtune_boostgroup_update(int idx, int boost) return 0; } +static inline void +schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) +{ + struct boost_groups *bg; + int tasks; + + bg = &per_cpu(cpu_boost_groups, cpu); + + /* Update boosted tasks count while avoiding to make it negative */ + if (task_count < 0 && bg->group[idx].tasks <= -task_count) + bg->group[idx].tasks = 0; + else + bg->group[idx].tasks += task_count; + + /* Boost group 
activation or deactivation on that RQ */ + tasks = bg->group[idx].tasks; + if (tasks == 1 || tasks == 0) + schedtune_cpu_update(cpu); +} + +/* + * NOTE: This function must be called while holding the lock on the CPU RQ + */ +void schedtune_enqueue_task(struct task_struct *p, int cpu) +{ + struct schedtune *st; + int idx; + + /* + * When a task is marked PF_EXITING by do_exit() it's going to be + * dequeued and enqueued multiple times in the exit path. + * Thus we avoid any further update, since we do not want to change + * CPU boosting while the task is exiting. + */ + if (p->flags & PF_EXITING) + return; + + /* Get task boost group */ + rcu_read_lock(); + st = task_schedtune(p); + idx = st->idx; + rcu_read_unlock(); + + schedtune_tasks_update(p, cpu, idx, 1); +} + +/* + * NOTE: This function must be called while holding the lock on the CPU RQ + */ +void schedtune_dequeue_task(struct task_struct *p, int cpu) +{ + struct schedtune *st; + int idx; + + /* + * When a task is marked PF_EXITING by do_exit() it's going to be + * dequeued and enqueued multiple times in the exit path. + * Thus we avoid any further update, since we do not want to change + * CPU boosting while the task is exiting. + * The last dequeue will be done by cgroup exit() callback. + */ + if (p->flags & PF_EXITING) + return; + + /* Get task boost group */ + rcu_read_lock(); + st = task_schedtune(p); + idx = st->idx; + rcu_read_unlock(); + + schedtune_tasks_update(p, cpu, idx, -1); +} + +int schedtune_cpu_boost(int cpu) +{ + struct boost_groups *bg; + + bg = &per_cpu(cpu_boost_groups, cpu); + return bg->boost_max; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h new file mode 100644 index 000000000000..561b5171a19b --- /dev/null +++ b/kernel/sched/tune.h @@ -0,0 +1,23 @@ + +#ifdef CONFIG_SCHED_TUNE + +#ifdef CONFIG_CGROUP_SCHEDTUNE + +int schedtune_cpu_boost(int cpu); + +void schedtune_enqueue_task(struct task_struct *p, int cpu); +void schedtune_dequeue_task(struct task_struct *p, int cpu); + +#else /* CONFIG_CGROUP_SCHEDTUNE */ + +#define schedtune_enqueue_task(task, cpu) do { } while (0) +#define schedtune_dequeue_task(task, cpu) do { } while (0) + +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + +#else /* CONFIG_SCHED_TUNE */ + +#define schedtune_enqueue_task(task, cpu) do { } while (0) +#define schedtune_dequeue_task(task, cpu) do { } while (0) + +#endif /* CONFIG_SCHED_TUNE */ From a8f65584fc948d646732c0f9533570f97091c78d Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:31:53 +0000 Subject: [PATCH 0109/3220] sched/fair: add boosted task utilization The task utilization signal, which is derived from PELT signals and properly scaled to be architecture and frequency invariant, is used by EAS as an estimation of the task requirements in terms of CPU bandwidth. When the energy aware scheduler is in use, this signal affects the CPU selection. Thus, a convenient way to bias that decision, which is also little intrusive, is to boost the task utilization signal each time it is required to support them. This patch introduces the new function: boosted_task_util(task) which returns a boosted value for the utilization of the specified task. The margin added to the original utilization is: 1. computed based on the "boosting strategy" in use 2. proportional to boost value defined either by the sysctl interface, when global boosting is in use, or the "taskgroup" value, when per-task boosting is enabled. 
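As a worked example, assuming SCHED_LOAD_SCALE = 1024: a task with a utilization of 256 whose boost group is configured at 50% gets a margin of 0.5 * (1024 - 256) = 384, so boosted_task_util() returns 256 + 384 = 640.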
The boosted signal is used by EAS a. transparently, via its integration into the task_fits() function b. explicitly, in the energy-aware wakeup path Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++-- kernel/sched/tune.c | 14 ++++++++++++++ kernel/sched/tune.h | 1 + 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2dbe1ff0a90b..f582d58daea5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5048,11 +5048,13 @@ static inline unsigned long task_util(struct task_struct *p) unsigned int capacity_margin = 1280; /* ~20% margin */ +static inline unsigned long boosted_task_util(struct task_struct *task); + static inline bool __task_fits(struct task_struct *p, int cpu, int util) { unsigned long capacity = capacity_of(cpu); - util += task_util(p); + util += boosted_task_util(p); return (capacity * 1024) > (util * capacity_margin); } @@ -5133,6 +5135,27 @@ schedtune_cpu_margin(unsigned long util, int cpu) return schedtune_margin(util, boost); } +static inline unsigned long +schedtune_task_margin(struct task_struct *task) +{ + unsigned int boost; + unsigned long util; + unsigned long margin; + +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_task_boost(task); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif + if (boost == 0) + return 0; + + util = task_util(task); + margin = schedtune_margin(util, boost); + + return margin; +} + #else /* CONFIG_SCHED_TUNE */ static inline unsigned int @@ -5141,6 +5164,12 @@ schedtune_cpu_margin(unsigned long util, int cpu) return 0; } +static inline unsigned int +schedtune_task_margin(struct task_struct *task) +{ + return 0; +} + #endif /* CONFIG_SCHED_TUNE */ static inline unsigned long @@ -5152,6 +5181,15 @@ boosted_cpu_util(int cpu) return util + margin; } +static inline unsigned long +boosted_task_util(struct task_struct *task) +{ + unsigned long util = task_util(task); + unsigned long margin = schedtune_task_margin(task); + + return util + margin; +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -5386,7 +5424,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. 
*/ - int new_util = cpu_util(i) + task_util(p); + int new_util = cpu_util(i) + boosted_task_util(p); if (new_util > capacity_orig_of(i)) continue; diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 540b945a01ce..87213861bde5 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -240,6 +240,20 @@ int schedtune_cpu_boost(int cpu) return bg->boost_max; } +int schedtune_task_boost(struct task_struct *p) +{ + struct schedtune *st; + int task_boost; + + /* Get task boost value */ + rcu_read_lock(); + st = task_schedtune(p); + task_boost = st->boost; + rcu_read_unlock(); + + return task_boost; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 561b5171a19b..d756ce7b06e0 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -4,6 +4,7 @@ #ifdef CONFIG_CGROUP_SCHEDTUNE int schedtune_cpu_boost(int cpu); +int schedtune_task_boost(struct task_struct *tsk); void schedtune_enqueue_task(struct task_struct *p, int cpu); void schedtune_dequeue_task(struct task_struct *p, int cpu); From 36967b2412fa7f356308c5104d95574e84ca03ef Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:35:13 +0000 Subject: [PATCH 0110/3220] sched/fair: keep track of energy/capacity variations The current EAS implementation does not allow boosting task performance, for example by running tasks at a higher OPP (or on a more capable CPU), even if that could require a "reasonable" increase in energy consumption. To define how reasonable an energy increase is with respect to a required boost value, it is necessary to define and compute a trade-off between the expected energy and performance variations. However, the current EAS implementation considers only energy variations while completely disregarding the impact on performance for the selection of a certain schedule candidate. This patch extends the eenv energy environment to keep track of both the energy and performance deltas which are implied by the activation of a schedule candidate. The performance variation is estimated considering the different capacities of the CPUs on which the task could be scheduled. The idea is that while running on a CPU with higher capacity (e.g. a higher operating point) the task could (potentially) complete faster and thus get better performance.
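As a worked example with made-up cap_states values: moving a task from a little CPU running at a capacity of 430 to a big CPU that would run it at a capacity of 1024 yields cap.before = 430, cap.after = 1024 and cap.delta = +594, i.e. an estimated performance gain; the energy side of the trade-off is computed as before.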
Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f582d58daea5..3b80ad01aa8b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4706,6 +4706,16 @@ struct energy_env { int src_cpu; int dst_cpu; int energy; + struct { + int before; + int after; + int diff; + } nrg; + struct { + int before; + int after; + int delta; + } cap; }; /* @@ -4872,6 +4882,22 @@ static int sched_group_energy(struct energy_env *eenv) eenv->sg_cap = sg; cap_idx = find_new_capacity(eenv, sg->sge); + + if (sg->group_weight == 1) { + /* Remove capacity of src CPU (before task move) */ + if (eenv->util_delta == 0 && + cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) { + eenv->cap.before = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta -= eenv->cap.before; + } + /* Add capacity of dst CPU (after task move) */ + if (eenv->util_delta != 0 && + cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) { + eenv->cap.after = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta += eenv->cap.after; + } + } + idle_idx = group_idle_state(sg); group_util = group_norm_util(eenv, sg); sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) @@ -4920,6 +4946,8 @@ static int energy_diff(struct energy_env *eenv) .util_delta = 0, .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, + .nrg = { 0, 0, 0 }, + .cap = { 0, 0, 0 }, }; if (eenv->src_cpu == eenv->dst_cpu) @@ -4941,13 +4969,21 @@ static int energy_diff(struct energy_env *eenv) return 0; /* Invalid result abort */ energy_before += eenv_before.energy; + /* Keep track of SRC cpu (before) capacity */ + eenv->cap.before = eenv_before.cap.before; + eenv->cap.delta = eenv_before.cap.delta; + if (sched_group_energy(eenv)) return 0; /* Invalid result abort */ energy_after += eenv->energy; } } while (sg = sg->next, sg != sd->groups); - return energy_after-energy_before; + eenv->nrg.before = energy_before; + eenv->nrg.after = energy_after; + eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; + + return eenv->nrg.diff; } /* From 637ee3715a1a22c07995be181c0e2fca10d2df2a Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 12 Jan 2016 18:12:13 +0000 Subject: [PATCH 0111/3220] sched/tune: add support to compute normalized energy The current EAS implementation considers only energy variations, while it completely disregards the impact on performance for the selection of a certain schedule candidate. Moreover, it also makes its decision based on the "absolute" value of expected energy variations. In order to properly define a trade-off strategy between increased energy consumption and performance benefits it is necessary to compare energy variations with performance variations. Thus, both performance and energy metrics must be expressed in comparable units. While the performance variations are expressed in terms of capacity deltas, which are defined in the range [0..SCHED_LOAD_SCALE], the same scale is not used for energy variations. This patch introduces the function: schedtune_normalize_energy(energy_diff) which returns a normalized value in the same range as capacity variations, i.e. [0..SCHED_LOAD_SCALE]. A proper set of energy normalization constants is required to provide a fast division by a constant during the normalization of the energy_diff. The value of these constants depends on the specific energy model and topology of a target device.
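A minimal sketch of the normalization step, using plain integer division in place of the kernel's reciprocal_divide() and made-up power numbers; SCHED_LOAD_SHIFT = 10 (i.e. SCHED_LOAD_SCALE = 1024) is an assumption:

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10	/* assumption: SCHED_LOAD_SCALE == 1024 */

/* Plain-division stand-in for the reciprocal_divide() based kernel code */
static int normalize_energy(int energy_diff, unsigned long min_power,
			    unsigned long max_power)
{
	unsigned long range = max_power - min_power;
	unsigned long nrg = (energy_diff < 0) ? -energy_diff : energy_diff;

	nrg <<= SCHED_LOAD_SHIFT;	/* scale by energy magnitude */
	nrg /= range;			/* normalize on the platform range */

	return (energy_diff < 0) ? -(int)nrg : (int)nrg;
}

int main(void)
{
	/* made-up platform: system-wide power spans [120..5120] */
	printf("%d\n", normalize_energy(1250, 120, 5120));	/* prints 256 */
	return 0;
}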
Thus, this patch provides also the required support for the computation at boot time of this set of variables. Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 321 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/tune.h | 7 + 2 files changed, 328 insertions(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 87213861bde5..1a8ba5a6d99b 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -1,7 +1,9 @@ #include #include +#include #include #include +#include #include #include @@ -9,6 +11,84 @@ unsigned int sysctl_sched_cfs_boost __read_mostly; +/* + * System energy normalization constants + */ +static struct target_nrg { + unsigned long min_power; + unsigned long max_power; + struct reciprocal_value rdiv; +} schedtune_target_nrg; + +/* Performance Boost region (B) threshold params */ +static int perf_boost_idx; + +/* Performance Constraint region (C) threshold params */ +static int perf_constrain_idx; + +/** + * Performance-Energy (P-E) Space thresholds constants + */ +struct threshold_params { + int nrg_gain; + int cap_gain; +}; + +/* + * System specific P-E space thresholds constants + */ +static struct threshold_params +threshold_gains[] = { + { 0, 4 }, /* >= 0% */ + { 0, 4 }, /* >= 10% */ + { 1, 4 }, /* >= 20% */ + { 2, 4 }, /* >= 30% */ + { 3, 4 }, /* >= 40% */ + { 4, 3 }, /* >= 50% */ + { 4, 2 }, /* >= 60% */ + { 4, 1 }, /* >= 70% */ + { 4, 0 }, /* >= 80% */ + { 4, 0 } /* >= 90% */ +}; + +static int +__schedtune_accept_deltas(int nrg_delta, int cap_delta, + int perf_boost_idx, int perf_constrain_idx) +{ + int payoff = -INT_MAX; + + /* Performance Boost (B) region */ + if (nrg_delta > 0 && cap_delta > 0) { + /* + * Evaluate "Performance Boost" vs "Energy Increase" + * payoff criteria: + * cap_delta / nrg_delta < cap_gain / nrg_gain + * which is: + * nrg_delta * cap_gain > cap_delta * nrg_gain + */ + payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain; + payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain; + return payoff; + } + + /* Performance Constraint (C) region */ + if (nrg_delta < 0 && cap_delta < 0) { + /* + * Evaluate "Performance Boost" vs "Energy Increase" + * payoff criteria: + * cap_delta / nrg_delta > cap_gain / nrg_gain + * which is: + * cap_delta * nrg_gain > nrg_delta * cap_gain + */ + payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain; + payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain; + return payoff; + } + + /* Default: reject schedule candidate */ + return payoff; +} + #ifdef CONFIG_CGROUP_SCHEDTUNE /* @@ -26,6 +106,11 @@ struct schedtune { /* Boost value for tasks on that SchedTune CGroup */ int boost; + /* Performance Boost (B) region threshold params */ + int perf_boost_idx; + + /* Performance Constraint (C) region threshold params */ + int perf_constrain_idx; }; static inline struct schedtune *css_st(struct cgroup_subsys_state *css) @@ -55,8 +140,37 @@ static inline struct schedtune *parent_st(struct schedtune *st) static struct schedtune root_schedtune = { .boost = 0, + .perf_boost_idx = 0, + .perf_constrain_idx = 0, }; +int +schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task) +{ + struct schedtune *ct; + int perf_boost_idx; + int perf_constrain_idx; + + /* Optimal (O) region */ + if (nrg_delta < 0 && cap_delta > 0) + return INT_MAX; + + /* Suboptimal (S) region */ + if (nrg_delta > 0 && cap_delta < 0) + return -INT_MAX; + + /* Get task specific perf Boost/Constraints indexes */ + rcu_read_lock(); + ct = task_schedtune(task); + perf_boost_idx 
= ct->perf_boost_idx; + perf_constrain_idx = ct->perf_constrain_idx; + rcu_read_unlock(); + + return __schedtune_accept_deltas(nrg_delta, cap_delta, + perf_boost_idx, perf_constrain_idx); +} + /* * Maximum number of boost groups to support * When per-task boosting is used we still allow only limited number of @@ -396,6 +510,24 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .early_init = 1, }; +#else /* CONFIG_CGROUP_SCHEDTUNE */ + +int +schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task) +{ + /* Optimal (O) region */ + if (nrg_delta < 0 && cap_delta > 0) + return INT_MAX; + + /* Suboptimal (S) region */ + if (nrg_delta > 0 && cap_delta < 0) + return -INT_MAX; + + return __schedtune_accept_deltas(nrg_delta, cap_delta, + perf_boost_idx, perf_constrain_idx); +} + #endif /* CONFIG_CGROUP_SCHEDTUNE */ int @@ -408,5 +540,194 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, if (ret || !write) return ret; + /* Performance Boost (B) region threshold params */ + perf_boost_idx = sysctl_sched_cfs_boost; + perf_boost_idx /= 10; + + /* Performance Constraint (C) region threshold params */ + perf_constrain_idx = 100 - sysctl_sched_cfs_boost; + perf_constrain_idx /= 10; + return 0; } + +/* + * System energy normalization + * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], + * corresponding to the specified energy variation. + */ +int +schedtune_normalize_energy(int energy_diff) +{ + u32 normalized_nrg; + int max_delta; + +#ifdef CONFIG_SCHED_DEBUG + /* Check for boundaries */ + max_delta = schedtune_target_nrg.max_power; + max_delta -= schedtune_target_nrg.min_power; + WARN_ON(abs(energy_diff) >= max_delta); +#endif + + /* Do scaling using positive numbers to increase the range */ + normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; + + /* Scale by energy magnitude */ + normalized_nrg <<= SCHED_LOAD_SHIFT; + + /* Normalize on max energy for target platform */ + normalized_nrg = reciprocal_divide( + normalized_nrg, schedtune_target_nrg.rdiv); + + return (energy_diff < 0) ? 
-normalized_nrg : normalized_nrg; +} + +#ifdef CONFIG_SCHED_DEBUG +static void +schedtune_test_nrg(unsigned long delta_pwr) +{ + unsigned long test_delta_pwr; + unsigned long test_norm_pwr; + int idx; + + /* + * Check normalization constants using some constant system + * energy values + */ + pr_info("schedtune: verify normalization constants...\n"); + for (idx = 0; idx < 6; ++idx) { + test_delta_pwr = delta_pwr >> idx; + + /* Normalize on max energy for target platform */ + test_norm_pwr = reciprocal_divide( + test_delta_pwr << SCHED_LOAD_SHIFT, + schedtune_target_nrg.rdiv); + + pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n", + idx, test_delta_pwr, test_norm_pwr); + } +} +#else +#define schedtune_test_nrg(delta_pwr) +#endif + +/* + * Compute the min/max power consumption of a cluster and all its CPUs + */ +static void +schedtune_add_cluster_nrg( + struct sched_domain *sd, + struct sched_group *sg, + struct target_nrg *ste) +{ + struct sched_domain *sd2; + struct sched_group *sg2; + + struct cpumask *cluster_cpus; + char str[32]; + + unsigned long min_pwr; + unsigned long max_pwr; + int cpu; + + /* Get Cluster energy using EM data for the first CPU */ + cluster_cpus = sched_group_cpus(sg); + snprintf(str, 32, "CLUSTER[%*pbl]", + cpumask_pr_args(cluster_cpus)); + + min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power; + max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power; + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + str, min_pwr, max_pwr); + + /* + * Keep track of this cluster's energy in the computation of the + * overall system energy + */ + ste->min_power += min_pwr; + ste->max_power += max_pwr; + + /* Get CPU energy using EM data for each CPU in the group */ + for_each_cpu(cpu, cluster_cpus) { + /* Get a SD view for the specific CPU */ + for_each_domain(cpu, sd2) { + /* Get the CPU group */ + sg2 = sd2->groups; + min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power; + max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power; + + ste->min_power += min_pwr; + ste->max_power += max_pwr; + + snprintf(str, 32, "CPU[%d]", cpu); + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + str, min_pwr, max_pwr); + + /* + * Assume we have EM data only at the CPU and + * the upper CLUSTER level + */ + BUG_ON(!cpumask_equal( + sched_group_cpus(sg), + sched_group_cpus(sd2->parent->groups) + )); + break; + } + } +} + +/* + * Initialize the constants required to compute normalized energy. + * The values of these constants depends on the EM data for the specific + * target system and topology. + * Thus, this function is expected to be called by the code + * that bind the EM to the topology information. + */ +static int +schedtune_init_late(void) +{ + struct target_nrg *ste = &schedtune_target_nrg; + unsigned long delta_pwr = 0; + struct sched_domain *sd; + struct sched_group *sg; + + pr_info("schedtune: init normalization constants...\n"); + ste->max_power = 0; + ste->min_power = 0; + + rcu_read_lock(); + + /* + * When EAS is in use, we always have a pointer to the highest SD + * which provides EM data. 
+ */ + sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask))); + if (!sd) { + pr_info("schedtune: no energy model data\n"); + goto nodata; + } + + sg = sd->groups; + do { + schedtune_add_cluster_nrg(sd, sg, ste); + } while (sg = sg->next, sg != sd->groups); + + rcu_read_unlock(); + + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + "SYSTEM", ste->min_power, ste->max_power); + + /* Compute normalization constants */ + delta_pwr = ste->max_power - ste->min_power; + ste->rdiv = reciprocal_value(delta_pwr); + pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n", + ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2); + + schedtune_test_nrg(delta_pwr); + return 0; + +nodata: + rcu_read_unlock(); + return -EINVAL; +} +late_initcall(schedtune_init_late); diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index d756ce7b06e0..f7273a5d994a 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -16,9 +16,16 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #endif /* CONFIG_CGROUP_SCHEDTUNE */ +int schedtune_normalize_energy(int energy); +int schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task); + #else /* CONFIG_SCHED_TUNE */ #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) +#define schedtune_normalize_energy(energy) energy +#define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta + #endif /* CONFIG_SCHED_TUNE */ From a2a6dc750833243b3b49d016291ae1133196759a Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 15 Jan 2016 15:48:03 +0000 Subject: [PATCH 0112/3220] sched/fair: filter energy_diff() based on energy_payoff value Once the SchedTune support is enabled and the CPU bandwidth demand of a task is boosted, we can expect increased energy consumption which is balanced by a corresponding increase in task performance. However, the current implementation of the energy_diff() function accepts all and _only_ the schedule candidates which result in a reduced expected system energy, which works against the boosting strategy. This patch links the energy_diff() function with the "energy payoff" engine provided by SchedTune. The energy variation computed by the energy_diff() function is now filtered using the SchedTune support to evaluate the energy payoff for a boosted task. With this patch, the energy_diff() function reports as an "acceptable schedule candidate" only a schedule candidate which corresponds to a positive energy_payoff.
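As a worked example following the code above (the numbers are illustrative): with a 50% boost, perf_boost_idx is 50/10 = 5 and threshold_gains[5] = {4, 3} (nrg_gain, cap_gain); a boosted candidate with nrg_delta = 100 and cap_delta = 50 yields payoff = 100 * 3 - 50 * 4 = 100, and energy_diff() then returns -payoff = -100, i.e. an acceptable schedule candidate.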
Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3b80ad01aa8b..1191c4b1b119 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4706,9 +4706,12 @@ struct energy_env { int src_cpu; int dst_cpu; int energy; + int payoff; + struct task_struct *task; struct { int before; int after; + int delta; int diff; } nrg; struct { @@ -4929,6 +4932,44 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } +#ifdef CONFIG_SCHED_TUNE +static int energy_diff_evaluate(struct energy_env *eenv) +{ + unsigned int boost; + int nrg_delta; + + /* Return energy diff when boost margin is 0 */ +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_task_boost(eenv->task); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif + if (boost == 0) + return eenv->nrg.diff; + + /* Compute normalized energy diff */ + nrg_delta = schedtune_normalize_energy(eenv->nrg.diff); + eenv->nrg.delta = nrg_delta; + + eenv->payoff = schedtune_accept_deltas( + eenv->nrg.delta, + eenv->cap.delta, + eenv->task); + + /* + * When SchedTune is enabled, the energy_diff() function will return + * the computed energy payoff value. Since the energy_diff() return + * value is expected to be negative by its callers, this evaluation + * function return a negative value each time the evaluation return a + * positive payoff, which is the condition for the acceptance of + * a scheduling decision + */ + return -eenv->payoff; +} +#else /* CONFIG_SCHED_TUNE */ +#define energy_diff_evaluate(eenv) eenv->nrg.diff +#endif + /* * energy_diff(): Estimate the energy impact of changing the utilization * distribution. 
eenv specifies the change: utilisation amount, source, and @@ -4946,7 +4987,7 @@ static int energy_diff(struct energy_env *eenv) .util_delta = 0, .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, - .nrg = { 0, 0, 0 }, + .nrg = { 0, 0, 0, 0}, .cap = { 0, 0, 0 }, }; @@ -4982,8 +5023,9 @@ static int energy_diff(struct energy_env *eenv) eenv->nrg.before = energy_before; eenv->nrg.after = energy_after; eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; + eenv->payoff = 0; - return eenv->nrg.diff; + return energy_diff_evaluate(eenv); } /* @@ -5481,6 +5523,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) .util_delta = task_util(p), .src_cpu = task_cpu(p), .dst_cpu = target_cpu, + .task = p, }; /* Not enough spare capacity on previous cpu */ From 99ed4e57cb4231b2561ec4a6433722dcd4a19a9e Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:06:24 +0000 Subject: [PATCH 0113/3220] DEBUG: sched: add tracepoint for cpu/freq scale invariance Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 24 ++++++++++++++++++++++++ kernel/sched/fair.c | 1 + 2 files changed, 25 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9b90c57517a9..20a7216f8abb 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -562,6 +562,30 @@ TRACE_EVENT(sched_wake_idle_without_ipi, TP_printk("cpu=%d", __entry->cpu) ); + +TRACE_EVENT(sched_contrib_scale_f, + + TP_PROTO(int cpu, unsigned long freq_scale_factor, + unsigned long cpu_scale_factor), + + TP_ARGS(cpu, freq_scale_factor, cpu_scale_factor), + + TP_STRUCT__entry( + __field(int, cpu) + __field(unsigned long, freq_scale_factor) + __field(unsigned long, cpu_scale_factor) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->freq_scale_factor = freq_scale_factor; + __entry->cpu_scale_factor = cpu_scale_factor; + ), + + TP_printk("cpu=%d freq_scale_factor=%lu cpu_scale_factor=%lu", + __entry->cpu, __entry->freq_scale_factor, + __entry->cpu_scale_factor) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1191c4b1b119..ec2e8aecc4f3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2587,6 +2587,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, scale_freq = arch_scale_freq_capacity(NULL, cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu); /* delta_w is the amount already accumulated against our next period */ delta_w = sa->period_contrib; From 8017fd7418dc34d55d0e8bfc83e6c0ec10909c58 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:07:27 +0000 Subject: [PATCH 0114/3220] DEBUG: sched: add tracepoint for task load/util signals Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 43 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 3 +++ 2 files changed, 46 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 20a7216f8abb..99f3e648c197 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -586,6 +586,49 @@ TRACE_EVENT(sched_contrib_scale_f, __entry->cpu, __entry->freq_scale_factor, __entry->cpu_scale_factor) ); + +/* + * Tracepoint for accounting sched averages for tasks. 
+ */ +TRACE_EVENT(sched_load_avg_task, + + TP_PROTO(struct task_struct *tsk, struct sched_avg *avg), + + TP_ARGS(tsk, avg), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( unsigned long, load_avg ) + __field( unsigned long, util_avg ) + __field( u64, load_sum ) + __field( u32, util_sum ) + __field( u32, period_contrib ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->cpu = task_cpu(tsk); + __entry->load_avg = avg->load_avg; + __entry->util_avg = avg->util_avg; + __entry->load_sum = avg->load_sum; + __entry->util_sum = avg->util_sum; + __entry->period_contrib = avg->period_contrib; + ), + + TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu util_avg=%lu load_sum=%llu" + " util_sum=%u period_contrib=%u", + __entry->comm, + __entry->pid, + __entry->cpu, + __entry->load_avg, + __entry->util_avg, + (u64)__entry->load_sum, + (u32)__entry->util_sum, + (u32)__entry->period_contrib) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ec2e8aecc4f3..8cdb4e5592f9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2731,6 +2731,9 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); + + if (entity_is_task(se)) + trace_sched_load_avg_task(task_of(se), &se->avg); } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) From e5a25996d3a3f46f3abee77f868b922903f5f60e Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:07:48 +0000 Subject: [PATCH 0115/3220] DEBUG: sched: add tracepoint for CPU load/util signals Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 25 +++++++++++++++++++++++++ kernel/sched/fair.c | 1 + 2 files changed, 26 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 99f3e648c197..f58760c2f712 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -629,6 +629,31 @@ TRACE_EVENT(sched_load_avg_task, (u32)__entry->util_sum, (u32)__entry->period_contrib) ); + +/* + * Tracepoint for accounting sched averages for cpus. 
+ */ +TRACE_EVENT(sched_load_avg_cpu, + + TP_PROTO(int cpu, struct cfs_rq *cfs_rq), + + TP_ARGS(cpu, cfs_rq), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( unsigned long, load_avg ) + __field( unsigned long, util_avg ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->load_avg = cfs_rq->avg.load_avg; + __entry->util_avg = cfs_rq->avg.util_avg; + ), + + TP_printk("cpu=%d load_avg=%lu util_avg=%lu", + __entry->cpu, __entry->load_avg, __entry->util_avg) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8cdb4e5592f9..b5ea9ec335d8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2734,6 +2734,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (entity_is_task(se)) trace_sched_load_avg_task(task_of(se), &se->avg); + trace_sched_load_avg_cpu(cpu, cfs_rq); } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) From dd2460f3872f1b89839aedb21b999e802542c32b Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 30 Apr 2015 17:35:23 +0100 Subject: [PATCH 0116/3220] DEBUG: sched,cpufreq: add cpu_capacity change tracepoint This is useful when we want to compare cpu utilization and cpu curr capacity side by side. Signed-off-by: Juri Lelli --- drivers/cpufreq/cpufreq.c | 4 ++++ include/linux/sched.h | 2 ++ include/trace/events/power.h | 7 +++++++ kernel/sched/fair.c | 11 +++++++++++ kernel/sched/sched.h | 11 ----------- 5 files changed, 24 insertions(+), 11 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 5ca8e9517aae..0f30d11fb528 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -29,6 +29,7 @@ #include #include #include +#include #include static LIST_HEAD(cpufreq_policy_list); @@ -473,6 +474,7 @@ static void cpufreq_notify_post_transition(struct cpufreq_policy *policy, void cpufreq_freq_transition_begin(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) { + int cpu; /* * Catch double invocations of _begin() which lead to self-deadlock. @@ -501,6 +503,8 @@ wait: spin_unlock(&policy->transition_lock); scale_freq_capacity(policy, freqs); + for_each_cpu(cpu, policy->cpus) + trace_cpu_capacity(capacity_curr_of(cpu), cpu); cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 1460c6a4f65f..b9b11b2dbabd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1046,6 +1046,8 @@ struct sched_group_energy { struct capacity_state *cap_states; /* ptr to capacity state array */ }; +unsigned long capacity_curr_of(int cpu); + struct sched_group; struct sched_domain { diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 284244ebfe8d..f4be04e44252 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -120,6 +120,13 @@ DEFINE_EVENT(cpu, cpu_frequency, TP_ARGS(frequency, cpu_id) ); +DEFINE_EVENT(cpu, cpu_capacity, + + TP_PROTO(unsigned int capacity, unsigned int cpu_id), + + TP_ARGS(capacity, cpu_id) +); + TRACE_EVENT(device_pm_callback_start, TP_PROTO(struct device *dev, const char *pm_ops, int event), diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b5ea9ec335d8..9b74b86aba3a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4698,6 +4698,17 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. 
+ */ +unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 983c132af11b..db3f3db9849c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1512,17 +1512,6 @@ static inline unsigned long cpu_util(int cpu) return __cpu_util(cpu, 0); } -/* - * Returns the current capacity of cpu after applying both - * cpu and freq scaling. - */ -static inline unsigned long capacity_curr_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig * - arch_scale_freq_capacity(NULL, cpu) - >> SCHED_CAPACITY_SHIFT; -} - #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHED From 0a0f4aa12f6ec50e8b5d917b773e29702714acad Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 16:25:50 +0000 Subject: [PATCH 0117/3220] DEBUG: sched: add energy procfs interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch makes the energy data available via procfs. The related files are placed as sub-directory named 'energy' inside the /proc/sys/kernel/sched_domain/cpuX/domainY/groupZ directory for those cpu/domain/group tuples which have energy information. The following example depicts the contents of /proc/sys/kernel/sched_domain/cpu0/domain0/group[01] for a system which has energy information attached to domain level 0. ├── cpu0 │ ├── domain0 │ │ ├── busy_factor │ │ ├── busy_idx │ │ ├── cache_nice_tries │ │ ├── flags │ │ ├── forkexec_idx │ │ ├── group0 │ │ │ └── energy │ │ │ ├── cap_states │ │ │ ├── idle_states │ │ │ ├── nr_cap_states │ │ │ └── nr_idle_states │ │ ├── group1 │ │ │ └── energy │ │ │ ├── cap_states │ │ │ ├── idle_states │ │ │ ├── nr_cap_states │ │ │ └── nr_idle_states │ │ ├── idle_idx │ │ ├── imbalance_pct │ │ ├── max_interval │ │ ├── max_newidle_lb_cost │ │ ├── min_interval │ │ ├── name │ │ ├── newidle_idx │ │ └── wake_idx │ └── domain1 │ ├── busy_factor │ ├── busy_idx │ ├── cache_nice_tries │ ├── flags │ ├── forkexec_idx │ ├── idle_idx │ ├── imbalance_pct │ ├── max_interval │ ├── max_newidle_lb_cost │ ├── min_interval │ ├── name │ ├── newidle_idx │ └── wake_idx The files 'nr_idle_states' and 'nr_cap_states' contain a scalar value whereas 'idle_states' and 'cap_states' contain a vector of power consumption at this idle state respectively (compute capacity, power consumption) at this capacity state. 
Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 67 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 151ed9c62ad2..9e25e68747d7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5412,10 +5412,61 @@ set_table_entry(struct ctl_table *entry, } } +static struct ctl_table * +sd_alloc_ctl_energy_table(struct sched_group_energy *sge) +{ + struct ctl_table *table = sd_alloc_ctl_entry(5); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "nr_idle_states", &sge->nr_idle_states, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[1], "idle_states", &sge->idle_states[0].power, + sge->nr_idle_states*sizeof(struct idle_state), 0644, + proc_doulongvec_minmax, false); + set_table_entry(&table[2], "nr_cap_states", &sge->nr_cap_states, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[3], "cap_states", &sge->cap_states[0].cap, + sge->nr_cap_states*sizeof(struct capacity_state), 0644, + proc_doulongvec_minmax, false); + + return table; +} + +static struct ctl_table * +sd_alloc_ctl_group_table(struct sched_group *sg) +{ + struct ctl_table *table = sd_alloc_ctl_entry(2); + + if (table == NULL) + return NULL; + + table->procname = kstrdup("energy", GFP_KERNEL); + table->mode = 0555; + table->child = sd_alloc_ctl_energy_table((struct sched_group_energy *)sg->sge); + + return table; +} + static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(14); + struct ctl_table *table; + unsigned int nr_entries = 14; + + int i = 0; + struct sched_group *sg = sd->groups; + + if (sg->sge) { + int nr_sgs = 0; + + do {} while (nr_sgs++, sg = sg->next, sg != sd->groups); + + nr_entries += nr_sgs; + } + + table = sd_alloc_ctl_entry(nr_entries); if (table == NULL) return NULL; @@ -5448,7 +5499,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ + sg = sd->groups; + if (sg->sge) { + char buf[32]; + struct ctl_table *entry = &table[13]; + + do { + snprintf(buf, 32, "group%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_group_table(sg); + } while (entry++, i++, sg = sg->next, sg != sd->groups); + } + /* &table[nr_entries-1] is terminator */ return table; } From c8a65d2e9a3a6ba9d4d7b1634701dd34017f4107 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 13:49:07 +0100 Subject: [PATCH 0118/3220] DEBUG: schedtune: add tracepoint for SchedTune configuration update Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 21 +++++++++++++++++++++ kernel/sched/tune.c | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index f58760c2f712..8ae43a2ebfda 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -654,6 +654,27 @@ TRACE_EVENT(sched_load_avg_cpu, TP_printk("cpu=%d load_avg=%lu util_avg=%lu", __entry->cpu, __entry->load_avg, __entry->util_avg) ); + +/* + * Tracepoint for sched_tune_config settings + */ +TRACE_EVENT(sched_tune_config, + + TP_PROTO(int boost), + + TP_ARGS(boost), + + TP_STRUCT__entry( + __field( int, boost ) + ), + + TP_fast_assign( + __entry->boost = boost; + ), + + TP_printk("boost=%d ", __entry->boost) 
+); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 1a8ba5a6d99b..f5f4c57efb9e 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -7,6 +7,8 @@ #include #include +#include + #include "sched.h" unsigned int sysctl_sched_cfs_boost __read_mostly; @@ -392,6 +394,8 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, /* Update CPU boost */ schedtune_boostgroup_update(st->idx, st->boost); + trace_sched_tune_config(st->boost); + return 0; } From a09a25c5dfc81d08df7b3c2e76ea94a651c17ac2 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 13:51:07 +0100 Subject: [PATCH 0119/3220] DEBUG: schedtune: add tracepoint for CPU boost signal Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 27 +++++++++++++++++++++++++++ kernel/sched/fair.c | 2 ++ 2 files changed, 29 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 8ae43a2ebfda..eb812071458b 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -675,6 +675,33 @@ TRACE_EVENT(sched_tune_config, TP_printk("boost=%d ", __entry->boost) ); +/* + * Tracepoint for accounting CPU boosted utilization + */ +TRACE_EVENT(sched_boost_cpu, + + TP_PROTO(int cpu, unsigned long util, unsigned long margin), + + TP_ARGS(cpu, util, margin), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( unsigned long, util ) + __field( unsigned long, margin ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->util = util; + __entry->margin = margin; + ), + + TP_printk("cpu=%d util=%lu margin=%lu", + __entry->cpu, + __entry->util, + __entry->margin) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9b74b86aba3a..3bc56e0ea1ab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5272,6 +5272,8 @@ boosted_cpu_util(int cpu) unsigned long util = cpu_util(cpu); unsigned long margin = schedtune_cpu_margin(util, cpu); + trace_sched_boost_cpu(cpu, util, margin); + return util + margin; } From fed95d9ef1b2674c4a2cda8b2445ca7ef3f6ca59 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 24 Jun 2015 15:36:08 +0100 Subject: [PATCH 0120/3220] DEBUG: schedtune: add tracepoint for schedtune_tasks_update() values Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 62 ++++++++++++++++++++++++++++++++++++ kernel/sched/tune.c | 12 ++++++- 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index eb812071458b..7ec9dcfc701a 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -702,6 +702,68 @@ TRACE_EVENT(sched_boost_cpu, __entry->margin) ); +/* + * Tracepoint for schedtune_tasks_update + */ +TRACE_EVENT(sched_tune_tasks_update, + + TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx, + unsigned int boost, unsigned int max_boost), + + TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( int, tasks ) + __field( int, idx ) + __field( unsigned int, boost ) + __field( unsigned int, max_boost ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->cpu = cpu; + __entry->tasks = tasks; + __entry->idx = idx; + __entry->boost = boost; + __entry->max_boost = max_boost; + ), + + TP_printk("pid=%d comm=%s " + 
"cpu=%d tasks=%d idx=%d boost=%u max_boost=%u", + __entry->pid, __entry->comm, + __entry->cpu, __entry->tasks, __entry->idx, + __entry->boost, __entry->max_boost) +); + +/* + * Tracepoint for schedtune_boostgroup_update + */ +TRACE_EVENT(sched_tune_boostgroup_update, + + TP_PROTO(int cpu, int variation, int max_boost), + + TP_ARGS(cpu, variation, max_boost), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( int, variation ) + __field( int, max_boost ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->variation = variation; + __entry->max_boost = max_boost; + ), + + TP_printk("cpu=%d variation=%d max_boost=%d", + __entry->cpu, __entry->variation, __entry->max_boost) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index f5f4c57efb9e..7a434f2394e7 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -264,12 +264,18 @@ schedtune_boostgroup_update(int idx, int boost) /* Check if this update increase current max */ if (boost > cur_boost_max && bg->group[idx].tasks) { bg->boost_max = boost; + trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max); continue; } /* Check if this update has decreased current max */ - if (cur_boost_max == old_boost && old_boost > boost) + if (cur_boost_max == old_boost && old_boost > boost) { schedtune_cpu_update(cpu); + trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max); + continue; + } + + trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max); } return 0; @@ -293,6 +299,10 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) tasks = bg->group[idx].tasks; if (tasks == 1 || tasks == 0) schedtune_cpu_update(cpu); + + trace_sched_tune_tasks_update(p, cpu, tasks, idx, + bg->group[idx].boost, bg->boost_max); + } /* From e3b53389cf2313172034123a84d760984c4efd71 Mon Sep 17 00:00:00 2001 From: Lianwei Wang Date: Fri, 6 May 2016 00:17:57 -0700 Subject: [PATCH 0121/3220] Revert "drivers: power: Add watchdog timer to catch drivers which lockup during suspend." This reverts commit ad86cc8ad63229eeeba0628e99f2f59df55a25fd. Commit 70fea60d888d ("PM / Sleep: Detect device suspend/resume lockup...") added a suspend/resume watchdog timer to catch the lockup. Let's revert the duplicate one. Change-Id: Ic72a87432e27844155467817600adc6cf0c2209c Signed-off-by: Lianwei Wang Signed-off-by: Amit Pundir --- drivers/base/power/main.c | 43 --------------------------------------- 1 file changed, 43 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 6ed8b9326629..2f7a6e1b568c 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -62,12 +62,6 @@ struct suspend_stats suspend_stats; static DEFINE_MUTEX(dpm_list_mtx); static pm_message_t pm_transition; -static void dpm_drv_timeout(unsigned long data); -struct dpm_drv_wd_data { - struct device *dev; - struct task_struct *tsk; -}; - static int async_error; static char *pm_verb(int event) @@ -838,30 +832,6 @@ static void async_resume(void *data, async_cookie_t cookie) put_device(dev); } -/** - * dpm_drv_timeout - Driver suspend / resume watchdog handler - * @data: struct device which timed out - * - * Called when a driver has timed out suspending or resuming. 
- * There's not much we can do here to recover so - * BUG() out for a crash-dump - * - */ -static void dpm_drv_timeout(unsigned long data) -{ - struct dpm_drv_wd_data *wd_data = (void *)data; - struct device *dev = wd_data->dev; - struct task_struct *tsk = wd_data->tsk; - - printk(KERN_EMERG "**** DPM device timeout: %s (%s)\n", dev_name(dev), - (dev->driver ? dev->driver->name : "no driver")); - - printk(KERN_EMERG "dpm suspend stack:\n"); - show_stack(tsk, NULL); - - BUG(); -} - /** * dpm_resume - Execute "resume" callbacks for non-sysdev devices. * @state: PM transition of the system being carried out. @@ -1380,8 +1350,6 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) pm_callback_t callback = NULL; char *info = NULL; int error = 0; - struct timer_list timer; - struct dpm_drv_wd_data data; char suspend_abort[MAX_SUSPEND_ABORT_LEN]; DECLARE_DPM_WATCHDOG_ON_STACK(wd); @@ -1412,14 +1380,6 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) if (dev->power.syscore) goto Complete; - - data.dev = dev; - data.tsk = current; - init_timer_on_stack(&timer); - timer.expires = jiffies + HZ * 12; - timer.function = dpm_drv_timeout; - timer.data = (unsigned long)&data; - add_timer(&timer); if (dev->power.direct_complete) { if (pm_runtime_status_suspended(dev)) { @@ -1500,9 +1460,6 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) device_unlock(dev); dpm_watchdog_clear(&wd); - del_timer_sync(&timer); - destroy_timer_on_stack(&timer); - Complete: complete_all(&dev->power.completion); if (error) From e9253e876087accdab8edb763cfa4c3a48a8ce66 Mon Sep 17 00:00:00 2001 From: Jimmy Perchet Date: Mon, 9 May 2016 10:32:04 -0700 Subject: [PATCH 0122/3220] FROMLIST: wlcore: Disable filtering in AP role When you configure (set up) a STA interface, the driver installs a multicast filter. This is normal behavior: when an application subscribes to a multicast address, the filter is updated. When an Access Point interface is configured, there is no filter installation and the "filter update" path is disabled in the driver. The problem happens when you switch an interface from STA type to AP type. The filter is installed but there are no means to update it. Change-Id: Ied22323af831575303abd548574918baa9852dd0 Signed-off-by: Dmitry Shmidt --- drivers/net/wireless/ti/wlcore/init.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/wireless/ti/wlcore/init.c b/drivers/net/wireless/ti/wlcore/init.c index e92f2639af2c..9fd3c6af0a61 100644 --- a/drivers/net/wireless/ti/wlcore/init.c +++ b/drivers/net/wireless/ti/wlcore/init.c @@ -549,6 +549,11 @@ static int wl12xx_init_ap_role(struct wl1271 *wl, struct wl12xx_vif *wlvif) { int ret; + /* Disable filtering */ + ret = wl1271_acx_group_address_tbl(wl, wlvif, false, NULL, 0); + if (ret < 0) + return ret; + ret = wl1271_acx_ap_max_tx_retry(wl, wlvif); if (ret < 0) return ret; From 92c4fc6f0946e846dee50b79b5dd061fe795dfaf Mon Sep 17 00:00:00 2001 From: Janis Danisevskis Date: Thu, 14 Apr 2016 13:57:03 +0100 Subject: [PATCH 0123/3220] UPSTREAM: procfs: fixes pthread cross-thread naming if !PR_DUMPABLE The PR_DUMPABLE flag causes the pid-related paths of the proc file system to be owned by ROOT. The implementation of pthread_set/getname_np, however, needs access to /proc//task//comm. If PR_DUMPABLE is false this implementation is locked out.
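To illustrate the affected path, here is a rough userspace sketch of what pthread_setname_np() for another thread boils down to (simplified; the actual glibc/bionic code differs in details):

	char path[64];
	snprintf(path, sizeof(path), "/proc/self/task/%d/comm", (int)tid);
	int fd = open(path, O_WRONLY);	/* fails with EACCES once the task
					   has been made non-dumpable */
	if (fd >= 0) {
		write(fd, name, strlen(name));
		close(fd);
	}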
This patch installs a special permission function for the file "comm" that grants read and write access to all threads of the same group regardless of the ownership of the inode. For all other threads the function falls back to the generic inode permission check. Signed-off-by: Janis Danisevskis --- fs/proc/base.c | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 3b6962c52965..c21bb4beb0d7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3076,6 +3076,44 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) return 0; } +/* + * proc_tid_comm_permission is a special permission function exclusively + * used for the node /proc//task//comm. + * It bypasses generic permission checks in the case where a task of the same + * task group attempts to access the node. + * The rationale behind this is that glibc and bionic access this node for + * cross thread naming (pthread_set/getname_np(!self)). However, if + * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0, + * which locks out the cross thread naming implementation. + * This function makes sure that the node is always accessible for members of + * the same thread group. + */ +static int proc_tid_comm_permission(struct inode *inode, int mask) +{ + bool is_same_tgroup; + struct task_struct *task; + + task = get_proc_task(inode); + if (!task) + return -ESRCH; + is_same_tgroup = same_thread_group(current, task); + put_task_struct(task); + + if (likely(is_same_tgroup && !(mask & MAY_EXEC))) { + /* This file (/proc//task//comm) can always be + * read or written by the members of the corresponding + * thread group. + */ + return 0; + } + + return generic_permission(inode, mask); +} + +static const struct inode_operations proc_tid_comm_inode_operations = { + .permission = proc_tid_comm_permission, +}; + /* * Tasks */ @@ -3094,7 +3132,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif - REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), + NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, + &proc_tid_comm_inode_operations, + &proc_pid_set_comm_operations, {}), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif From 239dd543887daf9337567e68e4de7f42efa53a48 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Wed, 11 May 2016 11:01:02 -0700 Subject: [PATCH 0124/3220] fiq_debugger: Add fiq_debugger.disable option This change allows using the same kernel image with different console options for uart and fiq_debugger. If fiq_debugger.disable is set to 1/y/Y, fiq_debugger will not be initialized.
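For example, with the debugger built in, it can be turned off from the kernel command line:

	fiq_debugger.disable=1

(y and Y work as well; when built as a module, the same knob is available through the 'disable' module parameter added below).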
Change-Id: I71fda54f5f863d13b1437b1f909e52dd375d002d Signed-off-by: Dmitry Shmidt --- drivers/staging/android/fiq_debugger/fiq_debugger.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger.c b/drivers/staging/android/fiq_debugger/fiq_debugger.c index 0c558331a3d2..b132cff14f01 100644 --- a/drivers/staging/android/fiq_debugger/fiq_debugger.c +++ b/drivers/staging/android/fiq_debugger/fiq_debugger.c @@ -123,11 +123,13 @@ static bool initial_console_enable; #endif static bool fiq_kgdb_enable; +static bool fiq_debugger_disable; module_param_named(no_sleep, initial_no_sleep, bool, 0644); module_param_named(debug_enable, initial_debug_enable, bool, 0644); module_param_named(console_enable, initial_console_enable, bool, 0644); module_param_named(kgdb_enable, fiq_kgdb_enable, bool, 0644); +module_param_named(disable, fiq_debugger_disable, bool, 0644); #ifdef CONFIG_FIQ_DEBUGGER_WAKEUP_IRQ_ALWAYS_ON static inline @@ -1230,6 +1232,10 @@ int fiq_debugger_uart_overlay(void) static int __init fiq_debugger_init(void) { + if (fiq_debugger_disable) { + pr_err("serial_debugger: disabled\n"); + return -ENODEV; + } #if defined(CONFIG_FIQ_DEBUGGER_CONSOLE) fiq_debugger_tty_init(); #endif From cc0063b8eb44f08bb899fb47d562c67d73b7245b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 12 May 2016 11:17:52 -0700 Subject: [PATCH 0125/3220] xt_qtaguid: Fix panic caused by processing non-full socket. In an issue very similar to 4e461c777e3 (xt_qtaguid: Fix panic caused by synack processing), we were seeing panics on occasion in testing. In this case, it was the same issue, but caused by a different call path, as the sk being returned from qtaguid_find_sk() was not a full socket, resulting in the sk->sk_socket dereference failing. This patch adds an extra check to ensure the sk being returned is a full socket, and if not it returns NULL. Reported-by: Milosz Wasilewski Signed-off-by: John Stultz --- net/netfilter/xt_qtaguid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index 822dc3c3bce1..e2e7d54f9bb1 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1606,7 +1606,7 @@ static struct sock *qtaguid_find_sk(const struct sk_buff *skb, * When in TCP_TIME_WAIT the sk is not a "struct sock" but * "struct inet_timewait_sock" which is missing fields. */ - if (sk->sk_state == TCP_TIME_WAIT) { + if (!sk_fullsock(sk) || sk->sk_state == TCP_TIME_WAIT) { sock_gen_put(sk); sk = NULL; } From d495067ae176cfd7a027d11c956fbacad009a4c7 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 16 May 2016 13:17:28 +0530 Subject: [PATCH 0126/3220] Revert "cpufreq: interactive: build fixes for 4.4" This reverts commit bc68f6c4efbd4ddbb15817203f18b7941d9ffd52. This build fix broke the Interactive Gov at runtime with duplicate sysfs entry warnings at boot time. We no longer need to create/destroy this cpufreq sysfs entry at run time on an as-needed basis thanks to upstream commit 8eec1020f0c0 (cpufreq: create cpu/cpufreq at boot time) which creates it at boot time. Hence drop this build fix.
Signed-off-by: Amit Pundir --- drivers/cpufreq/cpufreq_interactive.c | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index bd83be39f17b..7f846b06e024 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -116,28 +116,6 @@ struct cpufreq_interactive_tunables { bool io_is_busy; }; -/* - * HACK: FIXME: Bring back cpufreq_{get,put}_global_kobject() - * definition removed by upstream commit 8eec1020f0c0 "cpufreq: - * create cpu/cpufreq at boot time" to fix build failures. - */ -static int cpufreq_global_kobject_usage; - -int cpufreq_get_global_kobject(void) -{ - if (!cpufreq_global_kobject_usage++) - return kobject_add(cpufreq_global_kobject, - &cpu_subsys.dev_root->kobj, "%s", "cpufreq"); - - return 0; -} - -void cpufreq_put_global_kobject(void) -{ - if (!--cpufreq_global_kobject_usage) - kobject_del(cpufreq_global_kobject); -} - /* For cases where we have single governor instance for system */ static struct cpufreq_interactive_tunables *common_tunables; From 287d9d0efbd378dc0d7919bc9820737c869f5c0b Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 16 May 2016 13:25:35 +0530 Subject: [PATCH 0127/3220] cpufreq: interactive: drop cpufreq_{get,put}_global_kobject func calls Upstream commit 8eec1020f0c0 (cpufreq: create cpu/cpufreq at boot time) makes sure that the cpufreq sysfs entry gets created at boot time, and there is no need to create/destroy it on an as-needed basis anymore. So drop the deprecated cpufreq_{get,put}_global_kobject function calls, which otherwise result in the following compilation errors: drivers/cpufreq/cpufreq_interactive.c: In function 'cpufreq_governor_interactive': drivers/cpufreq/cpufreq_interactive.c:1187:4: error: implicit declaration of function 'cpufreq_get_global_kobject' [-Werror=implicit-function-declaration] WARN_ON(cpufreq_get_global_kobject()); ^ drivers/cpufreq/cpufreq_interactive.c:1197:5: error: implicit declaration of function 'cpufreq_put_global_kobject'[-Werror=implicit-function-declaration] cpufreq_put_global_kobject(); ^ Signed-off-by: Amit Pundir --- drivers/cpufreq/cpufreq_interactive.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index 7f846b06e024..f2929e628820 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -1184,7 +1184,6 @@ static int cpufreq_governor_interactive(struct cpufreq_policy *policy, policy->governor_data = tunables; if (!have_governor_per_policy()) { common_tunables = tunables; - WARN_ON(cpufreq_get_global_kobject()); } rc = sysfs_create_group(get_governor_parent_kobj(policy), @@ -1194,7 +1193,6 @@ static int cpufreq_governor_interactive(struct cpufreq_policy *policy, policy->governor_data = NULL; if (!have_governor_per_policy()) { common_tunables = NULL; - cpufreq_put_global_kobject(); } return rc; } @@ -1218,9 +1216,6 @@ static int cpufreq_governor_interactive(struct cpufreq_policy *policy, sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr()); - if (!have_governor_per_policy()) - cpufreq_put_global_kobject(); - kfree(tunables); common_tunables = NULL; } From 9a326f6c084b089765ed8ee903b76a5e8251b0b3 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 17 May 2016 16:06:17 +0530 Subject: [PATCH 0128/3220] Revert "drivers: power: use 'current' instead of 'get_current()'" This reverts commit e1b5d103894d097fb630aebc3c1fdaf257f7c9bb.
This patch fixed AOSP commit ad86cc8ad632 (drivers: power: Add watchdog timer to catch drivers which lockup during suspend.), which we dropped in Change-Id Ic72a87432e27844155467817600adc6cf0c2209c, so we no longer need this fix. A part of this patch was already reverted in the above-mentioned Change-Id. Signed-off-by: Amit Pundir --- drivers/base/power/main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 2f7a6e1b568c..5ea529165362 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -35,8 +35,6 @@ #include #include -#include - #include "../base.h" #include "power.h" From 1f95b0e9be77e4d72a6352b6fc486d0678def27f Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sun, 10 Jan 2016 22:40:55 -0800 Subject: [PATCH 0129/3220] UPSTREAM: tty: Fix unsafe ldisc reference via ioctl(TIOCGETD) (cherry pick from commit 5c17c861a357e9458001f021a7afa7aab9937439) ioctl(TIOCGETD) retrieves the line discipline id directly from the ldisc because the line discipline id (c_line) in termios is untrustworthy; userspace may have set termios via ioctl(TCSETS*) without actually changing the line discipline via ioctl(TIOCSETD). However, directly accessing the current ldisc via tty->ldisc is unsafe; the ldisc ptr dereferenced may be stale if the line discipline is changing via ioctl(TIOCSETD) or hangup. Wait for the line discipline reference (just like read() or write()) to retrieve the "current" line discipline id. Cc: Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman Bug: 28409131 Change-Id: I6774bd883a2e48bbe020486c72c42fb410e3f98a --- drivers/tty/tty_io.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index bcc8e1e8bb72..ab15b937fedb 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2652,6 +2652,28 @@ static int tiocsetd(struct tty_struct *tty, int __user *p) return ret; } +/** + * tiocgetd - get line discipline + * @tty: tty device + * @p: pointer to user data + * + * Retrieves the line discipline id directly from the ldisc. + * + * Locking: waits for ldisc reference (in case the line discipline + * is changing or the tty is being hungup) + */ + +static int tiocgetd(struct tty_struct *tty, int __user *p) +{ + struct tty_ldisc *ld; + int ret; + + ld = tty_ldisc_ref_wait(tty); + ret = put_user(ld->ops->num, p); + tty_ldisc_deref(ld); + return ret; +} + /** * send_break - performed time break * @tty: device to break on @@ -2878,7 +2900,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case TIOCGSID: return tiocgsid(tty, real_tty, p); case TIOCGETD: - return put_user(tty->ldisc->ops->num, (int __user *)p); + return tiocgetd(tty, p); case TIOCSETD: return tiocsetd(tty, p); case TIOCVHANGUP: From 8f2aea82ca669025419a4c9ab23b94f97c1ee170 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 4 Apr 2016 14:15:23 -0400 Subject: [PATCH 0130/3220] =?UTF-8?q?UPSTREAM:=20mac80211:=20fix=20"warnin?= =?UTF-8?q?g:=20=E2=80=98target=5Fmetric=E2=80=99=20may=20be=20used=20unin?= =?UTF-8?q?itialized"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (This cherry-picks b4201cc4fc6e1c57d6d306b1f787865043d60129 upstream) This fixes: net/mac80211/mesh_hwmp.c:603:26: warning: ‘target_metric’ may be used uninitialized in this function target_metric is only consumed when reply = true, so no bug exists here, but not all versions of gcc realize it.
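Reduced to a sketch, the false positive has this shape (illustrative only, not the actual mesh_hwmp.c code; the helper names are hypothetical):

	u32 target_metric;		/* no initializer */
	bool reply = false;

	if (have_reply_data()) {	/* hypothetical condition */
		target_metric = compute_metric();
		reply = true;
	}
	/* ... */
	if (reply)
		send_prep(target_metric);	/* only reached when assigned
						   above, but gcc cannot
						   always prove that */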
Initialize to 0 to remove the warning. Change-Id: I13923fda9d314f48196c29e4354133dfe01f5abd Signed-off-by: Jeff Mahoney Signed-off-by: Johannes Berg [jstultz: Cherry-picked to android-4.4] Signed-off-by: John Stultz --- net/mac80211/mesh_hwmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index c6be0b4f4058..b6dc2d7cd650 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -530,7 +530,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, const u8 *target_addr, *orig_addr; const u8 *da; u8 target_flags, ttl, flags; - u32 orig_sn, target_sn, lifetime, target_metric; + u32 orig_sn, target_sn, lifetime, target_metric = 0; bool reply = false; bool forward = true; bool root_is_gate; From cba8d1f729d830a200055f0a4bab51883304ac9c Mon Sep 17 00:00:00 2001 From: Winter Wang Date: Fri, 20 May 2016 11:05:00 +0800 Subject: [PATCH 0131/3220] ANDROID: usb: gadget: f_midi: set fi->f to NULL when free f_midi function fi->f is set in f_midi's alloc_func; it needs to be cleared to NULL in free_func, because otherwise, on a ConfigFS function switch, midi->usb_function itself is freed, fi->f becomes a wild pointer, and we run into the kernel panic below: --------------- [ 58.950628] Unable to handle kernel paging request at virtual address 63697664 [ 58.957869] pgd = c0004000 [ 58.960583] [63697664] *pgd=00000000 [ 58.964185] Internal error: Oops: 80000005 [#1] PREEMPT SMP ARM [ 58.970111] Modules linked in: [ 58.973191] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.1.15-03504-g34c857c-dirty #89 [ 58.981024] Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) [ 58.987557] task: c110bd70 ti: c1100000 task.ti: c1100000 [ 58.992962] PC is at 0x63697664 [ 58.996120] LR is at android_setup+0x78/0x138 <..snip..> [ 60.044980] 1fc0: ffffffff ffffffff c1000684 00000000 00000000 c108ecd0 c11f7294 c11039c0 [ 60.053181] 1fe0: c108eccc c110d148 1000406a 412fc09a 00000000 1000807c 00000000 00000000 [ 60.061420] [] (android_setup) from [] (udc_irq+0x758/0x1034) [ 60.068951] [] (udc_irq) from [] (handle_irq_event_percpu+0x50/0x254) [ 60.077165] [] (handle_irq_event_percpu) from [] (handle_irq_event+0x3c/0x5c) [ 60.086072] [] (handle_irq_event) from [] (handle_fasteoi_irq+0xe0/0x198) [ 60.094630] [] (handle_fasteoi_irq) from [] (generic_handle_irq+0x2c/0x3c) [ 60.103271] [] (generic_handle_irq) from [] (__handle_domain_irq+0x7c/0xec) [ 60.112000] [] (__handle_domain_irq) from [] (gic_handle_irq+0x24/0x5c) -------------- Signed-off-by: Winter Wang --- drivers/usb/gadget/function/f_midi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c index 8a0e7f988d25..847f70363477 100644 --- a/drivers/usb/gadget/function/f_midi.c +++ b/drivers/usb/gadget/function/f_midi.c @@ -1140,6 +1140,7 @@ static void f_midi_free(struct usb_function *f) for (i = opts->in_ports - 1; i >= 0; --i) kfree(midi->in_port[i]); kfree(midi); + opts->func_inst.f = NULL; --opts->refcnt; mutex_unlock(&opts->lock); } From c11dd134ea73ff9ee6bdb2807ac064cef2f284bd Mon Sep 17 00:00:00 2001 From: Haojian Zhuang Date: Fri, 22 Apr 2016 17:23:29 +0800 Subject: [PATCH 0132/3220] arm64: add option to build Image-dtb Some bootloaders couldn't decompress Image.gz-dtb.
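With this change the concatenated image can be requested directly, e.g. (assuming a configured arm64 tree):

	make ARCH=arm64 Image-dtb

which simply cats the built Image together with the selected DTBs, mirroring the existing Image.gz-dtb rule.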
Change-Id: I698cd0c4ee6894e8d0655d88f3ecf4826c28a645 Signed-off-by: Haojian Zhuang Signed-off-by: John Stultz Signed-off-by: Dmitry Shmidt --- arch/arm64/Makefile | 2 +- arch/arm64/boot/Makefile | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index a5c84594f6ad..4e77fd165f38 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -106,7 +106,7 @@ dtbs: prepare scripts dtbs_install: $(Q)$(MAKE) $(dtbinst)=$(boot)/dts -Image.gz-dtb: vmlinux scripts dtbs +Image-dtb Image.gz-dtb: vmlinux scripts dtbs $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ PHONY += vdso_install diff --git a/arch/arm64/boot/Makefile b/arch/arm64/boot/Makefile index ddd405f88344..7ab8e74cd83a 100644 --- a/arch/arm64/boot/Makefile +++ b/arch/arm64/boot/Makefile @@ -32,6 +32,9 @@ $(obj)/Image: vmlinux FORCE $(obj)/Image.bz2: $(obj)/Image FORCE $(call if_changed,bzip2) +$(obj)/Image-dtb: $(obj)/Image $(DTB_OBJS) FORCE + $(call if_changed,cat) + $(obj)/Image.gz: $(obj)/Image FORCE $(call if_changed,gzip) From 9f6f7a27247cbfa50c1c9ebdd8417e2006a18100 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Tue, 24 May 2016 14:41:57 -0700 Subject: [PATCH 0133/3220] ARM64: Ignore Image-dtb from git point of view Change-Id: I5bbf1db90f28ea956383b4a5d91ad508eea656dc Signed-off-by: Dmitry Shmidt --- arch/arm64/boot/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/.gitignore b/arch/arm64/boot/.gitignore index eb3551131b1e..34e35209fc2e 100644 --- a/arch/arm64/boot/.gitignore +++ b/arch/arm64/boot/.gitignore @@ -1,3 +1,4 @@ Image +Image-dtb Image.gz Image.gz-dtb From dbe3b633a70b928e8f7ca92c9eb6751ea4bd71bc Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 26 May 2016 12:57:56 +0530 Subject: [PATCH 0134/3220] Revert "arm: dcc_tty: fix armv6 dcc tty build failure" This reverts commit dfc1d4be88597141f5ad9d39908c13944d209009. Drop AOSP's "armv6 dcc tty driver" in favor of upstream DCC driver for ARMv6/v7 16c63f8ea49c (drivers: char: hvc: add arm JTAG DCC console support) and for ARMv8 4cad4c57e0b3 (ARM64: TTY: hvc_dcc: Add support for ARM64 dcc). Change-Id: I8110a4fd649b8ac1ec9bfac00255c1214135e4b2 Signed-off-by: Amit Pundir --- drivers/char/dcc_tty.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/dcc_tty.c b/drivers/char/dcc_tty.c index 0a62d410286f..a787accdcb14 100644 --- a/drivers/char/dcc_tty.c +++ b/drivers/char/dcc_tty.c @@ -26,7 +26,7 @@ MODULE_DESCRIPTION("DCC TTY Driver"); MODULE_LICENSE("GPL"); MODULE_VERSION("1.0"); -DEFINE_SPINLOCK(g_dcc_tty_lock); +static spinlock_t g_dcc_tty_lock = SPIN_LOCK_UNLOCKED; static struct hrtimer g_dcc_timer; static char g_dcc_buffer[16]; static int g_dcc_buffer_head; @@ -80,8 +80,8 @@ static void dcc_poll_locked(void) ); if (rch >= 0) { ch = rch; - tty_insert_flip_string(g_dcc_tty->port, &ch, 1); - tty_flip_buffer_push(g_dcc_tty->port); + tty_insert_flip_string(g_dcc_tty, &ch, 1); + tty_flip_buffer_push(g_dcc_tty); } } From fd2e341b4741a29e04f2010ced14edf5d2aa3ee0 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 26 May 2016 12:58:21 +0530 Subject: [PATCH 0135/3220] Revert "armv6 dcc tty driver" This reverts commit 97312429c2bef1bf8055d01b35cf12028f60ef62. Drop AOSP's "armv6 dcc tty driver" in favor of upstream DCC driver for ARMv6/v7 16c63f8ea49c (drivers: char: hvc: add arm JTAG DCC console support) and for ARMv8 4cad4c57e0b3 (ARM64: TTY: hvc_dcc: Add support for ARM64 dcc). 
Change-Id: I0ca651ef2d854fff03cee070524fe1e3971b6d8f Signed-off-by: Amit Pundir --- drivers/char/Kconfig | 4 - drivers/char/Makefile | 1 - drivers/char/dcc_tty.c | 326 ----------------------------------------- 3 files changed, 331 deletions(-) delete mode 100644 drivers/char/dcc_tty.c diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 29dbe0fbdf8b..a043107da2af 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -588,10 +588,6 @@ config DEVPORT depends on ISA || PCI default y -config DCC_TTY - tristate "DCC tty driver" - depends on ARM - source "drivers/s390/char/Kconfig" config TILE_SROM diff --git a/drivers/char/Makefile b/drivers/char/Makefile index 7671551e3196..d8a7579300d2 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -53,7 +53,6 @@ obj-$(CONFIG_PCMCIA) += pcmcia/ obj-$(CONFIG_HANGCHECK_TIMER) += hangcheck-timer.o obj-$(CONFIG_TCG_TPM) += tpm/ -obj-$(CONFIG_DCC_TTY) += dcc_tty.o obj-$(CONFIG_PS3_FLASH) += ps3flash.o obj-$(CONFIG_JS_RTC) += js-rtc.o diff --git a/drivers/char/dcc_tty.c b/drivers/char/dcc_tty.c deleted file mode 100644 index a787accdcb14..000000000000 --- a/drivers/char/dcc_tty.c +++ /dev/null @@ -1,326 +0,0 @@ -/* drivers/char/dcc_tty.c - * - * Copyright (C) 2007 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -MODULE_DESCRIPTION("DCC TTY Driver"); -MODULE_LICENSE("GPL"); -MODULE_VERSION("1.0"); - -static spinlock_t g_dcc_tty_lock = SPIN_LOCK_UNLOCKED; -static struct hrtimer g_dcc_timer; -static char g_dcc_buffer[16]; -static int g_dcc_buffer_head; -static int g_dcc_buffer_count; -static unsigned g_dcc_write_delay_usecs = 1; -static struct tty_driver *g_dcc_tty_driver; -static struct tty_struct *g_dcc_tty; -static int g_dcc_tty_open_count; - -static void dcc_poll_locked(void) -{ - char ch; - int rch; - int written; - - while (g_dcc_buffer_count) { - ch = g_dcc_buffer[g_dcc_buffer_head]; - asm( - "mrc 14, 0, r15, c0, c1, 0\n" - "mcrcc 14, 0, %1, c0, c5, 0\n" - "movcc %0, #1\n" - "movcs %0, #0\n" - : "=r" (written) - : "r" (ch) - ); - if (written) { - if (ch == '\n') - g_dcc_buffer[g_dcc_buffer_head] = '\r'; - else { - g_dcc_buffer_head = (g_dcc_buffer_head + 1) % ARRAY_SIZE(g_dcc_buffer); - g_dcc_buffer_count--; - if (g_dcc_tty) - tty_wakeup(g_dcc_tty); - } - g_dcc_write_delay_usecs = 1; - } else { - if (g_dcc_write_delay_usecs > 0x100) - break; - g_dcc_write_delay_usecs <<= 1; - udelay(g_dcc_write_delay_usecs); - } - } - - if (g_dcc_tty && !test_bit(TTY_THROTTLED, &g_dcc_tty->flags)) { - asm( - "mrc 14, 0, %0, c0, c1, 0\n" - "tst %0, #(1 << 30)\n" - "moveq %0, #-1\n" - "mrcne 14, 0, %0, c0, c5, 0\n" - : "=r" (rch) - ); - if (rch >= 0) { - ch = rch; - tty_insert_flip_string(g_dcc_tty, &ch, 1); - tty_flip_buffer_push(g_dcc_tty); - } - } - - - if (g_dcc_buffer_count) - hrtimer_start(&g_dcc_timer, ktime_set(0, g_dcc_write_delay_usecs * NSEC_PER_USEC), HRTIMER_MODE_REL); - else - hrtimer_start(&g_dcc_timer, ktime_set(0, 20 * NSEC_PER_MSEC), HRTIMER_MODE_REL); -} - -static int dcc_tty_open(struct tty_struct * tty, 
struct file * filp) -{ - int ret; - unsigned long irq_flags; - - spin_lock_irqsave(&g_dcc_tty_lock, irq_flags); - if (g_dcc_tty == NULL || g_dcc_tty == tty) { - g_dcc_tty = tty; - g_dcc_tty_open_count++; - ret = 0; - } else - ret = -EBUSY; - spin_unlock_irqrestore(&g_dcc_tty_lock, irq_flags); - - printk("dcc_tty_open, tty %p, f_flags %x, returned %d\n", tty, filp->f_flags, ret); - - return ret; -} - -static void dcc_tty_close(struct tty_struct * tty, struct file * filp) -{ - printk("dcc_tty_close, tty %p, f_flags %x\n", tty, filp->f_flags); - if (g_dcc_tty == tty) { - if (--g_dcc_tty_open_count == 0) - g_dcc_tty = NULL; - } -} - -static int dcc_write(const unsigned char *buf_start, int count) -{ - const unsigned char *buf = buf_start; - unsigned long irq_flags; - int copy_len; - int space_left; - int tail; - - if (count < 1) - return 0; - - spin_lock_irqsave(&g_dcc_tty_lock, irq_flags); - do { - tail = (g_dcc_buffer_head + g_dcc_buffer_count) % ARRAY_SIZE(g_dcc_buffer); - copy_len = ARRAY_SIZE(g_dcc_buffer) - tail; - space_left = ARRAY_SIZE(g_dcc_buffer) - g_dcc_buffer_count; - if (copy_len > space_left) - copy_len = space_left; - if (copy_len > count) - copy_len = count; - memcpy(&g_dcc_buffer[tail], buf, copy_len); - g_dcc_buffer_count += copy_len; - buf += copy_len; - count -= copy_len; - if (copy_len < count && copy_len < space_left) { - space_left -= copy_len; - copy_len = count; - if (copy_len > space_left) { - copy_len = space_left; - } - memcpy(g_dcc_buffer, buf, copy_len); - buf += copy_len; - count -= copy_len; - g_dcc_buffer_count += copy_len; - } - dcc_poll_locked(); - space_left = ARRAY_SIZE(g_dcc_buffer) - g_dcc_buffer_count; - } while(count && space_left); - spin_unlock_irqrestore(&g_dcc_tty_lock, irq_flags); - return buf - buf_start; -} - -static int dcc_tty_write(struct tty_struct * tty, const unsigned char *buf, int count) -{ - int ret; - /* printk("dcc_tty_write %p, %d\n", buf, count); */ - ret = dcc_write(buf, count); - if (ret != count) - printk("dcc_tty_write %p, %d, returned %d\n", buf, count, ret); - return ret; -} - -static int dcc_tty_write_room(struct tty_struct *tty) -{ - int space_left; - unsigned long irq_flags; - - spin_lock_irqsave(&g_dcc_tty_lock, irq_flags); - space_left = ARRAY_SIZE(g_dcc_buffer) - g_dcc_buffer_count; - spin_unlock_irqrestore(&g_dcc_tty_lock, irq_flags); - return space_left; -} - -static int dcc_tty_chars_in_buffer(struct tty_struct *tty) -{ - int ret; - asm( - "mrc 14, 0, %0, c0, c1, 0\n" - "mov %0, %0, LSR #30\n" - "and %0, %0, #1\n" - : "=r" (ret) - ); - return ret; -} - -static void dcc_tty_unthrottle(struct tty_struct * tty) -{ - unsigned long irq_flags; - - spin_lock_irqsave(&g_dcc_tty_lock, irq_flags); - dcc_poll_locked(); - spin_unlock_irqrestore(&g_dcc_tty_lock, irq_flags); -} - -static enum hrtimer_restart dcc_tty_timer_func(struct hrtimer *timer) -{ - unsigned long irq_flags; - - spin_lock_irqsave(&g_dcc_tty_lock, irq_flags); - dcc_poll_locked(); - spin_unlock_irqrestore(&g_dcc_tty_lock, irq_flags); - return HRTIMER_NORESTART; -} - -void dcc_console_write(struct console *co, const char *b, unsigned count) -{ -#if 1 - dcc_write(b, count); -#else - /* blocking printk */ - while (count > 0) { - int written; - written = dcc_write(b, count); - if (written) { - b += written; - count -= written; - } - } -#endif -} - -static struct tty_driver *dcc_console_device(struct console *c, int *index) -{ - *index = 0; - return g_dcc_tty_driver; -} - -static int __init dcc_console_setup(struct console *co, char *options) -{ - if (co->index != 0) 
- return -ENODEV; - return 0; -} - - -static struct console dcc_console = -{ - .name = "ttyDCC", - .write = dcc_console_write, - .device = dcc_console_device, - .setup = dcc_console_setup, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -static struct tty_operations dcc_tty_ops = { - .open = dcc_tty_open, - .close = dcc_tty_close, - .write = dcc_tty_write, - .write_room = dcc_tty_write_room, - .chars_in_buffer = dcc_tty_chars_in_buffer, - .unthrottle = dcc_tty_unthrottle, -}; - -static int __init dcc_tty_init(void) -{ - int ret; - - hrtimer_init(&g_dcc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - g_dcc_timer.function = dcc_tty_timer_func; - - g_dcc_tty_driver = alloc_tty_driver(1); - if (!g_dcc_tty_driver) { - printk(KERN_ERR "dcc_tty_probe: alloc_tty_driver failed\n"); - ret = -ENOMEM; - goto err_alloc_tty_driver_failed; - } - g_dcc_tty_driver->owner = THIS_MODULE; - g_dcc_tty_driver->driver_name = "dcc"; - g_dcc_tty_driver->name = "ttyDCC"; - g_dcc_tty_driver->major = 0; // auto assign - g_dcc_tty_driver->minor_start = 0; - g_dcc_tty_driver->type = TTY_DRIVER_TYPE_SERIAL; - g_dcc_tty_driver->subtype = SERIAL_TYPE_NORMAL; - g_dcc_tty_driver->init_termios = tty_std_termios; - g_dcc_tty_driver->flags = TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV; - tty_set_operations(g_dcc_tty_driver, &dcc_tty_ops); - ret = tty_register_driver(g_dcc_tty_driver); - if (ret) { - printk(KERN_ERR "dcc_tty_probe: tty_register_driver failed, %d\n", ret); - goto err_tty_register_driver_failed; - } - tty_register_device(g_dcc_tty_driver, 0, NULL); - - register_console(&dcc_console); - hrtimer_start(&g_dcc_timer, ktime_set(0, 0), HRTIMER_MODE_REL); - - return 0; - -err_tty_register_driver_failed: - put_tty_driver(g_dcc_tty_driver); - g_dcc_tty_driver = NULL; -err_alloc_tty_driver_failed: - return ret; -} - -static void __exit dcc_tty_exit(void) -{ - int ret; - - tty_unregister_device(g_dcc_tty_driver, 0); - ret = tty_unregister_driver(g_dcc_tty_driver); - if (ret < 0) { - printk(KERN_ERR "dcc_tty_remove: tty_unregister_driver failed, %d\n", ret); - } else { - put_tty_driver(g_dcc_tty_driver); - } - g_dcc_tty_driver = NULL; -} - -module_init(dcc_tty_init); -module_exit(dcc_tty_exit); - - From ef55b4532be92094b0a1f03f2e1a50d4b49a30ae Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2016 10:18:51 +0100 Subject: [PATCH 0136/3220] UPSTREAM: arm64: module: fix relocation of movz instruction with negative immediate The test whether a movz instruction with a signed immediate should be turned into a movn instruction (i.e., when the immediate is negative) is flawed, since the value of imm is always positive. Also, the subsequent bounds check is incorrect since the limit update never executes, due to the fact that the imm_type comparison will always be false for negative signed immediates. Let's fix this by performing the sign test on sval directly, and replacing the bounds check with a simple comparison against U16_MAX. 
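As a concrete illustration of the flaw, take a signed G0 relocation of the value -1: sval = 0xffffffffffffffff, and the old code computed imm = sval & 0xffff = 0xffff. Since (s64)imm is 65535 and never negative, the MOVN branch could never be taken and the instruction stayed MOVZ #0xffff, materializing 0x000000000000ffff instead of -1. Testing sval itself selects MOVN and inverts the immediate (~sval = 0), and MOVN #0 correctly yields all ones.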
Change-Id: I9ad3d8bfd91e5fdc6434b1be6c3062dfec193176 Signed-off-by: Ard Biesheuvel [will: tidied up use of sval, renamed MOVK enum value to MOVKZ] Signed-off-by: Will Deacon --- arch/arm64/kernel/module.c | 51 ++++++++++++++------------------------ 1 file changed, 18 insertions(+), 33 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index f4bc779e62e8..03464ab0fff2 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -30,9 +30,6 @@ #include #include -#define AARCH64_INSN_IMM_MOVNZ AARCH64_INSN_IMM_MAX -#define AARCH64_INSN_IMM_MOVK AARCH64_INSN_IMM_16 - void *module_alloc(unsigned long size) { void *p; @@ -110,16 +107,20 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) return 0; } +enum aarch64_insn_movw_imm_type { + AARCH64_INSN_IMM_MOVNZ, + AARCH64_INSN_IMM_MOVKZ, +}; + static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val, - int lsb, enum aarch64_insn_imm_type imm_type) + int lsb, enum aarch64_insn_movw_imm_type imm_type) { - u64 imm, limit = 0; + u64 imm; s64 sval; u32 insn = le32_to_cpu(*(u32 *)place); sval = do_reloc(op, place, val); - sval >>= lsb; - imm = sval & 0xffff; + imm = sval >> lsb; if (imm_type == AARCH64_INSN_IMM_MOVNZ) { /* @@ -128,7 +129,7 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val, * immediate is less than zero. */ insn &= ~(3 << 29); - if ((s64)imm >= 0) { + if (sval >= 0) { /* >=0: Set the instruction to MOVZ (opcode 10b). */ insn |= 2 << 29; } else { @@ -140,29 +141,13 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val, */ imm = ~imm; } - imm_type = AARCH64_INSN_IMM_MOVK; } /* Update the instruction with the new encoding. */ - insn = aarch64_insn_encode_immediate(imm_type, insn, imm); + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); *(u32 *)place = cpu_to_le32(insn); - /* Shift out the immediate field. */ - sval >>= 16; - - /* - * For unsigned immediates, the overflow check is straightforward. - * For signed immediates, the sign bit is actually the bit past the - * most significant bit of the field. - * The AARCH64_INSN_IMM_16 immediate type is unsigned. - */ - if (imm_type != AARCH64_INSN_IMM_16) { - sval++; - limit++; - } - - /* Check the upper bits depending on the sign of the immediate. */ - if ((u64)sval > limit) + if (imm > U16_MAX) return -ERANGE; return 0; @@ -267,25 +252,25 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, overflow_check = false; case R_AARCH64_MOVW_UABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_16); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_UABS_G1_NC: overflow_check = false; case R_AARCH64_MOVW_UABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_16); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_UABS_G2_NC: overflow_check = false; case R_AARCH64_MOVW_UABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_16); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_UABS_G3: /* We're using the top bits so we can't overflow. 
*/ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 48, - AARCH64_INSN_IMM_16); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_SABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, @@ -302,7 +287,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_MOVW_PREL_G0_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVK); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_PREL_G0: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, @@ -311,7 +296,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_MOVW_PREL_G1_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVK); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_PREL_G1: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, @@ -320,7 +305,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_MOVW_PREL_G2_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVK); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_PREL_G2: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, From d8853fd2e900115923bb36a9137d6b3f1095ee70 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2016 10:18:52 +0100 Subject: [PATCH 0137/3220] UPSTREAM: arm64: module: avoid undefined shift behavior in reloc_data() Compilers may engage the improbability drive when encountering shifts by a distance that is a multiple of the size of the operand type. Since the required bounds check is very simple here, we can get rid of all the fuzzy masking, shifting and comparing, and use the documented bounds directly. Change-Id: Ibc1b73f4a630bc182deb6edfa7458b5e29ba9577 Reported-by: David Binderman Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon --- arch/arm64/kernel/module.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 03464ab0fff2..93e970231ca9 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -72,15 +72,18 @@ static u64 do_reloc(enum aarch64_reloc_op reloc_op, void *place, u64 val) static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) { - u64 imm_mask = (1 << len) - 1; s64 sval = do_reloc(op, place, val); switch (len) { case 16: *(s16 *)place = sval; + if (sval < S16_MIN || sval > U16_MAX) + return -ERANGE; break; case 32: *(s32 *)place = sval; + if (sval < S32_MIN || sval > U32_MAX) + return -ERANGE; break; case 64: *(s64 *)place = sval; @@ -89,21 +92,6 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) pr_err("Invalid length (%d) for data relocation\n", len); return 0; } - - /* - * Extract the upper value bits (including the sign bit) and - * shift them to bit 0. - */ - sval = (s64)(sval & ~(imm_mask >> 1)) >> (len - 1); - - /* - * Overflow has occurred if the value is not representable in - * len bits (i.e the bottom len bits are not sign-extended and - * the top bits are not all zero). - */ - if ((u64)(sval + 1) > 2) - return -ERANGE; - return 0; } From ebac2a3dcd458126b8c918b3315b1367b94f274d Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 19 Jan 2016 21:35:15 +0000 Subject: [PATCH 0138/3220] BACKPORT: perf tools: Document the perf sysctls perf_event_paranoid was only documented in source code and a perf error message. Copy the documentation from the error message to Documentation/sysctl/kernel.txt. 
perf_cpu_time_max_percent was already documented but missing from the list at the top, so add it there. Signed-off-by: Ben Hutchings Cc: Peter Zijlstra Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/20160119213515.GG2637@decadent.org.uk [ Remove reference to external Documentation file, provide info inline, as before ] Signed-off-by: Arnaldo Carvalho de Melo Bug: 29054680 Change-Id: I13e73cfb2ad761c94762d0c8196df7725abdf5c5 --- Documentation/sysctl/kernel.txt | 13 +++++++++++++ tools/perf/util/evsel.c | 15 +++++++++------ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index af70d1541d3a..88a2c8eb4502 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -58,6 +58,8 @@ show up in /proc/sys/kernel: - panic_on_stackoverflow - panic_on_unrecovered_nmi - panic_on_warn +- perf_cpu_time_max_percent +- perf_event_paranoid - pid_max - powersave-nap [ PPC only ] - printk @@ -624,6 +626,17 @@ allowed to execute. ============================================================== +perf_event_paranoid: + +Controls use of the performance events system by unprivileged +users (without CAP_SYS_ADMIN). The default value is 1. + + -1: Allow use of (almost) all events by all users +>=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK +>=1: Disallow CPU event access by users without CAP_SYS_ADMIN +>=2: Disallow kernel profiling by users without CAP_SYS_ADMIN + +============================================================== pid_max: diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 397fb4ed3c97..d4913a46ee1c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2313,12 +2313,15 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, case EPERM: case EACCES: return scnprintf(msg, size, - "You may not have permission to collect %sstats.\n" - "Consider tweaking /proc/sys/kernel/perf_event_paranoid:\n" - " -1 - Not paranoid at all\n" - " 0 - Disallow raw tracepoint access for unpriv\n" - " 1 - Disallow cpu events for unpriv\n" - " 2 - Disallow kernel profiling for unpriv", + "You may not have permission to collect %sstats.\n\n" + "Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n" + "which controls use of the performance events system by\n" + "unprivileged users (without CAP_SYS_ADMIN).\n\n" + "The default value is 1:\n\n" + " -1: Allow use of (almost) all events by all users\n" + ">= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK\n" + ">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n" + ">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN", target->system_wide ? "system-wide " : ""); case ENOENT: return scnprintf(msg, size, "The %s event is not supported.", From 9d5f5d93461e2b6f697054c06ed26bc5fa4d629e Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Sun, 29 May 2016 14:22:32 -0700 Subject: [PATCH 0139/3220] FROMLIST: security,perf: Allow further restriction of perf_event_open When kernel.perf_event_paranoid is set to 3 (or greater), disallow all access to performance events by users without CAP_SYS_ADMIN. Add a Kconfig symbol CONFIG_SECURITY_PERF_EVENTS_RESTRICT that makes this value the default. This is based on a similar feature in grsecurity (CONFIG_GRKERNSEC_PERF_HARDEN). This version doesn't include making the variable read-only. It also allows enabling further restriction at run-time regardless of whether the default is changed.
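As a usage sketch, the restriction can also be enabled at run time on a kernel built without the new Kconfig default:

	# echo 3 > /proc/sys/kernel/perf_event_paranoid

after which perf_event_open() fails with EACCES for any caller lacking CAP_SYS_ADMIN, per the check added to the syscall below.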
https://lkml.org/lkml/2016/1/11/587 Signed-off-by: Ben Hutchings Bug: 29054680 Change-Id: Iff5bff4fc1042e85866df9faa01bce8d04335ab8 --- Documentation/sysctl/kernel.txt | 4 +++- include/linux/perf_event.h | 5 +++++ kernel/events/core.c | 8 ++++++++ security/Kconfig | 9 +++++++++ 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 88a2c8eb4502..5728779df1ab 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -629,12 +629,14 @@ allowed to execute. perf_event_paranoid: Controls use of the performance events system by unprivileged -users (without CAP_SYS_ADMIN). The default value is 1. +users (without CAP_SYS_ADMIN). The default value is 3 if +CONFIG_SECURITY_PERF_EVENTS_RESTRICT is set, or 1 otherwise. -1: Allow use of (almost) all events by all users >=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK >=1: Disallow CPU event access by users without CAP_SYS_ADMIN >=2: Disallow kernel profiling by users without CAP_SYS_ADMIN +>=3: Disallow all event access by users without CAP_SYS_ADMIN ============================================================== diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f9828a48f16a..aa7294092e51 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -989,6 +989,11 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, loff_t *ppos); +static inline bool perf_paranoid_any(void) +{ + return sysctl_perf_event_paranoid > 2; +} + static inline bool perf_paranoid_tracepoint_raw(void) { return sysctl_perf_event_paranoid > -1; diff --git a/kernel/events/core.c b/kernel/events/core.c index cfc227ccfceb..85bc810bece6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -175,8 +175,13 @@ static struct srcu_struct pmus_srcu; * 0 - disallow raw tracepoint access for unpriv * 1 - disallow cpu events for unpriv * 2 - disallow kernel profiling for unpriv + * 3 - disallow all unpriv perf event use */ +#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT +int sysctl_perf_event_paranoid __read_mostly = 3; +#else int sysctl_perf_event_paranoid __read_mostly = 1; +#endif /* Minimum for 512 kiB + 1 user control page */ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ @@ -8265,6 +8270,9 @@ SYSCALL_DEFINE5(perf_event_open, if (flags & ~PERF_FLAG_ALL) return -EINVAL; + if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + err = perf_copy_attr(attr_uptr, &attr); if (err) return err; diff --git a/security/Kconfig b/security/Kconfig index e45237897b43..30a2603e8c85 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -18,6 +18,15 @@ config SECURITY_DMESG_RESTRICT If you are unsure how to answer this question, answer N. +config SECURITY_PERF_EVENTS_RESTRICT + bool "Restrict unprivileged use of performance events" + depends on PERF_EVENTS + help + If you say Y here, the kernel.perf_event_paranoid sysctl + will be set to 3 by default, and no unprivileged use of the + perf_event_open syscall will be permitted unless it is + changed. + config SECURITY bool "Enable different security models" depends on SYSFS From 0ac94b48c0c947accdacf310c068f6bad4796d02 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Wed, 1 Jun 2016 13:44:47 -0700 Subject: [PATCH 0140/3220] ANDROID: restrict access to perf events Add: CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y to android-base.cfg The kernel.perf_event_paranoid sysctl is set to 3 by default. 
No unprivileged use of the perf_event_open syscall will be permitted unless it is changed. Bug: 29054680 Change-Id: Ie7512259150e146d8e382dc64d40e8faaa438917 --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 304f1d4fd7c4..6db5542a51f4 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -145,6 +145,7 @@ CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y CONFIG_SECURITY_SELINUX=y CONFIG_SETEND_EMULATION=y CONFIG_STAGING=y From 249d2baf9b494381f2ff4f21feeaaa47eaa78e7c Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 3 Jun 2016 14:06:14 -0700 Subject: [PATCH 0141/3220] ANDROID: dm verity fec: limit error correction recursion If verity tree itself is sufficiently corrupted in addition to data blocks, it's possible for error correction to end up in a deep recursive error correction loop that eventually causes a kernel panic as follows: [ 14.728962] [] verity_fec_decode+0xa8/0x138 [ 14.734691] [] verity_verify_level+0x11c/0x180 [ 14.740681] [] verity_hash_for_block+0x88/0xe0 [ 14.746671] [] fec_decode_rsb+0x318/0x75c [ 14.752226] [] verity_fec_decode+0xa8/0x138 [ 14.757956] [] verity_verify_level+0x11c/0x180 [ 14.763944] [] verity_hash_for_block+0x88/0xe0 This change limits the recursion to a reasonable level during a single I/O operation. Bug: 28943429 Signed-off-by: Sami Tolvanen Change-Id: I0a7ebff331d259c59a5e03c81918cc1613c3a766 (cherry picked from commit f4b9e40597e73942d2286a73463c55f26f61bfa7) --- drivers/md/dm-verity-fec.c | 11 ++++++++++- drivers/md/dm-verity-fec.h | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index ad10d6d8ed28..b26809a47ca3 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -442,6 +442,13 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, if (!verity_fec_is_enabled(v)) return -EOPNOTSUPP; + if (fio->level >= DM_VERITY_FEC_MAX_RECURSION) { + DMWARN_LIMIT("%s: FEC: recursion too deep", v->data_dev->name); + return -EIO; + } + + fio->level++; + if (type == DM_VERITY_BLOCK_TYPE_METADATA) block += v->data_blocks; @@ -475,7 +482,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, if (r < 0) { r = fec_decode_rsb(v, io, fio, rsb, offset, true); if (r < 0) - return r; + goto done; } if (dest) @@ -485,6 +492,8 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, r = verity_for_bv_block(v, io, iter, fec_bv_copy); } +done: + fio->level--; return r; } diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 8c4bee052a73..b8e21cef3ad1 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -28,6 +28,9 @@ #define DM_VERITY_FEC_BUF_MAX \ (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS)) +/* maximum recursion level for verity_fec_decode */ +#define DM_VERITY_FEC_MAX_RECURSION 4 + #define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" #define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" #define DM_VERITY_OPT_FEC_START "fec_start" @@ -61,6 +64,7 @@ struct dm_verity_fec_io { unsigned nbufs; /* number of buffers allocated */ u8 *output; /* buffer for corrected output */ size_t output_pos; + unsigned level; /* recursion level */ }; #ifdef CONFIG_DM_VERITY_FEC From c4d8e3e8d2aa09f5826f517339baa7632f7873ea Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 3 Jun 2016 
14:22:46 -0700 Subject: [PATCH 0142/3220] ANDROID: dm verity fec: add missing release from fec_ktype Add a release function to allow destroying the dm-verity device. Bug: 27928374 Signed-off-by: Sami Tolvanen Change-Id: Ic0f7c17e4889c5580d70b52d9a709a37165a5747 (cherry picked from commit 0039ccf47c8f99888f7b71b2a36a68a027fbe357) --- drivers/md/dm-verity-fec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index b26809a47ca3..454535d23a7f 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -689,7 +689,8 @@ static struct attribute *fec_attrs[] = { static struct kobj_type fec_ktype = { .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = fec_attrs + .default_attrs = fec_attrs, + .release = dm_kobject_release }; /* From 8f9576b38193b8eb32d94d17f02baab436c3580a Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 17 Jun 2016 11:22:03 -0700 Subject: [PATCH 0143/3220] ANDROID: dm verity fec: fix RS block calculation A call to do_div was changed in Linux 4.5 to div64_u64 in verity_fec_decode, which broke RS block calculation due to incompatible semantics. This change fixes the computation. Bug: 21893453 Change-Id: Idb88b901e0209c2cccc9c0796689f780592d58f9 Signed-off-by: Sami Tolvanen (cherry picked from commit 879aac93eebcc2862d71afa9eca3a0c0f51b3b01) --- drivers/md/dm-verity-fec.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 454535d23a7f..a1e8571ce314 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -463,9 +463,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, */ offset = block << v->data_dev_block_bits; - - res = offset; - div64_u64(res, v->fec->rounds << v->data_dev_block_bits); + res = div64_u64(offset, v->fec->rounds << v->data_dev_block_bits); /* * The base RS block we can feed to the interleaver to find out all From d053106b93363b77ef8ca60352069cbdc8fa0f87 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 17 Jun 2016 11:31:17 -0700 Subject: [PATCH 0144/3220] ANDROID: dm verity fec: initialize recursion level Explicitly initialize recursion level to zero at the beginning of each I/O operation. Bug: 28943429 Change-Id: I00c612be2b8c22dd5afb65a739551df91cb324fc Signed-off-by: Sami Tolvanen (cherry picked from commit 32ffb3a22d7fd269b2961323478ece92c06a8334) --- drivers/md/dm-verity-fec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index a1e8571ce314..1dd667b97530 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -532,6 +532,7 @@ void verity_fec_init_io(struct dm_verity_io *io) memset(fio->bufs, 0, sizeof(fio->bufs)); fio->nbufs = 0; fio->output = NULL; + fio->level = 0; } /* From ab4f14dd052ba453558a8913d8d5547abbfc7b88 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Mon, 7 Mar 2016 11:31:10 +0100 Subject: [PATCH 0145/3220] UPSTREAM: usbnet: cleanup after bind() in probe() (cherry pick from commit 1666984c8625b3db19a9abc298931d35ab7bc64b) In case bind() works, but a later error forces bailing out of probe(), a deferred kevent may still be pending and a timer may be scheduled. They must be killed. This fixes an error case related to the double free reported in http://www.spinics.net/lists/netdev/msg367669.html and needs to go on top of Linus' fix to cdc-ncm. Signed-off-by: Oliver Neukum Signed-off-by: David S.
Miller Bug: 28744625 --- drivers/net/usb/usbnet.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 0744bf2ef2d6..c2ea4e5666fb 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1766,6 +1766,13 @@ out3: if (info->unbind) info->unbind (dev, udev); out1: + /* subdrivers must undo all they did in bind() if they + * fail it, but we may fail later and a deferred kevent + * may trigger an error resubmitting itself and, worse, + * schedule a timer. So we kill it all just in case. + */ + cancel_work_sync(&dev->kevent); + del_timer_sync(&dev->delay); free_netdev(net); out: return status; From 872e24621a63b75b3b644f6369b04bfb092264b1 Mon Sep 17 00:00:00 2001 From: Thierry Strudel Date: Tue, 14 Jun 2016 17:46:44 -0700 Subject: [PATCH 0146/3220] ANDROID: cpu: send KOBJ_ONLINE event when enabling cpus In case some sysfs nodes need to be labeled with a different label than sysfs, the user needs to be notified when a core is brought back online. Signed-off-by: Thierry Strudel Bug: 29359497 Change-Id: I0395c86e01cd49c348fda8f93087d26f88557c91 --- kernel/cpu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index 37731292f8a1..9ced7c751648 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -627,6 +627,7 @@ void __weak arch_enable_nonboot_cpus_end(void) void enable_nonboot_cpus(void) { int cpu, error; + struct device *cpu_device; /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); @@ -644,6 +645,12 @@ void enable_nonboot_cpus(void) trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { pr_info("CPU%d is up\n", cpu); + cpu_device = get_cpu_device(cpu); + if (!cpu_device) + pr_err("%s: failed to get cpu%d device\n", + __func__, cpu); + else + kobject_uevent(&cpu_device->kobj, KOBJ_ONLINE); continue; } pr_warn("Error taking CPU%d up: %d\n", cpu, error); From 6da57e2f0beee58a6570719a865f4e1b1f7822c2 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 23 Jun 2016 11:51:39 +0530 Subject: [PATCH 0147/3220] ANDROID: configs: remove unused configs Remove the following configs, which no longer exist: CONFIG_IP6_NF_TARGET_REJECT_SKERR CONFIG_IP_NF_TARGET_REJECT_SKERR CONFIG_RESOURCE_COUNTERS CONFIG_TABLET_USB_WACOM Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 3 --- android/configs/android-recommended.cfg | 1 - 2 files changed, 4 deletions(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 6db5542a51f4..bdd99f2becfc 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -37,7 +37,6 @@ CONFIG_IP6_NF_IPTABLES=y CONFIG_IP6_NF_MANGLE=y CONFIG_IP6_NF_RAW=y CONFIG_IP6_NF_TARGET_REJECT=y -CONFIG_IP6_NF_TARGET_REJECT_SKERR=y CONFIG_IPV6=y CONFIG_IPV6_MIP6=y CONFIG_IPV6_MULTIPLE_TABLES=y @@ -64,7 +63,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=y CONFIG_IP_NF_TARGET_NETMAP=y CONFIG_IP_NF_TARGET_REDIRECT=y CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_IP_NF_TARGET_REJECT_SKERR=y CONFIG_NET=y CONFIG_NETDEVICES=y CONFIG_NETFILTER=y @@ -140,7 +138,6 @@ CONFIG_PPP_DEFLATE=y CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y CONFIG_QUOTA=y -CONFIG_RESOURCE_COUNTERS=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y CONFIG_SECURITY=y diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index 35936afdcae4..c3222a77ba24 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -110,7 +110,6 @@ CONFIG_TABLET_USB_AIPTEK=y CONFIG_TABLET_USB_GTCO=y
CONFIG_TABLET_USB_HANWANG=y CONFIG_TABLET_USB_KBTAB=y -CONFIG_TABLET_USB_WACOM=y CONFIG_TASKSTATS=y CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_IO_ACCOUNTING=y From b40d9bcad6420669ccf41f69278506125fa68428 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:46:24 -0400 Subject: [PATCH 0148/3220] UPSTREAM: net: fix infoleak in rtnetlink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit 5f8e44741f9f216e33736ea4ec65ca9ac03036e6) The stack object “map” has a total size of 32 bytes. Its last 4 bytes are padding generated by compiler. These padding bytes are not initialized and sent out via “nla_put”. Signed-off-by: Kangjie Lu Signed-off-by: David S. Miller Bug: 28620102 Change-Id: Ica015c6a90d47e9188b1cd87a280ac6819dd9d09 --- net/core/rtnetlink.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 34ba7a08876d..9c6d15756e7a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1174,14 +1174,16 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) { - struct rtnl_link_ifmap map = { - .mem_start = dev->mem_start, - .mem_end = dev->mem_end, - .base_addr = dev->base_addr, - .irq = dev->irq, - .dma = dev->dma, - .port = dev->if_port, - }; + struct rtnl_link_ifmap map; + + memset(&map, 0, sizeof(map)); + map.mem_start = dev->mem_start; + map.mem_end = dev->mem_end; + map.base_addr = dev->base_addr; + map.irq = dev->irq; + map.dma = dev->dma; + map.port = dev->if_port; + if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) return -EMSGSIZE; From 14dfd8269605b9dc3dcdd10945398d3822f07953 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:07 -0400 Subject: [PATCH 0149/3220] UPSTREAM: ALSA: timer: Fix leak in SNDRV_TIMER_IOCTL_PARAMS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit cec8f96e49d9be372fdb0c3836dcf31ec71e457e) The stack object “tread” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized. Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Bug: 28980557 Change-Id: Ibda2d126f6d72fedf797a98796c3cde7bb03db76 --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index 8b06413e967e..8b6c3605d2e9 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1680,6 +1680,7 @@ static int snd_timer_user_params(struct file *file, if (tu->timeri->flags & SNDRV_TIMER_IFLG_EARLY_EVENT) { if (tu->tread) { struct snd_timer_tread tread; + memset(&tread, 0, sizeof(tread)); tread.event = SNDRV_TIMER_EVENT_EARLY; tread.tstamp.tv_sec = 0; tread.tstamp.tv_nsec = 0; From 29d02a76f5ff4af4eeb5f1ef3b09cd5b48fc2662 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:32 -0400 Subject: [PATCH 0150/3220] UPSTREAM: ALSA: timer: Fix leak in events via snd_timer_user_tinterrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit e4ec8cc8039a7063e24204299b462bd1383184a5) The stack object “r1” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized. 
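All of the leaks above share one pattern: the named fields of an on-stack struct are initialized, but compiler-inserted padding between or after them still holds stale stack bytes, so copying sizeof(struct) bytes to userspace ships those bytes out. A minimal illustrative sketch of the bug and of the memset() idiom the fixes apply (the struct is made up, not taken from these drivers):

#include <string.h>

/* the compiler may insert 4 bytes of padding after 'flag' so that
 * 'value' is 8-byte aligned; an initializer for the named fields
 * leaves that padding holding whatever was on the stack before
 */
struct example_info {
    int flag;        /* 4 bytes */
    long long value; /* alignment can force padding before this */
};

void fill_leaky(struct example_info *out)
{
    struct example_info info = { .flag = 1, .value = 42 }; /* padding uninitialized */
    memcpy(out, &info, sizeof(info)); /* copies the stale padding bytes too */
}

void fill_safe(struct example_info *out)
{
    struct example_info info;

    memset(&info, 0, sizeof(info)); /* zero the padding explicitly, as the patches do */
    info.flag = 1;
    info.value = 42;
    memcpy(out, &info, sizeof(info));
}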
Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Bug: 28980217 Change-Id: If2bba3c9ffb4e57190583b0bb2524d3b2514b2a3 --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index 8b6c3605d2e9..ded95e6f201f 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1215,6 +1215,7 @@ static void snd_timer_user_tinterrupt(struct snd_timer_instance *timeri, } if ((tu->filter & (1 << SNDRV_TIMER_EVENT_RESOLUTION)) && tu->last_resolution != resolution) { + memset(&r1, 0, sizeof(r1)); r1.event = SNDRV_TIMER_EVENT_RESOLUTION; r1.tstamp = tstamp; r1.val = resolution; From 3d1e8cf7ef81f13d4da5218174d12a40e8bf1116 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:20 -0400 Subject: [PATCH 0151/3220] UPSTREAM: ALSA: timer: Fix leak in events via snd_timer_user_ccallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit 9a47e9cff994f37f7f0dbd9ae23740d0f64f9fe6) The stack object “r1” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized. Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Bug: 28980217 Change-Id: I2e4c27352894b9f1f4c808b8db3ae5f9284faec1 --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index ded95e6f201f..1701703b8d93 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1181,6 +1181,7 @@ static void snd_timer_user_ccallback(struct snd_timer_instance *timeri, tu->tstamp = *tstamp; if ((tu->filter & (1 << event)) == 0 || !tu->tread) return; + memset(&r1, 0, sizeof(r1)); r1.event = event; r1.tstamp = *tstamp; r1.val = resolution; From 659740adf180779d0177a086656880bc7a43cfd1 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:32:16 -0400 Subject: [PATCH 0152/3220] UPSTREAM: USB: usbfs: fix potential infoleak in devio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit 681fef8380eb818c0b845fca5d2ab1dcbab114ee) The stack object “ci” has a total size of 8 bytes. Its last 3 bytes are padding bytes which are not initialized and leaked to userland via “copy_to_user”. Signed-off-by: Kangjie Lu Signed-off-by: Greg Kroah-Hartman Bug: 28619695 Change-Id: I170754d659d0891c075f85211b5e3970b114f097 --- drivers/usb/core/devio.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 38ae877c46e3..3ffb01ff6549 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1203,10 +1203,11 @@ static int proc_getdriver(struct usb_dev_state *ps, void __user *arg) static int proc_connectinfo(struct usb_dev_state *ps, void __user *arg) { - struct usbdevfs_connectinfo ci = { - .devnum = ps->dev->devnum, - .slow = ps->dev->speed == USB_SPEED_LOW - }; + struct usbdevfs_connectinfo ci; + + memset(&ci, 0, sizeof(ci)); + ci.devnum = ps->dev->devnum; + ci.slow = ps->dev->speed == USB_SPEED_LOW; if (copy_to_user(arg, &ci, sizeof(ci))) return -EFAULT; From 7dbe59704ab0dd0a07ae2c2b7b590bb2b8ccd6d9 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 23 Jun 2016 15:35:07 +0530 Subject: [PATCH 0153/3220] ANDROID: base-cfg: enable UID_CPUTIME Enabled UID_CPUTIME and dependent PROFILING config option. UID_CPUTIME (/proc/uid_cputime) interfaces provide amount of time a UID's processes spent executing in user-space and kernel-space. 
It is used by batterystats service. Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index bdd99f2becfc..6496bb3961a2 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -137,6 +137,7 @@ CONFIG_PPP_BSDCOMP=y CONFIG_PPP_DEFLATE=y CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y +CONFIG_PROFILING=y CONFIG_QUOTA=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y @@ -149,6 +150,7 @@ CONFIG_STAGING=y CONFIG_SWP_EMULATION=y CONFIG_SYNC=y CONFIG_TUN=y +CONFIG_UID_CPUTIME=y CONFIG_UNIX=y CONFIG_USB_GADGET=y CONFIG_USB_CONFIGFS=y From ef51a2254d6579560bf7ff8ebbeb9b3848fe10bf Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Thu, 7 Jan 2016 16:46:14 +0100 Subject: [PATCH 0154/3220] BACKPORT: PM / sleep: Go direct_complete if driver has no callbacks Backport notes: This resolves clk warnings in the designware i2c driver on HiKey seen during suspend/resume. Cherrypicked from: aa8e54b559479d0cb7eb632ba443b8cacd20cd4b If a suitable prepare callback cannot be found for a given device and its driver has no PM callbacks at all, assume that it can go direct to complete when the system goes to sleep. The reason for this is that there's lots of devices in a system that do no PM at all and there's no reason for them to prevent their ancestors to do direct_complete if they can support it. Change-Id: Ia773afb4b266f012336b99fc8cf87453839e078b Signed-off-by: Tomeu Vizoso Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki [jstultz: Backported to 4.4] Signed-off-by: John Stultz --- drivers/base/dd.c | 3 +++ drivers/base/power/main.c | 35 +++++++++++++++++++++++++++++++++++ drivers/base/power/power.h | 3 +++ include/linux/pm.h | 1 + 4 files changed, 42 insertions(+) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index a641cf3ccad6..9e425fbf83cb 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -205,6 +205,8 @@ static void driver_bound(struct device *dev) klist_add_tail(&dev->p->knode_driver, &dev->driver->p->klist_devices); + device_pm_check_callbacks(dev); + /* * Make sure the device is no longer in one of the deferred lists and * kick off retrying all pending devices @@ -697,6 +699,7 @@ static void __device_release_driver(struct device *dev) dev->pm_domain->dismiss(dev); klist_remove(&dev->p->knode_driver); + device_pm_check_callbacks(dev); if (dev->bus) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_UNBOUND_DRIVER, diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 5ea529165362..77379f3d136a 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -126,6 +126,7 @@ void device_pm_add(struct device *dev) { pr_debug("PM: Adding info for %s:%s\n", dev->bus ? 
dev->bus->name : "No Bus", dev_name(dev)); + device_pm_check_callbacks(dev); mutex_lock(&dpm_list_mtx); if (dev->parent && dev->parent->power.is_prepared) dev_warn(dev, "parent %s should not be sleeping\n", @@ -148,6 +149,7 @@ void device_pm_remove(struct device *dev) mutex_unlock(&dpm_list_mtx); device_wakeup_disable(dev); pm_runtime_remove(dev); + device_pm_check_callbacks(dev); } /** @@ -1574,6 +1576,11 @@ static int device_prepare(struct device *dev, pm_message_t state) dev->power.wakeup_path = device_may_wakeup(dev); + if (dev->power.no_pm_callbacks) { + ret = 1; /* Let device go direct_complete */ + goto unlock; + } + if (dev->pm_domain) { info = "preparing power domain "; callback = dev->pm_domain->ops.prepare; @@ -1596,6 +1603,7 @@ static int device_prepare(struct device *dev, pm_message_t state) if (callback) ret = callback(dev); +unlock: device_unlock(dev); if (ret < 0) { @@ -1724,3 +1732,30 @@ void dpm_for_each_dev(void *data, void (*fn)(struct device *, void *)) device_pm_unlock(); } EXPORT_SYMBOL_GPL(dpm_for_each_dev); + +static bool pm_ops_is_empty(const struct dev_pm_ops *ops) +{ + if (!ops) + return true; + + return !ops->prepare && + !ops->suspend && + !ops->suspend_late && + !ops->suspend_noirq && + !ops->resume_noirq && + !ops->resume_early && + !ops->resume && + !ops->complete; +} + +void device_pm_check_callbacks(struct device *dev) +{ + spin_lock_irq(&dev->power.lock); + dev->power.no_pm_callbacks = + (!dev->bus || pm_ops_is_empty(dev->bus->pm)) && + (!dev->class || pm_ops_is_empty(dev->class->pm)) && + (!dev->type || pm_ops_is_empty(dev->type->pm)) && + (!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) && + (!dev->driver || pm_ops_is_empty(dev->driver->pm)); + spin_unlock_irq(&dev->power.lock); +} diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h index 998fa6b23084..297beae64314 100644 --- a/drivers/base/power/power.h +++ b/drivers/base/power/power.h @@ -123,6 +123,7 @@ extern void device_pm_remove(struct device *); extern void device_pm_move_before(struct device *, struct device *); extern void device_pm_move_after(struct device *, struct device *); extern void device_pm_move_last(struct device *); +extern void device_pm_check_callbacks(struct device *dev); #else /* !CONFIG_PM_SLEEP */ @@ -141,6 +142,8 @@ static inline void device_pm_move_after(struct device *deva, struct device *devb) {} static inline void device_pm_move_last(struct device *dev) {} +static inline void device_pm_check_callbacks(struct device *dev) {} + #endif /* !CONFIG_PM_SLEEP */ static inline void device_pm_init(struct device *dev) diff --git a/include/linux/pm.h b/include/linux/pm.h index 528be6787796..6a5d654f4447 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -573,6 +573,7 @@ struct dev_pm_info { struct wakeup_source *wakeup; bool wakeup_path:1; bool syscore:1; + bool no_pm_callbacks:1; /* Owned by the PM core */ #else unsigned int should_wakeup:1; #endif From e639a4e74d6cd2c01d4f2afbbe0b6e6e233b42d2 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 27 Jun 2016 13:33:59 -0700 Subject: [PATCH 0155/3220] Revert "usb: gadget: prevent change of Host MAC address of 'usb0' interface" This reverts commit 265801537d110eb68d44a2f66015479908f635c0. 
Signed-off-by: Badhri Jagan Sridharan --- drivers/usb/gadget/function/u_ether.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index dd73dfe5dcab..74e9f5b5a45d 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -863,8 +863,6 @@ static int eth_stop(struct net_device *net) /*-------------------------------------------------------------------------*/ -static u8 host_ethaddr[ETH_ALEN]; - static int get_ether_addr(const char *str, u8 *dev_addr) { if (str) { @@ -895,17 +893,6 @@ static int get_ether_addr_str(u8 dev_addr[ETH_ALEN], char *str, int len) return 18; } -static int get_host_ether_addr(u8 *str, u8 *dev_addr) -{ - memcpy(dev_addr, str, ETH_ALEN); - if (is_valid_ether_addr(dev_addr)) - return 0; - - random_ether_addr(dev_addr); - memcpy(str, dev_addr, ETH_ALEN); - return 1; -} - static const struct net_device_ops eth_netdev_ops = { .ndo_open = eth_open, .ndo_stop = eth_stop, @@ -963,11 +950,9 @@ struct eth_dev *gether_setup_name(struct usb_gadget *g, if (get_ether_addr(dev_addr, net->dev_addr)) dev_warn(&g->dev, "using random %s ethernet address\n", "self"); - - if (get_host_ether_addr(host_ethaddr, dev->host_mac)) - dev_warn(&g->dev, "using random %s ethernet address\n", "host"); - else - dev_warn(&g->dev, "using previous %s ethernet address\n", "host"); + if (get_ether_addr(host_addr, dev->host_mac)) + dev_warn(&g->dev, + "using random %s ethernet address\n", "host"); if (ethaddr) memcpy(ethaddr, dev->host_mac, ETH_ALEN); From 074c908972a215f63bb5409f6a317020fb137d5b Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 22 Jun 2016 16:49:48 +0800 Subject: [PATCH 0156/3220] netfilter: xt_quota2: make quota2_log work well In upstream commit 7200135bc1e61f1437dc326ae2ef2f310c50b4eb (netfilter: kill ulog targets) http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7200135bc1e6 the ipt_ULOG target was removed; meanwhile, the IP_NF_TARGET_ULOG Kconfig symbol and the ipt_ULOG.h header file were removed too. As a result, QUOTA2_LOG cannot be enabled, and netd complains with this error: "Unable to open quota socket". So when the quota2 limit is reached, userspace will not be notified of this event. Since IP_NF_TARGET_ULOG was removed, we need not depend on "IP_NF_TARGET_ULOG=n", and for compatibility we add the ulog_packet_msg_t related definitions copied from "ipt_ULOG.h".
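For context, a rough sketch of how a userspace daemon such as netd could consume these messages. It assumes the ulog_packet_msg_t layout added below is copied into the daemon's source, and it assumes NETLINK_NFLOG (protocol 5) with multicast group 1 as the delivery channel; those socket details are assumptions of this sketch, not taken from the patch.

#include <linux/netlink.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef NETLINK_NFLOG
#define NETLINK_NFLOG 5 /* assumed protocol used by quota2 logging */
#endif

int read_quota2_event(void)
{
    struct sockaddr_nl addr;
    char buf[4096];
    ssize_t len;
    int sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_NFLOG);

    if (sock < 0)
        return -1;

    memset(&addr, 0, sizeof(addr));
    addr.nl_family = AF_NETLINK;
    addr.nl_groups = 1; /* assumed quota2 multicast group */
    if (bind(sock, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        close(sock);
        return -1;
    }

    len = recv(sock, buf, sizeof(buf), 0);
    if (len >= (ssize_t)NLMSG_HDRLEN) {
        struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
        ulog_packet_msg_t *pm = NLMSG_DATA(nlh);

        /* the prefix carries the name of the quota that was hit */
        printf("quota2 alert: %.32s\n", pm->prefix);
    }
    close(sock);
    return 0;
}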
Change-Id: I38132efaabf52bea75dfd736ce734a1b9690e87e Reported-by: Samboo Shen Signed-off-by: Liping Zhang --- net/netfilter/Kconfig | 1 - net/netfilter/xt_quota2.c | 21 ++++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 273f26b67653..1959548b1161 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1354,7 +1354,6 @@ config NETFILTER_XT_MATCH_QUOTA2 config NETFILTER_XT_MATCH_QUOTA2_LOG bool '"quota2" Netfilter LOG support' depends on NETFILTER_XT_MATCH_QUOTA2 - depends on IP_NF_TARGET_ULOG=n # not yes, not module, just no default n help This option allows `quota2' to log ONCE when a quota limit diff --git a/net/netfilter/xt_quota2.c b/net/netfilter/xt_quota2.c index 99592ae56d9b..834594aa0085 100644 --- a/net/netfilter/xt_quota2.c +++ b/net/netfilter/xt_quota2.c @@ -21,8 +21,27 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_quota2.h> + #ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG -#include <linux/netfilter_ipv4/ipt_ULOG.h> +/* For compatibility, these definitions are copied from the + * deprecated header file <linux/netfilter_ipv4/ipt_ULOG.h> */ +#define ULOG_MAC_LEN 80 +#define ULOG_PREFIX_LEN 32 + +/* Format of the ULOG packets passed through netlink */ +typedef struct ulog_packet_msg { + unsigned long mark; + long timestamp_sec; + long timestamp_usec; + unsigned int hook; + char indev_name[IFNAMSIZ]; + char outdev_name[IFNAMSIZ]; + size_t data_len; + char prefix[ULOG_PREFIX_LEN]; + unsigned char mac_len; + unsigned char mac[ULOG_MAC_LEN]; + unsigned char payload[0]; +} ulog_packet_msg_t; #endif /** From 286d25ba46346d5b7125b2155ce92a63a73f4d7b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 17 Mar 2016 14:20:51 -0700 Subject: [PATCH 0157/3220] BACKPORT: timer: convert timer_slack_ns from unsigned long to u64 This backports da8b44d5a9f8bf26da637b7336508ca534d6b319 from upstream. This patchset introduces a /proc/<tid>/timerslack_ns interface which would allow controlling processes to be able to set the timerslack value on other processes in order to save power by avoiding wakeups (Something Android currently does via out-of-tree patches). The first patch tries to fix the internal timer_slack_ns usage which was defined as a long, which limits the slack range to ~4 seconds on 32bit systems. It converts it to a u64, which provides the same basically unlimited slack (500 years) on both 32bit and 64bit machines. The second patch introduces the /proc/<tid>/timerslack_ns interface which allows the full 64bit slack range for a task to be read or set on both 32bit and 64bit machines. With these two patches, on a 32bit machine, after setting the slack on bash to 10 seconds: $ time sleep 1 real 0m10.747s user 0m0.001s sys 0m0.005s The first patch is a little ugly, since I had to chase the slack delta arguments through a number of functions converting them to u64s. Let me know if it makes sense to break that up more or not. Other than that things are fairly straightforward. This patch (of 2): The timer_slack_ns value in the task struct is currently an unsigned long. This means that for 32bit applications, the maximum slack is just over 4 seconds. However, on 64bit machines, it is much, much larger (~500 years). This disparity could make application development a little confusing, so this patch converts timer_slack_ns (as well as the default_slack) to a u64. This means both 32bit and 64bit systems have the same effective internal slack range.
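To make the ABI discussion that follows concrete, a minimal userspace sketch of the existing prctl interface (standard PR_GET_TIMERSLACK/PR_SET_TIMERSLACK usage; the 10ms value is illustrative):

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
    long slack = prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0);

    printf("current timer slack: %ld ns\n", slack);

    /* request 10ms of slack; on 32bit the prctl argument is still an
     * unsigned long, narrower than the new u64 internal range
     */
    if (prctl(PR_SET_TIMERSLACK, 10 * 1000 * 1000UL, 0, 0, 0) != 0)
        perror("PR_SET_TIMERSLACK");
    return 0;
}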
Now the existing ABI via PR_GET_TIMERSLACK and PR_SET_TIMERSLACK specify the interface as a unsigned long, so we preserve that limitation on 32bit systems, where SET_TIMERSLACK can only set the slack to a unsigned long value, and GET_TIMERSLACK will return ULONG_MAX if the slack is actually larger then what can be stored by an unsigned long. This patch also modifies hrtimer functions which specified the slack delta as a unsigned long. Signed-off-by: John Stultz Cc: Arjan van de Ven Cc: Thomas Gleixner Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Kees Cook Cc: Android Kernel Team Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 2 +- fs/select.c | 8 ++++---- include/linux/freezer.h | 2 +- include/linux/hrtimer.h | 12 +++++++----- include/linux/poll.h | 2 +- include/linux/sched.h | 4 ++-- kernel/sys.c | 5 ++++- kernel/time/hrtimer.c | 8 ++++---- kernel/time/timer.c | 4 ++-- 9 files changed, 26 insertions(+), 21 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4c999ce7e73a..3ab9c68b8bce 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1588,7 +1588,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, { int res = 0, eavail, timed_out = 0; unsigned long flags; - long slack = 0; + u64 slack = 0; wait_queue_t wait; ktime_t expires, *to = NULL; diff --git a/fs/select.c b/fs/select.c index 015547330e88..09e71a00a9b8 100644 --- a/fs/select.c +++ b/fs/select.c @@ -70,9 +70,9 @@ static long __estimate_accuracy(struct timespec *tv) return slack; } -long select_estimate_accuracy(struct timespec *tv) +u64 select_estimate_accuracy(struct timespec *tv) { - unsigned long ret; + u64 ret; struct timespec now; /* @@ -402,7 +402,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; - unsigned long slack = 0; + u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_end = 0; @@ -784,7 +784,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; - unsigned long slack = 0; + u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_end = 0; diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 6b7fd9cf5ea2..dd03e837ebb7 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -231,7 +231,7 @@ static inline long freezable_schedule_timeout_killable_unsafe(long timeout) * call this with locks held. 
*/ static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, - unsigned long delta, const enum hrtimer_mode mode) + u64 delta, const enum hrtimer_mode mode) { int __retval; freezer_do_not_count(); diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 76dd4f0da5ca..90d8be19505b 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -218,7 +218,7 @@ static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time timer->node.expires = ktime_add_safe(time, delta); } -static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta) +static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); @@ -355,7 +355,7 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } /* Basic timer operations: */ extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long range_ns, const enum hrtimer_mode mode); + u64 range_ns, const enum hrtimer_mode mode); /** * hrtimer_start - (re)start an hrtimer on the current CPU @@ -376,7 +376,7 @@ extern int hrtimer_try_to_cancel(struct hrtimer *timer); static inline void hrtimer_start_expires(struct hrtimer *timer, enum hrtimer_mode mode) { - unsigned long delta; + u64 delta; ktime_t soft, hard; soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); @@ -449,10 +449,12 @@ extern long hrtimer_nanosleep_restart(struct restart_block *restart_block); extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *tsk); -extern int schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, +extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode); extern int schedule_hrtimeout_range_clock(ktime_t *expires, - unsigned long delta, const enum hrtimer_mode mode, int clock); + u64 delta, + const enum hrtimer_mode mode, + int clock); extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ diff --git a/include/linux/poll.h b/include/linux/poll.h index c08386fb3e08..9fb4f40d9a26 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -96,7 +96,7 @@ extern void poll_initwait(struct poll_wqueues *pwq); extern void poll_freewait(struct poll_wqueues *pwq); extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack); -extern long select_estimate_accuracy(struct timespec *tv); +extern u64 select_estimate_accuracy(struct timespec *tv); static inline int poll_schedule(struct poll_wqueues *pwq, int state) diff --git a/include/linux/sched.h b/include/linux/sched.h index fa39434e3fdd..a8012d691abf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1767,8 +1767,8 @@ struct task_struct { * time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. 
*/ - unsigned long timer_slack_ns; - unsigned long default_timer_slack_ns; + u64 timer_slack_ns; + u64 default_timer_slack_ns; #ifdef CONFIG_KASAN unsigned int kasan_depth; diff --git a/kernel/sys.c b/kernel/sys.c index 11333311cf1c..0c00e563b153 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2319,7 +2319,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: - error = current->timer_slack_ns; + if (current->timer_slack_ns > ULONG_MAX) + error = ULONG_MAX; + else + error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: if (arg2 <= 0) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 435b8850dd80..c98439fdce81 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -963,7 +963,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest * relative (HRTIMER_MODE_REL) */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) + u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -1529,7 +1529,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; - unsigned long slack; + u64 slack; slack = current->timer_slack_ns; if (dl_task(current) || rt_task(current)) @@ -1705,7 +1705,7 @@ void __init hrtimers_init(void) * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME */ int __sched -schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, +schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, int clock) { struct hrtimer_sleeper t; @@ -1773,7 +1773,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, * * Returns 0 when the timer has expired otherwise -EINTR */ -int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, +int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, diff --git a/kernel/time/timer.c b/kernel/time/timer.c index bbc5d1114583..d1798fa0c743 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1698,10 +1698,10 @@ EXPORT_SYMBOL(msleep_interruptible); static void __sched do_usleep_range(unsigned long min, unsigned long max) { ktime_t kmin; - unsigned long delta; + u64 delta; kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = (max - min) * NSEC_PER_USEC; + delta = (u64)(max - min) * NSEC_PER_USEC; schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); } From eb976ed445e57de1e6469b9f3a1aaf33f862c1dd Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 17 Mar 2016 14:20:54 -0700 Subject: [PATCH 0158/3220] BACKPORT: proc: add /proc//timerslack_ns interface This backports 5de23d435e88996b1efe0e2cebe242074ce67c9e This patch provides a proc/PID/timerslack_ns interface which exposes a task's timerslack value in nanoseconds and allows it to be changed. This allows power/performance management software to set timer slack for other threads according to its policy for the thread (such as when the thread is designated foreground vs. background activity) If the value written is non-zero, slack is set to that value. Otherwise sets it to the default for the thread. 
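As an illustration, a power/performance management daemon could drive the new file roughly like this (a minimal sketch; the helper name and values are made up, the file semantics are as described in this patch):

#include <stdio.h>
#include <sys/types.h>

/* set (or reset, with 0) another task's timer slack; the caller needs
 * PTRACE_MODE_ATTACH-level permissions on the target task
 */
static int set_timerslack(pid_t pid, unsigned long long slack_ns)
{
    char path[64];
    FILE *f;

    snprintf(path, sizeof(path), "/proc/%d/timerslack_ns", (int)pid);
    f = fopen(path, "w");
    if (!f)
        return -1;
    fprintf(f, "%llu", slack_ns); /* 0 restores the task's default slack */
    return fclose(f);
}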
This interface checks that the calling task has permissions to use PTRACE_MODE_ATTACH_FSCREDS on the target task, so that we can ensure arbitrary apps do not change the timer slack for other apps. Signed-off-by: John Stultz Acked-by: Kees Cook Cc: Arjan van de Ven Cc: Thomas Gleixner Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Android Kernel Team Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 18 ++++++++ fs/proc/base.c | 67 ++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0bcba2823d9d..1c425191574c 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -43,6 +43,7 @@ Table of Contents 3.7 /proc/<pid>/task/<tid>/children - Information about task children 3.8 /proc/<pid>/fdinfo/<fd> - Information about opened file 3.9 /proc/<pid>/map_files - Information about memory mapped files + 3.10 /proc/<pid>/timerslack_ns - Task timerslack value 4 Configuring procfs 4.1 Mount options @@ -1856,6 +1857,23 @@ time one can open(2) mappings from the listings of two processes and comparing their inode numbers to figure out which anonymous memory areas are actually shared. +3.10 /proc/<pid>/timerslack_ns - Task timerslack value +--------------------------------------------------------- +This file provides the value of the task's timerslack value in nanoseconds. +This value specifies an amount of time that normal timers may be deferred +in order to coalesce timers and avoid unnecessary wakeups. + +This allows a task's interactivity vs. power consumption trade-off to be +adjusted. + +Writing 0 to the file will set the task's timerslack to the default value. + +Valid values are from 0 - ULLONG_MAX + +An application setting the value must have PTRACE_MODE_ATTACH_FSCREDS level +permissions on the task specified to change its timerslack_ns value.
+ + ------------------------------------------------------------------------------ Configuring procfs ------------------------------------------------------------------------------ diff --git a/fs/proc/base.c b/fs/proc/base.c index c21bb4beb0d7..bb51e4ceb15b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2243,6 +2243,72 @@ static const struct file_operations proc_timers_operations = { .release = seq_release_private, }; +static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + u64 slack_ns; + int err; + + err = kstrtoull_from_user(buf, count, 10, &slack_ns); + if (err < 0) + return err; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { + task_lock(p); + if (slack_ns == 0) + p->timer_slack_ns = p->default_timer_slack_ns; + else + p->timer_slack_ns = slack_ns; + task_unlock(p); + } else + count = -EPERM; + + put_task_struct(p); + + return count; +} + +static int timerslack_ns_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + int err = 0; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { + task_lock(p); + seq_printf(m, "%llu\n", p->timer_slack_ns); + task_unlock(p); + } else + err = -EPERM; + + put_task_struct(p); + + return err; +} + +static int timerslack_ns_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, timerslack_ns_show, inode); +} + +static const struct file_operations proc_pid_set_timerslack_ns_operations = { + .open = timerslack_ns_open, + .read = seq_read, + .write = timerslack_ns_write, + .llseek = seq_lseek, + .release = single_release, +}; + static int proc_pident_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { @@ -2820,6 +2886,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_CHECKPOINT_RESTORE REG("timers", S_IRUGO, proc_timers_operations), #endif + REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations), }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) From e6ec611adde326cd28f0692a263b31821f86b071 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 20 Jan 2016 15:00:04 -0800 Subject: [PATCH 0159/3220] BACKPORT: ptrace: use fsuid, fsgid, effective creds for fs access checks This patch backports 969624b (which backports caaee6234d0 upstream), from the v4.4-stable branch to the common/android-4.4 branch. This patch is needed to provide the PTRACE_MODE_ATTACH_FSCREDS definition which was used by the backported version of proc//timerslack_ns in change-id: Ie5799b9a3402a31f88cd46437dcda4a0e46415a7 commit caaee6234d05a58c5b4d05e7bf766131b810a657 upstream. By checking the effective credentials instead of the real UID / permitted capabilities, ensure that the calling process actually intended to use its credentials. To ensure that all ptrace checks use the correct caller credentials (e.g. in case out-of-tree code or newly added code omits the PTRACE_MODE_*CREDS flag), use two new flags and require one of them to be set. The problem was that when a privileged task had temporarily dropped its privileges, e.g. by calling setreuid(0, user_uid), with the intent to perform following syscalls with the credentials of a user, it still passed ptrace access checks that the user would not be able to pass. 
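A hedged userspace sketch of that dropped-privileges pattern (illustrative only; /proc/1/maps stands in for any ptrace-gated procfs file):

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/* running as a privileged task: switch the effective uid to an
 * unprivileged user, then read a procfs file gated by the ptrace
 * access check. Before this change the check looked at the real
 * uid and permitted capabilities, so the open could still succeed
 * even though the task was acting with the user's credentials;
 * with PTRACE_MODE_*_FSCREDS it is denied.
 */
int demo(uid_t user_uid)
{
    FILE *f;
    char line[256];

    if (setreuid(0, user_uid) != 0) /* euid = user, ruid stays 0 */
        return -1;

    f = fopen("/proc/1/maps", "r"); /* filesystem access -> fsuid applies */
    if (!f)
        return -1;
    if (fgets(line, sizeof(line), f))
        printf("%s", line);
    fclose(f);
    return 0;
}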
While an attacker should not be able to convince the privileged task to perform a ptrace() syscall, this is a problem because the ptrace access check is reused for things in procfs. In particular, the following somewhat interesting procfs entries only rely on ptrace access checks: /proc/$pid/stat - uses the check for determining whether pointers should be visible, useful for bypassing ASLR /proc/$pid/maps - also useful for bypassing ASLR /proc/$pid/cwd - useful for gaining access to restricted directories that contain files with lax permissions, e.g. in this scenario: lrwxrwxrwx root root /proc/13020/cwd -> /root/foobar drwx------ root root /root drwxr-xr-x root root /root/foobar -rw-r--r-- root root /root/foobar/secret Therefore, on a system where a root-owned mode 6755 binary changes its effective credentials as described and then dumps a user-specified file, this could be used by an attacker to reveal the memory layout of root's processes or reveal the contents of files he is not allowed to access (through /proc/$pid/cwd). [akpm@linux-foundation.org: fix warning] Signed-off-by: Jann Horn Acked-by: Kees Cook Cc: Casey Schaufler Cc: Oleg Nesterov Cc: Ingo Molnar Cc: James Morris Cc: "Serge E. Hallyn" Cc: Andy Shevchenko Cc: Andy Lutomirski Cc: Al Viro Cc: "Eric W. Biederman" Cc: Willy Tarreau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman [jstultz: Cherry-picked for common/android-4.4] Signed-off-by: John Stultz --- fs/proc/array.c | 2 +- fs/proc/base.c | 21 +++++++++++---------- fs/proc/namespaces.c | 4 ++-- include/linux/ptrace.h | 24 +++++++++++++++++++++++- kernel/events/core.c | 2 +- kernel/futex.c | 2 +- kernel/futex_compat.c | 2 +- kernel/kcmp.c | 4 ++-- kernel/ptrace.c | 39 +++++++++++++++++++++++++++++++-------- mm/process_vm_access.c | 2 +- security/commoncap.c | 7 ++++++- 11 files changed, 80 insertions(+), 29 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index d73291f5f0fc..b6c00ce0e29e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -395,7 +395,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, state = *get_task_state(task); vsize = eip = esp = 0; - permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT); + permitted = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS | PTRACE_MODE_NOAUDIT); mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); diff --git a/fs/proc/base.c b/fs/proc/base.c index bb51e4ceb15b..db221e53a1ac 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -403,7 +403,7 @@ static const struct file_operations proc_pid_cmdline_ops = { static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ); + struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (mm && !IS_ERR(mm)) { unsigned int nwords = 0; do { @@ -430,7 +430,8 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, wchan = get_wchan(task); - if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname)) + if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS) + && !lookup_symbol_name(wchan, symname)) seq_printf(m, "%s", symname); else seq_putc(m, '0'); @@ -444,7 +445,7 @@ static int lock_trace(struct task_struct *task) int err = mutex_lock_killable(&task->signal->cred_guard_mutex); if (err) return err; - if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) { + if (!ptrace_may_access(task, 
PTRACE_MODE_ATTACH_FSCREDS)) { mutex_unlock(&task->signal->cred_guard_mutex); return -EPERM; } @@ -697,7 +698,7 @@ static int proc_fd_access_allowed(struct inode *inode) */ task = get_proc_task(inode); if (task) { - allowed = ptrace_may_access(task, PTRACE_MODE_READ); + allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); put_task_struct(task); } return allowed; @@ -732,7 +733,7 @@ static bool has_pid_permissions(struct pid_namespace *pid, return true; if (in_group_p(pid->pid_gid)) return true; - return ptrace_may_access(task, PTRACE_MODE_READ); + return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); } @@ -809,7 +810,7 @@ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) struct mm_struct *mm = ERR_PTR(-ESRCH); if (task) { - mm = mm_access(task, mode); + mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); put_task_struct(task); if (!IS_ERR_OR_NULL(mm)) { @@ -1856,7 +1857,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) if (!task) goto out_notask; - mm = mm_access(task, PTRACE_MODE_READ); + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR_OR_NULL(mm)) goto out; @@ -2007,7 +2008,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, goto out; result = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; result = -ENOENT; @@ -2060,7 +2061,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) goto out; ret = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; ret = 0; @@ -2596,7 +2597,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh if (result) return result; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { result = -EACCES; goto out_unlock; } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index f6e8354b8cea..1b0ea4a5d89e 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -42,7 +42,7 @@ static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie) if (!task) return error; - if (ptrace_may_access(task, PTRACE_MODE_READ)) { + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { error = ns_get_path(&ns_path, task, ns_ops); if (!error) nd_jump_link(&ns_path); @@ -63,7 +63,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!task) return res; - if (ptrace_may_access(task, PTRACE_MODE_READ)) { + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) res = readlink_copy(buffer, buflen, name); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 061265f92876..504c98a278d4 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -57,7 +57,29 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 #define PTRACE_MODE_ATTACH 0x02 #define PTRACE_MODE_NOAUDIT 0x04 -/* Returns true on success, false on denial. 
*/ +#define PTRACE_MODE_FSCREDS 0x08 +#define PTRACE_MODE_REALCREDS 0x10 + +/* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ +#define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) +#define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) +#define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) +#define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) + +/** + * ptrace_may_access - check whether the caller is permitted to access + * a target task. + * @task: target task + * @mode: selects type of access and caller credentials + * + * Returns true on success, false on denial. + * + * One of the flags PTRACE_MODE_FSCREDS and PTRACE_MODE_REALCREDS must + * be set in @mode to specify whether the access was requested through + * a filesystem syscall (should use effective capabilities and fsuid + * of the caller) or through an explicit syscall such as + * process_vm_writev or ptrace (and should use the real credentials). + */ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); static inline int ptrace_reparented(struct task_struct *child) diff --git a/kernel/events/core.c b/kernel/events/core.c index 85bc810bece6..2f07e9c34f43 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3439,7 +3439,7 @@ find_lively_task_by_vpid(pid_t vpid) /* Reuse ptrace permission checks for now. */ err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) goto errout; return task; diff --git a/kernel/futex.c b/kernel/futex.c index 684d7549825a..495a1d06915b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2881,7 +2881,7 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->robust_list; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 55c8c9349cfe..4ae3232e7a28 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -155,7 +155,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->compat_robust_list; diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 0aa69ea1d8fd..3a47fa998fe0 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -122,8 +122,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, &task2->signal->cred_guard_mutex); if (ret) goto err; - if (!ptrace_may_access(task1, PTRACE_MODE_READ) || - !ptrace_may_access(task2, PTRACE_MODE_READ)) { + if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || + !ptrace_may_access(task2, PTRACE_MODE_READ_REALCREDS)) { ret = -EPERM; goto err_unlock; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index b760bae64cf1..3189e51db7e8 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -219,6 +219,14 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; + int dumpable = 0; + kuid_t caller_uid; + kgid_t caller_gid; + + if (!(mode & PTRACE_MODE_FSCREDS) == !(mode & PTRACE_MODE_REALCREDS)) { + WARN(1, "denying ptrace access check without PTRACE_MODE_*CREDS\n"); + return -EPERM; + } /* May we inspect the given task? 
* This check is used both for attaching with ptrace @@ -228,18 +236,33 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ - int dumpable = 0; + /* Don't let security modules deny introspection */ if (same_thread_group(task, current)) return 0; rcu_read_lock(); + if (mode & PTRACE_MODE_FSCREDS) { + caller_uid = cred->fsuid; + caller_gid = cred->fsgid; + } else { + /* + * Using the euid would make more sense here, but something + * in userland might rely on the old behavior, and this + * shouldn't be a security problem since + * PTRACE_MODE_REALCREDS implies that the caller explicitly + * used a syscall that requests access to another process + * (and not a filesystem syscall to procfs). + */ + caller_uid = cred->uid; + caller_gid = cred->gid; + } tcred = __task_cred(task); - if (uid_eq(cred->uid, tcred->euid) && - uid_eq(cred->uid, tcred->suid) && - uid_eq(cred->uid, tcred->uid) && - gid_eq(cred->gid, tcred->egid) && - gid_eq(cred->gid, tcred->sgid) && - gid_eq(cred->gid, tcred->gid)) + if (uid_eq(caller_uid, tcred->euid) && + uid_eq(caller_uid, tcred->suid) && + uid_eq(caller_uid, tcred->uid) && + gid_eq(caller_gid, tcred->egid) && + gid_eq(caller_gid, tcred->sgid) && + gid_eq(caller_gid, tcred->gid)) goto ok; if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; @@ -306,7 +329,7 @@ static int ptrace_attach(struct task_struct *task, long request, goto out; task_lock(task); - retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); task_unlock(task); if (retval) goto unlock_creds; diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index e88d071648c2..5d453e58ddbf 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -194,7 +194,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, goto free_proc_pages; } - mm = mm_access(task, PTRACE_MODE_ATTACH); + mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS); if (!mm || IS_ERR(mm)) { rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; /* diff --git a/security/commoncap.c b/security/commoncap.c index f035b84b3601..7fa251aea32f 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -148,12 +148,17 @@ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) { int ret = 0; const struct cred *cred, *child_cred; + const kernel_cap_t *caller_caps; rcu_read_lock(); cred = current_cred(); child_cred = __task_cred(child); + if (mode & PTRACE_MODE_FSCREDS) + caller_caps = &cred->cap_effective; + else + caller_caps = &cred->cap_permitted; if (cred->user_ns == child_cred->user_ns && - cap_issubset(child_cred->cap_permitted, cred->cap_permitted)) + cap_issubset(child_cred->cap_permitted, *caller_caps)) goto out; if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE)) goto out; From 50437c019f146eb138d819d558a3f562419573fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Fri, 8 Jul 2016 13:24:09 -0700 Subject: [PATCH 0160/3220] UPSTREAM: cdc_ncm: do not call usbnet_link_change from cdc_ncm_bind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit 4d06dd537f95683aba3651098ae288b7cbff8274) usbnet_link_change will call schedule_work and should be avoided if bind is failing. Otherwise we will end up with scheduled work referring to a netdev which has gone away. 
Instead of making the call conditional, we can just defer it to usbnet_probe, using the driver_info flag made for this purpose. Fixes: 8a34b0ae8778 ("usbnet: cdc_ncm: apply usbnet_link_change") Reported-by: Andrey Konovalov Suggested-by: Linus Torvalds Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller Change-Id: Id9a6d02bdd98bf495d26595cf2cc90e480746186 Bug: 28744625 --- drivers/net/usb/cdc_ncm.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index e8a1144c5a8b..93c88a2ce55c 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -941,8 +941,6 @@ EXPORT_SYMBOL_GPL(cdc_ncm_select_altsetting); static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) { - int ret; - /* MBIM backwards compatible function? */ if (cdc_ncm_select_altsetting(intf) != CDC_NCM_COMM_ALTSETTING_NCM) return -ENODEV; @@ -951,16 +949,7 @@ static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) * Additionally, generic NCM devices are assumed to accept arbitrarily * placed NDP. */ - ret = cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0); - - /* - * We should get an event when network connection is "connected" or - * "disconnected". Set network connection in "disconnected" state - * (carrier is OFF) during attach, so the IP network stack does not - * start IPv6 negotiation and more. - */ - usbnet_link_change(dev, 0, 0); - return ret; + return cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0); } static void cdc_ncm_align_tail(struct sk_buff *skb, size_t modulus, size_t remainder, size_t max) @@ -1543,7 +1532,8 @@ static void cdc_ncm_status(struct usbnet *dev, struct urb *urb) static const struct driver_info cdc_ncm_info = { .description = "CDC NCM", - .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET, + .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET + | FLAG_LINK_INTR, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, @@ -1556,7 +1546,7 @@ static const struct driver_info cdc_ncm_info = { static const struct driver_info wwan_info = { .description = "Mobile Broadband Network Device", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET - | FLAG_WWAN, + | FLAG_LINK_INTR | FLAG_WWAN, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, @@ -1569,7 +1559,7 @@ static const struct driver_info wwan_info = { static const struct driver_info wwan_noarp_info = { .description = "Mobile Broadband Network Device (NO ARP)", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET - | FLAG_WWAN | FLAG_NOARP, + | FLAG_LINK_INTR | FLAG_WWAN | FLAG_NOARP, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, From 5b2129ec25a33cda9662232afdfa6af7c9bc168a Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 8 Jul 2016 14:15:14 -0700 Subject: [PATCH 0161/3220] sdcardfs: Truncate packages_gid.list on overflow packages_gid.list was returning the wrong count. Use scnprintf instead, and inform the user when the list has been truncated.
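The semantics that matter here, as a small standalone userspace sketch (my_scnprintf re-implements the kernel helper for illustration, and the package name is made up): snprintf() returns the length the output would have needed, so accumulating its return value can push the write offset past the buffer, while scnprintf() returns only what was actually stored.

#include <stdarg.h>
#include <stdio.h>

/* userspace stand-in for the kernel's scnprintf() */
static int my_scnprintf(char *buf, size_t size, const char *fmt, ...)
{
	va_list args;
	int i;

	if (size == 0)
		return 0;
	va_start(args, fmt);
	i = vsnprintf(buf, size, fmt, args);
	va_end(args);
	return i < (int)size ? i : (int)size - 1;
}

int main(void)
{
	char page[16];
	int count;

	/* snprintf() reports the 22 bytes the line *would* need even
	 * though only 15 fit; page + count now points past the buffer
	 * and sizeof(page) - count underflows on the next entry. */
	count = snprintf(page, sizeof(page), "com.example.app 10099\n");
	printf("snprintf: count=%d, buffer=%zu\n", count, sizeof(page));

	/* scnprintf() semantics: count never exceeds the buffer */
	count = my_scnprintf(page, sizeof(page), "com.example.app 10099\n");
	printf("scnprintf: count=%d\n", count);
	return 0;
}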
Bug: 30013843 Change-Id: Ida2b2ef7cd86dd87300bfb4c2cdb6bfe2ee1650d Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/packagelist.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 10f0d6be718b..9c3340528eee 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -335,12 +335,19 @@ static ssize_t packages_attr_show(struct config_item *item, struct hashtable_entry *hash_cur; struct hlist_node *h_t; int i; - int count = 0; - mutex_lock(&pkgl_data_all->hashtable_lock); - hash_for_each_safe(pkgl_data_all->package_to_appid, i, h_t, hash_cur, hlist) - count += snprintf(page + count, PAGE_SIZE - count, "%s %d\n", (char *)hash_cur->key, hash_cur->value); - mutex_unlock(&pkgl_data_all->hashtable_lock); + int count = 0, written = 0; + char errormsg[] = "\n"; + mutex_lock(&pkgl_data_all->hashtable_lock); + hash_for_each_safe(pkgl_data_all->package_to_appid, i, h_t, hash_cur, hlist) { + written = scnprintf(page + count, PAGE_SIZE - sizeof(errormsg) - count, "%s %d\n", (char *)hash_cur->key, hash_cur->value); + if (count + written == PAGE_SIZE - sizeof(errormsg)) { + count += scnprintf(page + count, PAGE_SIZE - count, errormsg); + break; + } + count += written; + } + mutex_unlock(&pkgl_data_all->hashtable_lock); return count; } From 6b72c6ef4442d1e4cbae32839ee69a1b42704d86 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Sun, 12 Jun 2016 17:37:52 -0700 Subject: [PATCH 0162/3220] android-recommended.cfg: enable fstack-protector-strong If compiler has stack protector support, set CONFIG_CC_STACKPROTECTOR_STRONG. Bug: 28967314 Change-Id: I588c2d544250e9e4b5082b43c237b8f85b7313ca Signed-off-by: Jeff Vander Stoep --- android/configs/android-recommended.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index c3222a77ba24..2f1ef077aa9e 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -11,6 +11,7 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_CC_STACKPROTECTOR_STRONG=y CONFIG_COMPACTION=y CONFIG_DEBUG_RODATA=y CONFIG_DM_UEVENT=y From 96f61ff98aab275e0ac7dae4a42d3357e3d6e4a2 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 1 Jun 2016 10:28:49 -0700 Subject: [PATCH 0163/3220] ANDROID: sdcardfs: fix itnull.cocci warnings List_for_each_entry has the property that the first argument is always bound to a real list element, never NULL, so testing dentry is not needed. 
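The coccinelle finding follows directly from the macro's definition (condensed from include/linux/list.h): the cursor is computed from the node with container_of() and the loop exits when the cursor's member wraps back around to the list head, so within the body the cursor always refers to a real element and a NULL test can never fire.

#define list_for_each_entry(pos, head, member)				\
	for (pos = list_first_entry(head, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = list_next_entry(pos, member))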
Generated by: scripts/coccinelle/iterators/itnull.cocci Cc: Daniel Rosenberg Signed-off-by: Julia Lawall Signed-off-by: Fengguang Wu Signed-off-by: Guenter Roeck --- fs/sdcardfs/derived_perm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 128b3e56851f..41e0e11b3c35 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -112,7 +112,7 @@ void get_derived_permission(struct dentry *parent, struct dentry *dentry) void get_derive_permissions_recursive(struct dentry *parent) { struct dentry *dentry; list_for_each_entry(dentry, &parent->d_subdirs, d_child) { - if (dentry && dentry->d_inode) { + if (dentry->d_inode) { mutex_lock(&dentry->d_inode->i_mutex); get_derived_permission(parent, dentry); fix_derived_permission(dentry->d_inode); From 9cb6c482ae8032970efe10f6f15d3bbee7795fb4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Mar 2016 18:02:52 +0100 Subject: [PATCH 0164/3220] UPSTREAM: netfilter: x_tables: fix unconditional helper (cherry pick from commit 54d83fc74aa9ec72794373cb47432c5f7fb1a309) Ben Hawkes says: In the mark_source_chains function (net/ipv4/netfilter/ip_tables.c) it is possible for a user-supplied ipt_entry structure to have a large next_offset field. This field is not bounds checked prior to writing a counter value at the supplied offset. Problem is that mark_source_chains should not have been called -- the rule doesn't have a next entry, so it's supposed to return an absolute verdict of either ACCEPT or DROP. However, the function conditional() doesn't work as the name implies. It only checks that the rule is using wildcard address matching. However, an unconditional rule must also not be using any matches (no -m args). The underflow validator only checked the addresses, therefore passing the 'unconditional absolute verdict' test, while mark_source_chains also tested for presence of matches, and thus proceeded to the next (non-existent) rule. Unify this so that all the callers have the same idea of 'unconditional rule'. Reported-by: Ben Hawkes Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Change-Id: I47ec0713ac563ac244200c7b2c54f09a91aceabc Bug: 28940694 --- net/ipv4/netfilter/arp_tables.c | 18 +++++++++--------- net/ipv4/netfilter/ip_tables.c | 23 +++++++++++------------ net/ipv6/netfilter/ip6_tables.c | 23 +++++++++++------------ 3 files changed, 31 insertions(+), 33 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 11dccba474b7..e5697e649ce1 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -359,11 +359,12 @@ unsigned int arpt_do_table(struct sk_buff *skb, } /* All zeroes == unconditional rule. */ -static inline bool unconditional(const struct arpt_arp *arp) +static inline bool unconditional(const struct arpt_entry *e) { static const struct arpt_arp uncond; - return memcmp(arp, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct arpt_entry) && + memcmp(&e->arp, &uncond, sizeof(uncond)) == 0; } /* Figures out from what hook each rule can be called: returns 0 if
*/ - if ((e->target_offset == sizeof(struct arpt_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && unconditional(&e->arp)) || - visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -557,7 +557,7 @@ static bool check_underflow(const struct arpt_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->arp)) + if (!unconditional(e)) return false; t = arpt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -598,9 +598,9 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index b99affad6ba1..2bf6b3f1a00c 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -168,11 +168,12 @@ get_entry(const void *base, unsigned int offset) /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ -static inline bool unconditional(const struct ipt_ip *ip) +static inline bool unconditional(const struct ipt_entry *e) { static const struct ipt_ip uncond; - return memcmp(ip, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct ipt_entry) && + memcmp(&e->ip, &uncond, sizeof(uncond)) == 0; #undef FWINV } @@ -229,11 +230,10 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, } else if (s == e) { (*rulenum)++; - if (s->target_offset == sizeof(struct ipt_entry) && + if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && - t->verdict < 0 && - unconditional(&s->ip)) { + t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP_TRACE_COMMENT_POLICY] @@ -476,11 +476,10 @@ mark_source_chains(const struct xt_table_info *newinfo, e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. 
*/ - if ((e->target_offset == sizeof(struct ipt_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && unconditional(&e->ip)) || - visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -721,7 +720,7 @@ static bool check_underflow(const struct ipt_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->ip)) + if (!unconditional(e)) return false; t = ipt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -763,9 +762,9 @@ check_entry_size_and_hooks(struct ipt_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 99425cf2819b..37e945cfa5da 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -198,11 +198,12 @@ get_entry(const void *base, unsigned int offset) /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ -static inline bool unconditional(const struct ip6t_ip6 *ipv6) +static inline bool unconditional(const struct ip6t_entry *e) { static const struct ip6t_ip6 uncond; - return memcmp(ipv6, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct ip6t_entry) && + memcmp(&e->ipv6, &uncond, sizeof(uncond)) == 0; } static inline const struct xt_entry_target * @@ -258,11 +259,10 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, } else if (s == e) { (*rulenum)++; - if (s->target_offset == sizeof(struct ip6t_entry) && + if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && - t->verdict < 0 && - unconditional(&s->ipv6)) { + t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP6_TRACE_COMMENT_POLICY] @@ -488,11 +488,10 @@ mark_source_chains(const struct xt_table_info *newinfo, e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. 
*/ - if ((e->target_offset == sizeof(struct ip6t_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && - unconditional(&e->ipv6)) || visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -733,7 +732,7 @@ static bool check_underflow(const struct ip6t_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->ipv6)) + if (!unconditional(e)) return false; t = ip6t_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -775,9 +774,9 @@ check_entry_size_and_hooks(struct ip6t_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; From f8a27f34070e29bdbbe88f5330f77d8ac5d1c2fb Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 23 Mar 2016 16:38:55 +0100 Subject: [PATCH 0165/3220] UPSTREAM: ppp: take reference on channels netns (cherry pick from commit 1f461dcdd296eecedaffffc6bae2bfa90bd7eb89) Let channels hold a reference on their network namespace. Some channel types, like ppp_async and ppp_synctty, can have their userspace controller running in a different namespace. Therefore they can't rely on them to preclude their netns from being removed from under them. ================================================================== BUG: KASAN: use-after-free in ppp_unregister_channel+0x372/0x3a0 at addr ffff880064e217e0 Read of size 8 by task syz-executor/11581 ============================================================================= BUG net_namespace (Not tainted): kasan: bad access detected ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in copy_net_ns+0x6b/0x1a0 age=92569 cpu=3 pid=6906 [< none >] ___slab_alloc+0x4c7/0x500 kernel/mm/slub.c:2440 [< none >] __slab_alloc+0x4c/0x90 kernel/mm/slub.c:2469 [< inline >] slab_alloc_node kernel/mm/slub.c:2532 [< inline >] slab_alloc kernel/mm/slub.c:2574 [< none >] kmem_cache_alloc+0x23a/0x2b0 kernel/mm/slub.c:2579 [< inline >] kmem_cache_zalloc kernel/include/linux/slab.h:597 [< inline >] net_alloc kernel/net/core/net_namespace.c:325 [< none >] copy_net_ns+0x6b/0x1a0 kernel/net/core/net_namespace.c:360 [< none >] create_new_namespaces+0x2f6/0x610 kernel/kernel/nsproxy.c:95 [< none >] copy_namespaces+0x297/0x320 kernel/kernel/nsproxy.c:150 [< none >] copy_process.part.35+0x1bf4/0x5760 kernel/kernel/fork.c:1451 [< inline >] copy_process kernel/kernel/fork.c:1274 [< none >] _do_fork+0x1bc/0xcb0 kernel/kernel/fork.c:1723 [< inline >] SYSC_clone kernel/kernel/fork.c:1832 [< none >] SyS_clone+0x37/0x50 kernel/kernel/fork.c:1826 [< none >] entry_SYSCALL_64_fastpath+0x16/0x7a kernel/arch/x86/entry/entry_64.S:185 INFO: Freed in net_drop_ns+0x67/0x80 age=575 cpu=2 pid=2631 [< none >] __slab_free+0x1fc/0x320 kernel/mm/slub.c:2650 [< inline >] slab_free kernel/mm/slub.c:2805 [< none >] kmem_cache_free+0x2a0/0x330 kernel/mm/slub.c:2814 [< inline >] net_free kernel/net/core/net_namespace.c:341 [< none >] net_drop_ns+0x67/0x80 kernel/net/core/net_namespace.c:348 [< none >] cleanup_net+0x4e5/0x600 kernel/net/core/net_namespace.c:448 [< none >] process_one_work+0x794/0x1440 
kernel/kernel/workqueue.c:2036 [< none >] worker_thread+0xdb/0xfc0 kernel/kernel/workqueue.c:2170 [< none >] kthread+0x23f/0x2d0 kernel/drivers/block/aoe/aoecmd.c:1303 [< none >] ret_from_fork+0x3f/0x70 kernel/arch/x86/entry/entry_64.S:468 INFO: Slab 0xffffea0001938800 objects=3 used=0 fp=0xffff880064e20000 flags=0x5fffc0000004080 INFO: Object 0xffff880064e20000 @offset=0 fp=0xffff880064e24200 CPU: 1 PID: 11581 Comm: syz-executor Tainted: G B 4.4.0+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014 00000000ffffffff ffff8800662c7790 ffffffff8292049d ffff88003e36a300 ffff880064e20000 ffff880064e20000 ffff8800662c77c0 ffffffff816f2054 ffff88003e36a300 ffffea0001938800 ffff880064e20000 0000000000000000 Call Trace: [< inline >] __dump_stack kernel/lib/dump_stack.c:15 [] dump_stack+0x6f/0xa2 kernel/lib/dump_stack.c:50 [] print_trailer+0xf4/0x150 kernel/mm/slub.c:654 [] object_err+0x2f/0x40 kernel/mm/slub.c:661 [< inline >] print_address_description kernel/mm/kasan/report.c:138 [] kasan_report_error+0x215/0x530 kernel/mm/kasan/report.c:236 [< inline >] kasan_report kernel/mm/kasan/report.c:259 [] __asan_report_load8_noabort+0x3e/0x40 kernel/mm/kasan/report.c:280 [< inline >] ? ppp_pernet kernel/include/linux/compiler.h:218 [] ? ppp_unregister_channel+0x372/0x3a0 kernel/drivers/net/ppp/ppp_generic.c:2392 [< inline >] ppp_pernet kernel/include/linux/compiler.h:218 [] ppp_unregister_channel+0x372/0x3a0 kernel/drivers/net/ppp/ppp_generic.c:2392 [< inline >] ? ppp_pernet kernel/drivers/net/ppp/ppp_generic.c:293 [] ? ppp_unregister_channel+0xe6/0x3a0 kernel/drivers/net/ppp/ppp_generic.c:2392 [] ppp_asynctty_close+0xa3/0x130 kernel/drivers/net/ppp/ppp_async.c:241 [] ? async_lcp_peek+0x5b0/0x5b0 kernel/drivers/net/ppp/ppp_async.c:1000 [] tty_ldisc_close.isra.1+0x99/0xe0 kernel/drivers/tty/tty_ldisc.c:478 [] tty_ldisc_kill+0x40/0x170 kernel/drivers/tty/tty_ldisc.c:744 [] tty_ldisc_release+0x1b3/0x260 kernel/drivers/tty/tty_ldisc.c:772 [] tty_release+0xac1/0x13e0 kernel/drivers/tty/tty_io.c:1901 [] ? release_tty+0x320/0x320 kernel/drivers/tty/tty_io.c:1688 [] __fput+0x236/0x780 kernel/fs/file_table.c:208 [] ____fput+0x15/0x20 kernel/fs/file_table.c:244 [] task_work_run+0x16b/0x200 kernel/kernel/task_work.c:115 [< inline >] exit_task_work kernel/include/linux/task_work.h:21 [] do_exit+0x8b5/0x2c60 kernel/kernel/exit.c:750 [] ? debug_check_no_locks_freed+0x290/0x290 kernel/kernel/locking/lockdep.c:4123 [] ? mm_update_next_owner+0x6f0/0x6f0 kernel/kernel/exit.c:357 [] ? __dequeue_signal+0x136/0x470 kernel/kernel/signal.c:550 [] ? recalc_sigpending_tsk+0x13b/0x180 kernel/kernel/signal.c:145 [] do_group_exit+0x108/0x330 kernel/kernel/exit.c:880 [] get_signal+0x5e4/0x14f0 kernel/kernel/signal.c:2307 [< inline >] ? kretprobe_table_lock kernel/kernel/kprobes.c:1113 [] ? kprobe_flush_task+0xb5/0x450 kernel/kernel/kprobes.c:1158 [] do_signal+0x83/0x1c90 kernel/arch/x86/kernel/signal.c:712 [] ? recycle_rp_inst+0x310/0x310 kernel/include/linux/list.h:655 [] ? setup_sigcontext+0x780/0x780 kernel/arch/x86/kernel/signal.c:165 [] ? finish_task_switch+0x424/0x5f0 kernel/kernel/sched/core.c:2692 [< inline >] ? finish_lock_switch kernel/kernel/sched/sched.h:1099 [] ? finish_task_switch+0x120/0x5f0 kernel/kernel/sched/core.c:2678 [< inline >] ? context_switch kernel/kernel/sched/core.c:2807 [] ? 
__schedule+0x919/0x1bd0 kernel/kernel/sched/core.c:3283 [] exit_to_usermode_loop+0xf1/0x1a0 kernel/arch/x86/entry/common.c:247 [< inline >] prepare_exit_to_usermode kernel/arch/x86/entry/common.c:282 [] syscall_return_slowpath+0x19f/0x210 kernel/arch/x86/entry/common.c:344 [] int_ret_from_sys_call+0x25/0x9f kernel/arch/x86/entry/entry_64.S:281 Memory state around the buggy address: ffff880064e21680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff880064e21700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff880064e21780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff880064e21800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff880064e21880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Fixes: 273ec51dd7ce ("net: ppp_generic - introduce net-namespace functionality v2") Reported-by: Baozeng Ding Signed-off-by: Guillaume Nault Reviewed-by: Cyrill Gorcunov Signed-off-by: David S. Miller Change-Id: Iee0015eca5bd181954bb4896a3720f7549c5ed0b Bug: 28979703 --- drivers/net/ppp/ppp_generic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 9a863c6a6a33..afe6fddd2baa 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -2290,7 +2290,7 @@ int ppp_register_net_channel(struct net *net, struct ppp_channel *chan) pch->ppp = NULL; pch->chan = chan; - pch->chan_net = net; + pch->chan_net = get_net(net); chan->ppp = pch; init_ppp_file(&pch->file, CHANNEL); pch->file.hdrlen = chan->hdrlen; @@ -2387,6 +2387,8 @@ ppp_unregister_channel(struct ppp_channel *chan) spin_lock_bh(&pn->all_channels_lock); list_del(&pch->list); spin_unlock_bh(&pn->all_channels_lock); + put_net(pch->chan_net); + pch->chan_net = NULL; pch->file.dead = 1; wake_up_interruptible(&pch->file.rwait); From 1d2d5ceaf5ae9d41656a084d394a1e38f1a80d3c Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 23 Feb 2016 11:03:12 +0000 Subject: [PATCH 0166/3220] UPSTREAM: KEYS: Fix ASN.1 indefinite length object parsing (cherry pick from commit 23c8a812dc3c621009e4f0e5342aa4e2ede1ceaa) This fixes CVE-2016-0758. In the ASN.1 decoder, when the length field of an ASN.1 value is extracted, it isn't validated against the remaining amount of data before being added to the cursor. With a sufficiently large size indicated, the check: datalen - dp < 2 may then fail due to integer overflow. Fix this by checking the length indicated against the amount of remaining data in both places a definite length is determined. Whilst we're at it, make the following changes: (1) Check the maximum size of extended length does not exceed the capacity of the variable it's being stored in (len) rather than the type that variable is assumed to be (size_t). (2) Compare the EOC tag to the symbolic constant ASN1_EOC rather than the integer 0. (3) To reduce confusion, move the initialisation of len outside of: for (len = 0; n > 0; n--) { since it doesn't have anything to do with the loop counter n. 
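A standalone sketch of the arithmetic the new check_length test closes off (values invented; assumes a 64-bit size_t, matching the decoder's use of size_t for dp and datalen): once a definite length larger than the remaining data is added to the cursor, the later unsigned comparisons wrap instead of failing.

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	size_t datalen = 64, dp = 10;
	size_t len = 0xffffffffffffff00UL;	/* attacker-supplied length */

	/* old flow: advance first, validate later */
	dp += len;				/* wraps to a huge bogus index */
	printf("datalen - dp = %zu, '< 2' check %s\n", datalen - dp,
	       datalen - dp < 2 ? "trips" : "passes (the bug)");

	/* fixed flow: validate against the remaining bytes, then advance */
	dp = 10;
	if (len > datalen - dp)
		printf("data_overrun_error\n");
	return 0;
}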
Signed-off-by: David Howells Reviewed-by: Mimi Zohar Acked-by: David Woodhouse Acked-by: Peter Jones Change-Id: If760bc3b8ab0e59fefc24fa687514324348fb8e8 Bug: 29814470 --- lib/asn1_decoder.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c index 2b3f46c049d4..554522934c44 100644 --- a/lib/asn1_decoder.c +++ b/lib/asn1_decoder.c @@ -74,7 +74,7 @@ next_tag: /* Extract a tag from the data */ tag = data[dp++]; - if (tag == 0) { + if (tag == ASN1_EOC) { /* It appears to be an EOC. */ if (data[dp++] != 0) goto invalid_eoc; @@ -96,10 +96,8 @@ next_tag: /* Extract the length */ len = data[dp++]; - if (len <= 0x7f) { - dp += len; - goto next_tag; - } + if (len <= 0x7f) + goto check_length; if (unlikely(len == ASN1_INDEFINITE_LENGTH)) { /* Indefinite length */ @@ -110,14 +108,18 @@ next_tag: } n = len - 0x80; - if (unlikely(n > sizeof(size_t) - 1)) + if (unlikely(n > sizeof(len) - 1)) goto length_too_long; if (unlikely(n > datalen - dp)) goto data_overrun_error; - for (len = 0; n > 0; n--) { + len = 0; + for (; n > 0; n--) { len <<= 8; len |= data[dp++]; } +check_length: + if (len > datalen - dp) + goto data_overrun_error; dp += len; goto next_tag; From 2e2e345376a1e0320956512d9ccfa8d9dac60d27 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 16 Jun 2016 15:48:57 +0100 Subject: [PATCH 0167/3220] UPSTREAM: KEYS: potential uninitialized variable (cherry picked from commit 38327424b40bcebe2de92d07312c89360ac9229a) If __key_link_begin() failed then "edit" would be uninitialized. I've added a check to fix that. This allows a random user to crash the kernel, though it's quite difficult to achieve. There are three ways it can be done as the user would have to cause an error to occur in __key_link(): (1) Cause the kernel to run out of memory. In practice, this is difficult to achieve without ENOMEM cropping up elsewhere and aborting the attempt. (2) Revoke the destination keyring between the keyring ID being looked up and it being tested for revocation. In practice, this is difficult to time correctly because the KEYCTL_REJECT function can only be used from the request-key upcall process. Further, users can only make use of what's in /sbin/request-key.conf, though this does include a rejection debugging test - which means that the destination keyring has to be the caller's session keyring in practice. (3) Have just enough key quota available to create a key, a new session keyring for the upcall and a link in the session keyring, but not then sufficient quota to create a link in the nominated destination keyring so that it fails with EDQUOT. The bug can be triggered using option (3) above using something like the following: echo 80 >/proc/sys/kernel/keys/root_maxbytes keyctl request2 user debug:fred negate @t The above sets the quota to something much lower (80) to make the bug easier to trigger, but this is dependent on the system. Note also that the name of the keyring created contains a random number that may be between 1 and 10 characters in size, so may throw the test off by changing the amount of quota used. Assuming the failure occurs, something like the following will be seen: kfree_debugcheck: out of range ptr 6b6b6b6b6b6b6b68h ------------[ cut here ]------------ kernel BUG at ../mm/slab.c:2821! ...
RIP: 0010:[] kfree_debugcheck+0x20/0x25 RSP: 0018:ffff8804014a7de8 EFLAGS: 00010092 RAX: 0000000000000034 RBX: 6b6b6b6b6b6b6b68 RCX: 0000000000000000 RDX: 0000000000040001 RSI: 00000000000000f6 RDI: 0000000000000300 RBP: ffff8804014a7df0 R08: 0000000000000001 R09: 0000000000000000 R10: ffff8804014a7e68 R11: 0000000000000054 R12: 0000000000000202 R13: ffffffff81318a66 R14: 0000000000000000 R15: 0000000000000001 ... Call Trace: kfree+0xde/0x1bc assoc_array_cancel_edit+0x1f/0x36 __key_link_end+0x55/0x63 key_reject_and_link+0x124/0x155 keyctl_reject_key+0xb6/0xe0 keyctl_negate_key+0x10/0x12 SyS_keyctl+0x9f/0xe7 do_syscall_64+0x63/0x13a entry_SYSCALL64_slow_path+0x25/0x25 Fixes: f70e2e06196a ('KEYS: Do preallocation for __key_link()') Signed-off-by: Dan Carpenter Signed-off-by: David Howells cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Change-Id: Ia9616cce142a616beea0ef20bde49129939d2d2d Bug: 29823941 --- security/keys/key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/keys/key.c b/security/keys/key.c index ab7997ded725..534808915371 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -578,7 +578,7 @@ int key_reject_and_link(struct key *key, mutex_unlock(&key_construction_mutex); - if (keyring) + if (keyring && link_ret == 0) __key_link_end(keyring, &key->index_key, edit); /* wake up anyone waiting for a key to be constructed */ From 44bd40ae321b27fe6b5e107d207a79fe1cbaa507 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Mar 2016 18:02:49 +0100 Subject: [PATCH 0168/3220] UPSTREAM: netfilter: x_tables: validate e->target_offset early (cherry pick from commit bdf533de6968e9686df777dc178486f600c6e617) We should check that e->target_offset is sane before mark_source_chains gets called since it will fetch the target entry for loop detection. 
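In miniature (struct names simplified; the real ones are arpt/ipt/ip6t_entry and xt_entry_target), the hazard is plain pointer arithmetic on a user-supplied offset, which the relocated check_entry() now rejects before mark_source_chains() can dereference the result:

struct entry {				/* simplified ipt_entry */
	unsigned short target_offset;	/* user-controlled */
	unsigned short next_offset;	/* user-controlled */
};

struct entry_target {			/* simplified xt_entry_target */
	unsigned short size;
	int verdict;
};

/* how the target is located: nothing stops this pointing anywhere */
static struct entry_target *get_target(struct entry *e)
{
	return (struct entry_target *)((char *)e + e->target_offset);
}

/* essence of the fix: the offset must land inside the entry itself */
static int check_entry(const struct entry *e)
{
	if (e->target_offset + sizeof(struct entry_target) > e->next_offset)
		return -1;	/* kernel returns -EINVAL */
	return 0;
}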
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Change-Id: Ic2dbc31c9525d698e94d4d8875886acf3524abbd Bug: 29637687 --- net/ipv4/netfilter/arp_tables.c | 17 ++++++++--------- net/ipv4/netfilter/ip_tables.c | 17 ++++++++--------- net/ipv6/netfilter/ip6_tables.c | 17 ++++++++--------- 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e5697e649ce1..449e2b56b3b2 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -474,14 +474,12 @@ next: return 1; } -static inline int check_entry(const struct arpt_entry *e, const char *name) +static inline int check_entry(const struct arpt_entry *e) { const struct xt_entry_target *t; - if (!arp_checkentry(&e->arp)) { - duprintf("arp_tables: arp check failed %p %s.\n", e, name); + if (!arp_checkentry(&e->arp)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) return -EINVAL; @@ -522,10 +520,6 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) struct xt_target *target; int ret; - ret = check_entry(e, name); - if (ret) - return ret; - e->counters.pcnt = xt_percpu_counter_alloc(); if (IS_ERR_VALUE(e->counters.pcnt)) return -ENOMEM; @@ -576,6 +570,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { @@ -590,6 +585,10 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_ARP_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -1246,7 +1245,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct arpt_entry *)e, name); + ret = check_entry((struct arpt_entry *)e); if (ret) return ret; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 2bf6b3f1a00c..c3ab1ab12ee6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -568,14 +568,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net) } static int -check_entry(const struct ipt_entry *e, const char *name) +check_entry(const struct ipt_entry *e) { const struct xt_entry_target *t; - if (!ip_checkentry(&e->ip)) { - duprintf("ip check failed %p %s.\n", e, name); + if (!ip_checkentry(&e->ip)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) @@ -665,10 +663,6 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; - ret = check_entry(e, name); - if (ret) - return ret; - e->counters.pcnt = xt_percpu_counter_alloc(); if (IS_ERR_VALUE(e->counters.pcnt)) return -ENOMEM; @@ -740,6 +734,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { @@ -754,6 +749,10 @@ check_entry_size_and_hooks(struct ipt_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -1505,7 +1504,7 @@ 
check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct ipt_entry *)e, name); + ret = check_entry((struct ipt_entry *)e); if (ret) return ret; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 37e945cfa5da..f532cb6d9c3d 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -580,14 +580,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net) } static int -check_entry(const struct ip6t_entry *e, const char *name) +check_entry(const struct ip6t_entry *e) { const struct xt_entry_target *t; - if (!ip6_checkentry(&e->ipv6)) { - duprintf("ip_tables: ip check failed %p %s.\n", e, name); + if (!ip6_checkentry(&e->ipv6)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) @@ -678,10 +676,6 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; - ret = check_entry(e, name); - if (ret) - return ret; - e->counters.pcnt = xt_percpu_counter_alloc(); if (IS_ERR_VALUE(e->counters.pcnt)) return -ENOMEM; @@ -752,6 +746,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) { @@ -766,6 +761,10 @@ check_entry_size_and_hooks(struct ip6t_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -1517,7 +1516,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct ip6t_entry *)e, name); + ret = check_entry((struct ip6t_entry *)e); if (ret) return ret; From 160202a2e8a37baf7a1dcd3b3748a19ece5afe3e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Mar 2016 18:02:50 +0100 Subject: [PATCH 0169/3220] UPSTREAM: netfilter: x_tables: make sure e->next_offset covers remaining blob size (cherry pick from commit 6e94e0cfb0887e4013b3b930fa6ab1fe6bb6ba91) Otherwise this function may read data beyond the ruleset blob. 
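The walk being hardened, reduced to a skeleton (types simplified): each rule is reached by adding the previous rule's user-supplied next_offset to the cursor, so without comparing it against the blob's limit the cursor can step outside the ruleset before any later validation runs.

struct entry {
	unsigned short next_offset;	/* user-controlled */
};

static int walk(unsigned char *base, unsigned char *limit)
{
	unsigned char *p = base;

	while (p + sizeof(struct entry) <= limit) {
		const struct entry *e = (const struct entry *)p;

		/* the added check: the next entry must stay inside the blob */
		if (e->next_offset < sizeof(struct entry) ||
		    p + e->next_offset > limit)
			return -1;	/* kernel returns -EINVAL */
		p += e->next_offset;	/* previously trusted blindly */
	}
	return 0;
}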
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Change-Id: I9d19ecf3e00a2d52817b35b9042623927895c005 Bug: 29637687 --- net/ipv4/netfilter/arp_tables.c | 6 ++++-- net/ipv4/netfilter/ip_tables.c | 6 ++++-- net/ipv6/netfilter/ip6_tables.c | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 449e2b56b3b2..36a30fab8625 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -573,7 +573,8 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, int err; if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || - (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { + (unsigned char *)e + sizeof(struct arpt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -1232,7 +1233,8 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index c3ab1ab12ee6..99d46b0a4ead 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -737,7 +737,8 @@ check_entry_size_and_hooks(struct ipt_entry *e, int err; if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || - (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { + (unsigned char *)e + sizeof(struct ipt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -1491,7 +1492,8 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index f532cb6d9c3d..6198807e06f4 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -749,7 +749,8 @@ check_entry_size_and_hooks(struct ip6t_entry *e, int err; if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || - (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) { + (unsigned char *)e + sizeof(struct ip6t_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -1503,7 +1504,8 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } From 71b1886054473597c46a8a25c95477b5262971b5 Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Fri, 23 Oct 2015 15:13:42 -0700 Subject: [PATCH 0170/3220] CHROMIUM: 
android: binder: Fix potential scheduling-while-atomic (cherry picked from commit 166b45af97359159f9585a836c9849e725e31fd6) Commit f1e7f0a724f6 ("android: binder: Disable preemption while holding the global binder lock.") re-enabled preemption around most of the sites where calls to potentially sleeping functions were made, but missed __alloc_fd(), which can sleep if the fdtable needs to be resized. Re-enable preemption around __alloc_fd() as well as __fd_install() which can now sleep in upstream kernels as of commit 8a81252b774b ("fs/file.c: don't acquire files->file_lock in fd_install()"). BUG=chrome-os-partner:44012 TEST=Build and boot on Smaug. Change-Id: I9819c4b95876f697e75b1b84810b6c520d9c33ec Signed-off-by: Andrew Bresticker Reviewed-on: https://chromium-review.googlesource.com/308582 Reviewed-by: Stephen Barber Reviewed-by: Riley Andrews Bug: 30141999 --- drivers/android/binder.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index f0ce9959d14d..b66fe6545f32 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -379,6 +379,7 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) struct files_struct *files = proc->files; unsigned long rlim_cur; unsigned long irqs; + int ret; if (files == NULL) return -ESRCH; @@ -389,7 +390,11 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); unlock_task_sighand(proc->tsk, &irqs); - return __alloc_fd(files, 0, rlim_cur, flags); + preempt_enable_no_resched(); + ret = __alloc_fd(files, 0, rlim_cur, flags); + preempt_disable(); + + return ret; } /* @@ -398,8 +403,11 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) static void task_fd_install( struct binder_proc *proc, unsigned int fd, struct file *file) { - if (proc->files) + if (proc->files) { + preempt_enable_no_resched(); __fd_install(proc->files, fd, file); + preempt_disable(); + } } /* From 13c17d0179f9a055062f37e91a6f6cf00a249ebd Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Mon, 18 Jul 2016 22:21:12 +0000 Subject: [PATCH 0171/3220] Revert "CHROMIUM: android: binder: Fix potential scheduling-while-atomic" This reverts commit 71b1886054473597c46a8a25c95477b5262971b5. 
Change-Id: I9ded0ff43535c1367c2cf79dfeec20d4b5f0357a --- drivers/android/binder.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index b66fe6545f32..f0ce9959d14d 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -379,7 +379,6 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) struct files_struct *files = proc->files; unsigned long rlim_cur; unsigned long irqs; - int ret; if (files == NULL) return -ESRCH; @@ -390,11 +389,7 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); unlock_task_sighand(proc->tsk, &irqs); - preempt_enable_no_resched(); - ret = __alloc_fd(files, 0, rlim_cur, flags); - preempt_disable(); - - return ret; + return __alloc_fd(files, 0, rlim_cur, flags); } /* @@ -403,11 +398,8 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) static void task_fd_install( struct binder_proc *proc, unsigned int fd, struct file *file) { - if (proc->files) { - preempt_enable_no_resched(); + if (proc->files) __fd_install(proc->files, fd, file); - preempt_disable(); - } } /* From dd802a9c976c9e098d236c551a0c3cb2c6f5271d Mon Sep 17 00:00:00 2001 From: Riley Andrews Date: Fri, 5 Jun 2015 18:59:29 -0700 Subject: [PATCH 0172/3220] cpuset: Add allow_attach hook for cpusets on android. This patch provides an allow_attach hook for cpusets, which resolves much of the following logcat noise. W SchedPolicy: add_tid_to_cgroup failed to write '2816' (Permission denied); fd=29 W ActivityManager: Failed setting process group of 2816 to 0 W System.err: java.lang.IllegalArgumentException W System.err: at android.os.Process.setProcessGroup(Native Method) W System.err: at com.android.server.am.ActivityManagerService.applyOomAdjLocked(ActivityManagerService.java:18763) W System.err: at com.android.server.am.ActivityManagerService.updateOomAdjLocked(ActivityManagerService.java:19028) W System.err: at com.android.server.am.ActivityManagerService.updateOomAdjLocked(ActivityManagerService.java:19106) W System.err: at com.android.server.am.ActiveServices.serviceDoneExecutingLocked(ActiveServices.java:2015) W System.err: at com.android.server.am.ActiveServices.publishServiceLocked(ActiveServices.java:905) W System.err: at com.android.server.am.ActivityManagerService.publishService(ActivityManagerService.java:16065) W System.err: at android.app.ActivityManagerNative.onTransact(ActivityManagerNative.java:1007) W System.err: at com.android.server.am.ActivityManagerService.onTransact(ActivityManagerService.java:2493) W System.err: at android.os.Binder.execTransact(Binder.java:453) Change-Id: Ic1b61b2bbb7ce74c9e9422b5e22ee9078251de21 [Ported to 4.4, added commit message] Signed-off-by: John Stultz --- kernel/cpuset.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 02a8ea5c9963..0ff01c2a7222 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2051,12 +2051,30 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } +static int cpuset_allow_attach(struct cgroup_taskset *tset) +{ + const struct cred *cred = current_cred(), *tcred; + struct task_struct *task; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(task, css, tset) { + tcred = __task_cred(task); + + if ((current != task) && !capable(CAP_SYS_ADMIN) && + cred->euid.val != tcred->uid.val && cred->euid.val != tcred->suid.val) +
return -EACCES; + } + + return 0; +} + struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, + .allow_attach = cpuset_allow_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .bind = cpuset_bind, From f394c99b6ac0236d7e06ee406107294d5f389259 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 5 Jul 2016 22:12:36 -0700 Subject: [PATCH 0173/3220] UPSTREAM: ppp: defer netns reference release for ppp channel (cherry pick from commit 205e1e255c479f3fd77446415706463b282f94e4) Matt reported that we have a NULL pointer dereference in ppp_pernet() from ppp_connect_channel(), i.e. pch->chan_net is NULL. This is due to that a parallel ppp_unregister_channel() could happen while we are in ppp_connect_channel(), during which pch->chan_net set to NULL. Since we need a reference to net per channel, it makes sense to sync the refcnt with the life time of the channel, therefore we should release this reference when we destroy it. Fixes: 1f461dcdd296 ("ppp: take reference on channels netns") Reported-by: Matt Bennett Cc: Paul Mackerras Cc: linux-ppp@vger.kernel.org Cc: Guillaume Nault Cc: Cyrill Gorcunov Signed-off-by: Cong Wang Reviewed-by: Cyrill Gorcunov Signed-off-by: David S. Miller Fixes: Change-Id: Iee0015eca5bd181954bb4896a3720f7549c5ed0b ("UPSTREAM: ppp: take reference on channels netns") Signed-off-by: Amit Pundir Change-Id: I24d0bb6f349ab3829f63cfe935ed97b6913a3508 --- drivers/net/ppp/ppp_generic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index afe6fddd2baa..50cda254b57a 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -2387,8 +2387,6 @@ ppp_unregister_channel(struct ppp_channel *chan) spin_lock_bh(&pn->all_channels_lock); list_del(&pch->list); spin_unlock_bh(&pn->all_channels_lock); - put_net(pch->chan_net); - pch->chan_net = NULL; pch->file.dead = 1; wake_up_interruptible(&pch->file.rwait); @@ -2980,6 +2978,9 @@ ppp_disconnect_channel(struct channel *pch) */ static void ppp_destroy_channel(struct channel *pch) { + put_net(pch->chan_net); + pch->chan_net = NULL; + atomic_dec(&channel_count); if (!pch->file.dead) { From 53c39433d5fd7d71bec7e904a6fa6691a9e90fdc Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 1 Jun 2016 11:55:05 +0200 Subject: [PATCH 0174/3220] UPSTREAM: proc: prevent stacking filesystems on top (cherry picked from commit e54ad7f1ee263ffa5a2de9c609d58dfa27b21cd9) This prevents stacking filesystems (ecryptfs and overlayfs) from using procfs as lower filesystem. There is too much magic going on inside procfs, and there is no good reason to stack stuff on top of procfs. (For example, procfs does access checks in VFS open handlers, and ecryptfs by design calls open handlers from a kernel thread that doesn't drop privileges or so.) 
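The mechanism being reused, roughly as it appears in a stacking filesystem's mount path (abridged from the ecryptfs pattern): every superblock records its stacking depth, and a new layer refuses to mount once the lower filesystem is already at the limit -- which is exactly the value procfs now claims for itself.

	/* in the stacking fs, when mounting over lower_sb: */
	sb->s_stack_depth = lower_sb->s_stack_depth + 1;
	if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
		pr_err("maximum fs stacking depth exceeded\n");
		err = -EINVAL;
		goto out_free;
	}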
Signed-off-by: Jann Horn Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Change-Id: Ib050ef9dc10e623589d22e3a9e6aee9ee4f0cd5d Bug: 29444228 --- fs/proc/root.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/proc/root.c b/fs/proc/root.c index 361ab4ee42fc..ec649c92d270 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -121,6 +121,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) return ERR_CAST(sb); + /* + * procfs isn't actually a stacking filesystem; however, there is + * too much magic going on inside it to permit stacking things on + * top of it + */ + sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + if (!proc_parse_options(options, ns)) { deactivate_locked_super(sb); return ERR_PTR(-EINVAL); From ed23e1145053fb43639a178e89e99bb14f11272b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 1 Jun 2016 11:55:06 +0200 Subject: [PATCH 0175/3220] UPSTREAM: ecryptfs: forbid opening files without mmap handler (cherry picked from commit 2f36db71009304b3f0b95afacd8eba1f9f046b87) This prevents users from triggering a stack overflow through a recursive invocation of pagefault handling that involves mapping procfs files into virtual memory. Signed-off-by: Jann Horn Acked-by: Tyler Hicks Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Change-Id: I0be77c7f8bd3046bc34cd87ef577529792d479bc Bug: 29444228 --- fs/ecryptfs/kthread.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index 866bb18efefe..e818f5ac7a26 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "ecryptfs_kernel.h" struct ecryptfs_open_req { @@ -147,7 +148,7 @@ int ecryptfs_privileged_open(struct file **lower_file, flags |= IS_RDONLY(d_inode(lower_dentry)) ? O_RDONLY : O_RDWR; (*lower_file) = dentry_open(&req.path, flags, cred); if (!IS_ERR(*lower_file)) - goto out; + goto have_file; if ((flags & O_ACCMODE) == O_RDONLY) { rc = PTR_ERR((*lower_file)); goto out; @@ -165,8 +166,16 @@ int ecryptfs_privileged_open(struct file **lower_file, mutex_unlock(&ecryptfs_kthread_ctl.mux); wake_up(&ecryptfs_kthread_ctl.wait); wait_for_completion(&req.done); - if (IS_ERR(*lower_file)) + if (IS_ERR(*lower_file)) { rc = PTR_ERR(*lower_file); + goto out; + } +have_file: + if ((*lower_file)->f_op->mmap == NULL) { + fput(*lower_file); + *lower_file = NULL; + rc = -EMEDIUMTYPE; + } out: return rc; } From 85a09f2732f9d0bbb9fdec98f292f1db6c75a96a Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 1 Jun 2016 11:55:07 +0200 Subject: [PATCH 0176/3220] UPSTREAM: sched: panic on corrupted stack end (cherry picked from commit 29d6455178a09e1dc340380c582b13356227e8df) Until now, hitting this BUG_ON caused a recursive oops (because oops handling involves do_exit(), which calls into the scheduler, which in turn raises an oops), which caused stuff below the stack to be overwritten until a panic happened (e.g. via an oops in interrupt context, caused by the overwritten CPU index in the thread_info). Just panic directly. 
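For reference, the condition being promoted from BUG_ON() to panic() is the end-of-stack canary test (as defined in include/linux/sched.h under CONFIG_SCHED_STACK_END_CHECK):

#define task_stack_end_corrupted(task) \
	(*(end_of_stack(task)) != STACK_END_MAGIC)

end_of_stack() is the lowest word of the task's stack, which fork writes STACK_END_MAGIC into; once that word has changed, any further scheduling is already running on corrupted state, hence the direct panic.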
Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds Change-Id: Ia3acb3f747f7a58ec2d071644433b0591925969f Bug: 29444228 --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 61b0914cc7aa..b290af9c37a1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3008,7 +3008,8 @@ static noinline void __schedule_bug(struct task_struct *prev) static inline void schedule_debug(struct task_struct *prev) { #ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(task_stack_end_corrupted(prev)); + if (task_stack_end_corrupted(prev)) + panic("corrupted stack end detected inside scheduler\n"); #endif if (unlikely(in_atomic_preempt_off())) { From 419d43edc519b477d365b6311c48f875cec6bfdf Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 14 Jul 2016 11:20:55 -0700 Subject: [PATCH 0177/3220] FROMLIST: proc: Relax /proc//timerslack_ns capability requirements When an interface to allow a task to change another task's timerslack was first proposed, it was suggested that something greater than CAP_SYS_NICE would be needed, as a task could be delayed further than what normally could be done with nice adjustments. So CAP_SYS_PTRACE was adopted instead for what became the /proc//timerslack_ns interface. However, for Android (where this feature originates), giving the system_server CAP_SYS_PTRACE would allow it to observe and modify all tasks' memory. This is considered too high a privilege level for only needing to change the timerslack. After some discussion, it was realized that a CAP_SYS_NICE process can set a task as SCHED_FIFO, so they could fork some spinning processes and set them all SCHED_FIFO 99, in effect delaying all other tasks for an infinite amount of time. So as a CAP_SYS_NICE task can already cause trouble for other tasks, using it as a required capability for accessing and modifying /proc//timerslack_ns seems sufficient. Thus, this patch loosens the capability requirements to CAP_SYS_NICE and removes CAP_SYS_PTRACE, simplifying some of the code flow as well. This is technically an ABI change, but as the feature just landed in 4.6, I suspect no one is yet using it. Cc: Kees Cook Cc: "Serge E.
Hallyn" Cc: Andrew Morton Cc: Thomas Gleixner CC: Arjan van de Ven Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Todd Kjos Cc: Colin Cross Cc: Nick Kralevich Cc: Dmitry Shmidt Cc: Elliott Hughes Cc: Android Kernel Team Reviewed-by: Nick Kralevich Acked-by: Serge Hallyn Acked-by: Kees Cook Signed-off-by: John Stultz FROMLIST URL: https://lkml.org/lkml/2016/7/21/522 Change-Id: Ia75481402e3948165a1b7c1551c539530cb25509 (Cherry picked against common/android-4.4) Signed-off-by: John Stultz --- fs/proc/base.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index db221e53a1ac..ee1b021e153e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2260,16 +2260,19 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { - task_lock(p); - if (slack_ns == 0) - p->timer_slack_ns = p->default_timer_slack_ns; - else - p->timer_slack_ns = slack_ns; - task_unlock(p); - } else + if (!capable(CAP_SYS_NICE)) { count = -EPERM; + goto out; + } + task_lock(p); + if (slack_ns == 0) + p->timer_slack_ns = p->default_timer_slack_ns; + else + p->timer_slack_ns = slack_ns; + task_unlock(p); + +out: put_task_struct(p); return count; @@ -2279,19 +2282,22 @@ static int timerslack_ns_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; - int err = 0; + int err = 0; p = get_proc_task(inode); if (!p) return -ESRCH; - if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { - task_lock(p); - seq_printf(m, "%llu\n", p->timer_slack_ns); - task_unlock(p); - } else + if (!capable(CAP_SYS_NICE)) { err = -EPERM; + goto out; + } + task_lock(p); + seq_printf(m, "%llu\n", p->timer_slack_ns); + task_unlock(p); + +out: put_task_struct(p); return err; From 5d1b9c0c012ff96d543aa90db888ca511f53dfaa Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 14 Jul 2016 17:22:19 -0700 Subject: [PATCH 0178/3220] FROMLIST: proc: Add LSM hook checks to /proc//timerslack_ns As requested, this patch checks the existing LSM hooks task_getscheduler/task_setscheduler when reading or modifying the task's timerslack value. Previous versions added new get/settimerslack LSM hooks, but since they checked the same PROCESS__SET/GETSCHED values as existing hooks, it was suggested we just use the existing ones. Cc: Kees Cook Cc: "Serge E. 
Hallyn" Cc: Andrew Morton Cc: Thomas Gleixner CC: Arjan van de Ven Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Todd Kjos Cc: Colin Cross Cc: Nick Kralevich Cc: Dmitry Shmidt Cc: Elliott Hughes Cc: James Morris Cc: Android Kernel Team Cc: linux-security-module@vger.kernel.org Cc: selinux@tycho.nsa.gov Signed-off-by: John Stultz FROMLIST URL: https://lkml.org/lkml/2016/7/21/523 Change-Id: Id157d10e2fe0b85f1be45035a6117358a42af028 (Cherry picked back to common/android-4.4) Signed-off-by: John Stultz --- fs/proc/base.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index ee1b021e153e..ac5d2aea63b5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2265,6 +2265,12 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, goto out; } + err = security_task_setscheduler(p); + if (err) { + count = err; + goto out; + } + task_lock(p); if (slack_ns == 0) p->timer_slack_ns = p->default_timer_slack_ns; @@ -2293,6 +2299,10 @@ static int timerslack_ns_show(struct seq_file *m, void *v) goto out; } + err = security_task_getscheduler(p); + if (err) + goto out; + task_lock(p); seq_printf(m, "%llu\n", p->timer_slack_ns); task_unlock(p); From 850117629453026394e13622203e2276891c3b8a Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Fri, 15 Jul 2016 12:39:13 +0200 Subject: [PATCH 0179/3220] BACKPORT: brcmfmac: defer DPC processing during probe The sdio dpc starts processing when in SDIOD_STATE_DATA. This state was entered right after firmware download. This patch moves that transition just before enabling sdio interrupt handling thus avoiding watchdog expiry which would put the bus to sleep while probing. Change-Id: I09c60752374b8145da78000935062be08c5c8a52 Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo Signed-off-by: Dmitry Shmidt --- drivers/net/wireless/brcm80211/brcmfmac/sdio.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/brcm80211/brcmfmac/sdio.c index 7e74ac3ad815..bcf29bf6f727 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/sdio.c @@ -3401,10 +3401,6 @@ static int brcmf_sdio_download_firmware(struct brcmf_sdio *bus, goto err; } - /* Allow full data communication using DPC from now on. */ - brcmf_sdiod_change_state(bus->sdiodev, BRCMF_SDIOD_DATA); - bcmerror = 0; - err: brcmf_sdio_clkctl(bus, CLK_SDONLY, false); sdio_release_host(bus->sdiodev->func[1]); @@ -4112,6 +4108,9 @@ static void brcmf_sdio_firmware_callback(struct device *dev, } if (err == 0) { + /* Allow full data communication using DPC from now on. */ + brcmf_sdiod_change_state(bus->sdiodev, BRCMF_SDIOD_DATA); + err = brcmf_sdiod_intr_register(sdiodev); if (err != 0) brcmf_err("intr register failed:%d\n", err); From 1b06e2539281187910eb971202c7b2786fab5b57 Mon Sep 17 00:00:00 2001 From: Anson Jacob Date: Sun, 31 Jul 2016 22:30:14 -0400 Subject: [PATCH 0180/3220] usb: gadget: f_accessory: remove duplicate endpoint alloc usb_ep_autoconfig is called twice for allocating bulk out endpoint. Removed the unwanted call. 
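After the removal, create_bulk_endpoints() follows the usual one-call-per-endpoint pattern (condensed from the function as patched): each usb_ep_autoconfig() call claims one endpoint from the gadget controller's limited pool, so the duplicated out-descriptor call was claiming a second endpoint that was never used or released.

	ep = usb_ep_autoconfig(cdev->gadget, in_desc);
	if (!ep) {
		DBG(cdev, "usb_ep_autoconfig for ep_in failed\n");
		return -ENODEV;
	}
	ep->driver_data = dev;	/* claim the endpoint */
	dev->ep_in = ep;

	ep = usb_ep_autoconfig(cdev->gadget, out_desc);
	if (!ep) {
		DBG(cdev, "usb_ep_autoconfig for ep_out failed\n");
		return -ENODEV;
	}
	ep->driver_data = dev;	/* claim the endpoint */
	dev->ep_out = ep;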
Fixes Issue: 67180 Change-Id: I03e87a86fbbbc85831ff7f0496adf038d1de2956 Signed-off-by: Anson Jacob --- drivers/usb/gadget/function/f_accessory.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index c62123560143..2ca16a577542 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -531,15 +531,6 @@ static int create_bulk_endpoints(struct acc_dev *dev, ep->driver_data = dev; /* claim the endpoint */ dev->ep_out = ep; - ep = usb_ep_autoconfig(cdev->gadget, out_desc); - if (!ep) { - DBG(cdev, "usb_ep_autoconfig for ep_out failed\n"); - return -ENODEV; - } - DBG(cdev, "usb_ep_autoconfig for ep_out got %s\n", ep->name); - ep->driver_data = dev; /* claim the endpoint */ - dev->ep_out = ep; - /* now allocate requests for our endpoints */ for (i = 0; i < TX_REQ_MAX; i++) { req = acc_request_new(dev->ep_in, BULK_BUFFER_SIZE); From 818aa36ea868ba8f2985f9ca0906fd9cba3e437d Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Sun, 31 Jul 2016 17:07:46 +0530 Subject: [PATCH 0181/3220] Revert "panic: Add board ID to panic output" This reverts commit 4e09c510185cb4db2277ce81cce81b7aa06bea45. I checked for the usage of this debug helper in AOSP common kernels as well as vendor kernels (e.g. exynos, msm, mediatek, omap, tegra, x86, x86_64) hosted at https://android.googlesource.com/kernel/ and I found out that other than a few fairly obsolete Omap trees (for tuna & Glass) and the Exynos tree (for Manta), there is no active user of this debug helper. So we can safely remove this helper code. Signed-off-by: Amit Pundir --- include/linux/kernel.h | 4 ---- kernel/panic.c | 8 -------- 2 files changed, 12 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 1eaf3d81f5c1..350dfb08aee3 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -830,8 +830,4 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } /* OTHER_WRITABLE? Generally considered a bad idea. */ \ BUILD_BUG_ON_ZERO((perms) & 2) + \ (perms)) - -/* To identify board information in panic logs, set this */ -extern char *mach_panic_string; - #endif diff --git a/kernel/panic.c b/kernel/panic.c index f07bfc9fe613..4b150bc0c6c1 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -28,9 +28,6 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 -/* Machine specific panic information string */ -char *mach_panic_string; - int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask; static int pause_on_oops; @@ -416,11 +413,6 @@ late_initcall(init_oops_id); void print_oops_end_marker(void) { init_oops_id(); - - if (mach_panic_string) - printk(KERN_WARNING "Board Information: %s\n", - mach_panic_string); - pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); } From f3d9c312b89966b6d1ff9e4c503a7ae6d38e0176 Mon Sep 17 00:00:00 2001 From: James Carr Date: Fri, 29 Jul 2016 19:02:16 -0700 Subject: [PATCH 0182/3220] Implement memory_state_time, used by qcom,cpubw New driver memory_state_time tracks time spent in different DDR frequency and bandwidth states. Memory drivers such as qcom,cpubw can post updated state to the driver after registering a callback; updates are processed by a workqueue. Bandwidth buckets are read in from the device tree in the relevant qualcomm section and can be defined in any quantity and spacing. The data is exposed at /sys/kernel/memory_state_time, able to be read by the Android framework.
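To make the callback flow concrete, here is a minimal sketch of a memory driver registering as a bandwidth source and posting updates; it relies only on the API this patch adds in include/linux/memory-state-time.h, and the example_* names are illustrative, not part of the patch:

    #include <linux/memory-state-time.h>

    static struct memory_state_update_block *mst_block;

    static int example_bus_probe(void)
    {
            /* one-time registration; returns NULL on failure */
            mst_block = memory_state_register_bandwidth_source();
            return mst_block ? 0 : -ENOMEM;
    }

    static void example_bus_bw_changed(int mbps)
    {
            /* posts the new bandwidth; memory_state_time charges the
             * elapsed interval to the previous state from its workqueue */
            if (mst_block)
                    UPDATE_MEMORY_STATE(mst_block, mbps);
    }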
Functionality is behind a config option CONFIG_MEMORY_STATE_TIME Change-Id: I4fee165571cb975fb9eacbc9aada5e6d7dd748f0 Signed-off-by: James Carr --- .../bindings/misc/memory-state-time.txt | 8 + android/configs/android-recommended.cfg | 1 + drivers/misc/Kconfig | 6 + drivers/misc/Makefile | 1 + drivers/misc/memory_state_time.c | 454 ++++++++++++++++++ include/linux/memory-state-time.h | 42 ++ 6 files changed, 512 insertions(+) create mode 100644 Documentation/devicetree/bindings/misc/memory-state-time.txt create mode 100644 drivers/misc/memory_state_time.c create mode 100644 include/linux/memory-state-time.h diff --git a/Documentation/devicetree/bindings/misc/memory-state-time.txt b/Documentation/devicetree/bindings/misc/memory-state-time.txt new file mode 100644 index 000000000000..c99a506c030d --- /dev/null +++ b/Documentation/devicetree/bindings/misc/memory-state-time.txt @@ -0,0 +1,8 @@ +Memory bandwidth and frequency state tracking + +Required properties: +- compatible : should be: + "memory-state-time" +- freq-tbl: Should contain entries with each frequency in Hz. +- bw-buckets: Should contain upper-bound limits for each bandwidth bucket in Mbps. + Must match the framework power_profile.xml for the device. diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index 2f1ef077aa9e..3465a848d74d 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -119,6 +119,7 @@ CONFIG_TIMER_STATS=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_UHID=y +CONFIG_MEMORY_STATE_TIME=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_HIDDEV=y diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 2763d6b3b063..598f96c26678 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -531,6 +531,12 @@ config UID_CPUTIME help Per UID based cpu time statistics exported to /proc/uid_cputime +config MEMORY_STATE_TIME + tristate "Memory freq/bandwidth time statistics" + depends on PROFILING + help + Memory time statistics exported to /sys/kernel/memory_state_time + source "drivers/misc/c2port/Kconfig" source "drivers/misc/eeprom/Kconfig" source "drivers/misc/cb710/Kconfig" diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index e5142b836aee..b76b4c9fe104 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -57,3 +57,4 @@ obj-$(CONFIG_ECHO) += echo/ obj-$(CONFIG_VEXPRESS_SYSCFG) += vexpress-syscfg.o obj-$(CONFIG_CXL_BASE) += cxl/ obj-$(CONFIG_UID_CPUTIME) += uid_cputime.o +obj-$(CONFIG_MEMORY_STATE_TIME) += memory_state_time.o diff --git a/drivers/misc/memory_state_time.c b/drivers/misc/memory_state_time.c new file mode 100644 index 000000000000..34c797a06a31 --- /dev/null +++ b/drivers/misc/memory_state_time.c @@ -0,0 +1,454 @@ +/* drivers/misc/memory_state_time.c + * + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define KERNEL_ATTR_RO(_name) \ +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define KERNEL_ATTR_RW(_name) \ +static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +#define FREQ_HASH_BITS 4 +DECLARE_HASHTABLE(freq_hash_table, FREQ_HASH_BITS); + +static DEFINE_MUTEX(mem_lock); + +#define TAG "memory_state_time" +#define BW_NODE "/soc/memory-state-time" +#define FREQ_TBL "freq-tbl" +#define BW_TBL "bw-buckets" +#define NUM_SOURCES "num-sources" + +#define LOWEST_FREQ 2 + +static int curr_bw; +static int curr_freq; +static u32 *bw_buckets; +static u32 *freq_buckets; +static int num_freqs; +static int num_buckets; +static int registered_bw_sources; +static u64 last_update; +static bool init_success; +static struct workqueue_struct *memory_wq; +static u32 num_sources = 10; +static int *bandwidths; + +struct freq_entry { + int freq; + u64 *buckets; /* Bandwidth buckets. */ + struct hlist_node hash; +}; + +struct queue_container { + struct work_struct update_state; + int value; + u64 time_now; + int id; + struct mutex *lock; +}; + +static int find_bucket(int bw) +{ + int i; + + if (bw_buckets != NULL) { + for (i = 0; i < num_buckets; i++) { + if (bw_buckets[i] > bw) { + pr_debug("Found bucket %d for bandwidth %d\n", + i, bw); + return i; + } + } + return num_buckets - 1; + } + return 0; +} + +static u64 get_time_diff(u64 time_now) +{ + u64 ms; + + ms = time_now - last_update; + last_update = time_now; + return ms; +} + +static ssize_t show_stat_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int i, j; + int len = 0; + struct freq_entry *freq_entry; + + for (i = 0; i < num_freqs; i++) { + hash_for_each_possible(freq_hash_table, freq_entry, hash, + freq_buckets[i]) { + if (freq_entry->freq == freq_buckets[i]) { + len += scnprintf(buf + len, PAGE_SIZE - len, + "%d ", freq_buckets[i]); + if (len >= PAGE_SIZE) + break; + for (j = 0; j < num_buckets; j++) { + len += scnprintf(buf + len, + PAGE_SIZE - len, + "%llu ", + freq_entry->buckets[j]); + } + len += scnprintf(buf + len, PAGE_SIZE - len, + "\n"); + } + } + } + pr_debug("Current Time: %llu\n", ktime_get_boot_ns()); + return len; +} +KERNEL_ATTR_RO(show_stat); + +static void update_table(u64 time_now) +{ + struct freq_entry *freq_entry; + + pr_debug("Last known bw %d freq %d\n", curr_bw, curr_freq); + hash_for_each_possible(freq_hash_table, freq_entry, hash, curr_freq) { + if (curr_freq == freq_entry->freq) { + freq_entry->buckets[find_bucket(curr_bw)] + += get_time_diff(time_now); + break; + } + } +} + +static bool freq_exists(int freq) +{ + int i; + + for (i = 0; i < num_freqs; i++) { + if (freq == freq_buckets[i]) + return true; + } + return false; +} + +static int calculate_total_bw(int bw, int index) +{ + int i; + int total_bw = 0; + + pr_debug("memory_state_time New bw %d for id %d\n", bw, index); + bandwidths[index] = bw; + for (i = 0; i < registered_bw_sources; i++) + total_bw += bandwidths[i]; + return total_bw; +} + +static void freq_update_do_work(struct work_struct *work) +{ + struct queue_container *freq_state_update + = container_of(work, struct queue_container, + update_state); + if (freq_state_update) { + mutex_lock(&mem_lock); + update_table(freq_state_update->time_now); + curr_freq = freq_state_update->value; + mutex_unlock(&mem_lock); + kfree(freq_state_update); + } +} + 
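+/* Both update handlers run on the single-threaded memory_wq: each one
+ * first charges the interval since last_update to the outgoing
+ * (curr_freq, curr_bw) state via update_table(), and only then records
+ * the new value, so time is accounted up to the moment of each
+ * transition.
+ */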
+static void bw_update_do_work(struct work_struct *work) +{ + struct queue_container *bw_state_update + = container_of(work, struct queue_container, + update_state); + if (bw_state_update) { + mutex_lock(&mem_lock); + update_table(bw_state_update->time_now); + curr_bw = calculate_total_bw(bw_state_update->value, + bw_state_update->id); + mutex_unlock(&mem_lock); + kfree(bw_state_update); + } +} + +static void memory_state_freq_update(struct memory_state_update_block *ub, + int value) +{ + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + if (freq_exists(value) && init_success) { + struct queue_container *freq_container + = kmalloc(sizeof(struct queue_container), + GFP_KERNEL); + if (!freq_container) + return; + INIT_WORK(&freq_container->update_state, + freq_update_do_work); + freq_container->time_now = ktime_get_boot_ns(); + freq_container->value = value; + pr_debug("Scheduling freq update in work queue\n"); + queue_work(memory_wq, &freq_container->update_state); + } else { + pr_debug("Freq does not exist.\n"); + } + } +} + +static void memory_state_bw_update(struct memory_state_update_block *ub, + int value) +{ + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + if (init_success) { + struct queue_container *bw_container + = kmalloc(sizeof(struct queue_container), + GFP_KERNEL); + if (!bw_container) + return; + INIT_WORK(&bw_container->update_state, + bw_update_do_work); + bw_container->time_now = ktime_get_boot_ns(); + bw_container->value = value; + bw_container->id = ub->id; + pr_debug("Scheduling bandwidth update in work queue\n"); + queue_work(memory_wq, &bw_container->update_state); + } + } +} + +struct memory_state_update_block *memory_state_register_frequency_source(void) +{ + struct memory_state_update_block *block; + + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + pr_debug("Allocating frequency source\n"); + block = kmalloc(sizeof(struct memory_state_update_block), + GFP_KERNEL); + if (!block) + return NULL; + block->update_call = memory_state_freq_update; + return block; + } + pr_err("Config option disabled.\n"); + return NULL; +} +EXPORT_SYMBOL_GPL(memory_state_register_frequency_source); + +struct memory_state_update_block *memory_state_register_bandwidth_source(void) +{ + struct memory_state_update_block *block; + + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + pr_debug("Allocating bandwidth source %d\n", + registered_bw_sources); + block = kmalloc(sizeof(struct memory_state_update_block), + GFP_KERNEL); + if (!block) + return NULL; + block->update_call = memory_state_bw_update; + if (registered_bw_sources < num_sources) { + block->id = registered_bw_sources++; + } else { + pr_err("Unable to allocate source; max number reached\n"); + kfree(block); + return NULL; + } + return block; + } + pr_err("Config option disabled.\n"); + return NULL; +} +EXPORT_SYMBOL_GPL(memory_state_register_bandwidth_source); + +/* Buckets are designated by their maximum. + * Returns the buckets decided by the capability of the device. 
+ */ +static int get_bw_buckets(struct device *dev) +{ + int ret, lenb; + struct device_node *node = dev->of_node; + + of_property_read_u32(node, NUM_SOURCES, &num_sources); + if (of_find_property(node, BW_TBL, &lenb)) { + bandwidths = devm_kzalloc(dev, + sizeof(*bandwidths) * num_sources, GFP_KERNEL); + if (!bandwidths) + return -ENOMEM; + lenb /= sizeof(*bw_buckets); + bw_buckets = devm_kzalloc(dev, lenb * sizeof(*bw_buckets), + GFP_KERNEL); + if (!bw_buckets) { + devm_kfree(dev, bandwidths); + return -ENOMEM; + } + ret = of_property_read_u32_array(node, BW_TBL, bw_buckets, + lenb); + if (ret < 0) { + devm_kfree(dev, bandwidths); + devm_kfree(dev, bw_buckets); + pr_err("Unable to read bandwidth table from device tree.\n"); + return ret; + } + } + curr_bw = 0; + num_buckets = lenb; + return 0; +} + +/* Adds struct freq_entry nodes to the hashtable for each compatible frequency. + * Returns the supported number of frequencies. + */ +static int freq_buckets_init(struct device *dev) +{ + struct freq_entry *freq_entry; + int i; + int ret, lenf; + struct device_node *node = dev->of_node; + + if (of_find_property(node, FREQ_TBL, &lenf)) { + lenf /= sizeof(*freq_buckets); + freq_buckets = devm_kzalloc(dev, lenf * sizeof(*freq_buckets), + GFP_KERNEL); + if (!freq_buckets) + return -ENOMEM; + pr_debug("freqs found len %d\n", lenf); + ret = of_property_read_u32_array(node, FREQ_TBL, freq_buckets, + lenf); + if (ret < 0) { + devm_kfree(dev, freq_buckets); + pr_err("Unable to read frequency table from device tree.\n"); + return ret; + } + pr_debug("ret freq %d\n", ret); + } + num_freqs = lenf; + curr_freq = freq_buckets[LOWEST_FREQ]; + + for (i = 0; i < num_freqs; i++) { + freq_entry = devm_kzalloc(dev, sizeof(struct freq_entry), + GFP_KERNEL); + if (!freq_entry) + return -ENOMEM; + freq_entry->buckets = devm_kzalloc(dev, sizeof(u64)*num_buckets, + GFP_KERNEL); + if (!freq_entry->buckets) { + devm_kfree(dev, freq_entry); + return -ENOMEM; + } + pr_debug("memory_state_time Adding freq to ht %d\n", + freq_buckets[i]); + freq_entry->freq = freq_buckets[i]; + hash_add(freq_hash_table, &freq_entry->hash, freq_buckets[i]); + } + return 0; +} + +struct kobject *memory_kobj; +EXPORT_SYMBOL_GPL(memory_kobj); + +static struct attribute *memory_attrs[] = { + &show_stat_attr.attr, + NULL +}; + +static struct attribute_group memory_attr_group = { + .attrs = memory_attrs, +}; + +static int memory_state_time_probe(struct platform_device *pdev) +{ + int error; + + error = get_bw_buckets(&pdev->dev); + if (error) + return error; + error = freq_buckets_init(&pdev->dev); + if (error) + return error; + last_update = ktime_get_boot_ns(); + init_success = true; + + pr_debug("memory_state_time initialized with num_freqs %d\n", + num_freqs); + return 0; +} + +static const struct of_device_id match_table[] = { + { .compatible = "memory-state-time" }, + {} +}; + +static struct platform_driver memory_state_time_driver = { + .probe = memory_state_time_probe, + .driver = { + .name = "memory-state-time", + .of_match_table = match_table, + .owner = THIS_MODULE, + }, +}; + +static int __init memory_state_time_init(void) +{ + int error; + + hash_init(freq_hash_table); + memory_wq = create_singlethread_workqueue("memory_wq"); + if (!memory_wq) { + pr_err("Unable to create workqueue.\n"); + return -EINVAL; + } + /* + * Create sys/kernel directory for memory_state_time. 
+ */ + memory_kobj = kobject_create_and_add(TAG, kernel_kobj); + if (!memory_kobj) { + pr_err("Unable to allocate memory_kobj for sysfs directory.\n"); + error = -ENOMEM; + goto wq; + } + error = sysfs_create_group(memory_kobj, &memory_attr_group); + if (error) { + pr_err("Unable to create sysfs folder.\n"); + goto kobj; + } + + error = platform_driver_register(&memory_state_time_driver); + if (error) { + pr_err("Unable to register memory_state_time platform driver.\n"); + goto group; + } + return 0; + +group: sysfs_remove_group(memory_kobj, &memory_attr_group); +kobj: kobject_put(memory_kobj); +wq: destroy_workqueue(memory_wq); + return error; +} +module_init(memory_state_time_init); diff --git a/include/linux/memory-state-time.h b/include/linux/memory-state-time.h new file mode 100644 index 000000000000..d2212b027866 --- /dev/null +++ b/include/linux/memory-state-time.h @@ -0,0 +1,42 @@ +/* include/linux/memory-state-time.h + * + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include + +#define UPDATE_MEMORY_STATE(BLOCK, VALUE) BLOCK->update_call(BLOCK, VALUE) + +struct memory_state_update_block; + +typedef void (*memory_state_update_fn_t)(struct memory_state_update_block *ub, + int value); + +/* This struct is populated when you pass it to a memory_state_register* + * function. The update_call function is used for an update and defined in the + * typedef memory_state_update_fn_t + */ +struct memory_state_update_block { + memory_state_update_fn_t update_call; + int id; +}; + +/* Register a frequency struct memory_state_update_block to provide updates to + * memory_state_time about frequency changes using its update_call function. + */ +struct memory_state_update_block *memory_state_register_frequency_source(void); + +/* Register a bandwidth struct memory_state_update_block to provide updates to + * memory_state_time about bandwidth changes using its update_call function. + */ +struct memory_state_update_block *memory_state_register_bandwidth_source(void); From ee247d4205fdee14675c4068c7fbb2c8118ba4ce Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 14 Mar 2016 09:56:35 -0300 Subject: [PATCH 0183/3220] UPSTREAM: net: Fix use after free in the recvmmsg exit path (cherry picked from commit 34b88a68f26a75e4fded796f1a49c40f82234b7d) The syzkaller fuzzer hit the following use-after-free: Call Trace: [] __asan_report_load8_noabort+0x3e/0x40 mm/kasan/report.c:295 [] __sys_recvmmsg+0x6fa/0x7f0 net/socket.c:2261 [< inline >] SYSC_recvmmsg net/socket.c:2281 [] SyS_recvmmsg+0x16f/0x180 net/socket.c:2270 [] entry_SYSCALL_64_fastpath+0x16/0x7a arch/x86/entry/entry_64.S:185 And, as Dmitry rightly assessed, that is because we can drop the reference and then touch it when the underlying recvmsg calls return some packets and then hit an error, which will make recvmmsg set sock->sk->sk_err, oops, fix it.
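As a userspace illustration (not part of the patch), the deferred error becomes visible on a later call or through SO_ERROR; a minimal sketch, assuming fd is a datagram socket and msgs is a prepared mmsghdr array:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/socket.h>

    static void drain(int fd, struct mmsghdr *msgs, unsigned int vlen)
    {
            /* returns the number of datagrams received, not an error,
             * whenever at least one datagram was delivered */
            int n = recvmmsg(fd, msgs, vlen, 0, NULL);

            if (n >= 0 && (unsigned int)n < vlen) {
                    int err = 0;
                    socklen_t len = sizeof(err);

                    /* any error deferred into sk->sk_err is reported here */
                    if (!getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) && err)
                            fprintf(stderr, "deferred recvmmsg error: %d\n", err);
            }
    }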
Reported-and-Tested-by: Dmitry Vyukov Cc: Alexander Potapenko Cc: Eric Dumazet Cc: Kostya Serebryany Cc: Sasha Levin Fixes: a2e2725541fa ("net: Introduce recvmmsg socket syscall") http://lkml.kernel.org/r/20160122211644.GC2470@redhat.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Change-Id: I2adb0faf595b7b634d9b739dfdd1a47109e20ecb Bug: 30515201 --- net/socket.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/net/socket.c b/net/socket.c index d730ef9dfbf0..263b334ec5e4 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2238,31 +2238,31 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, break; } + if (err == 0) + goto out_put; + + if (datagrams == 0) { + datagrams = err; + goto out_put; + } + + /* + * We may return less entries than requested (vlen) if the + * sock is non block and there aren't enough datagrams... + */ + if (err != -EAGAIN) { + /* + * ... or if recvmsg returns an error after we + * received some datagrams, where we record the + * error to return on the next call or if the + * app asks about it using getsockopt(SO_ERROR). + */ + sock->sk->sk_err = -err; + } out_put: fput_light(sock->file, fput_needed); - if (err == 0) - return datagrams; - - if (datagrams != 0) { - /* - * We may return less entries than requested (vlen) if the - * sock is non block and there aren't enough datagrams... - */ - if (err != -EAGAIN) { - /* - * ... or if recvmsg returns an error after we - * received some datagrams, where we record the - * error to return on the next call or if the - * app asks about it using getsockopt(SO_ERROR). - */ - sock->sk->sk_err = -err; - } - - return datagrams; - } - - return err; + return datagrams; } SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, From 96b0434c25be8c73822a679be35b29ca5263baea Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Wed, 9 Jun 2010 17:47:38 -0500 Subject: [PATCH 0184/3220] CHROMIUM: dm: boot time specification of dm= This is a wrap-up of three patches pending upstream approval. I'm bundling them because they are interdependent, and it'll be easier to drop them on rebase later. 1. dm: allow a dm-fs-style device to be shared via dm-ioctl Integrates feedback from Alisdair, Mike, and Kiyoshi. Two main changes occur here: - One function is added which allows for a programmatically created mapped device to be inserted into the dm-ioctl hash table. This binds the device to a name and, optionally, a uuid which is needed by udev and allows for userspace management of the mapped device. - dm_table_complete() was extended to handle all of the final functional changes required for the table to be operational once called. 2. init: boot to device-mapper targets without an initr* Add a dm= kernel parameter modeled after the md= parameter from do_mounts_md. It allows for device-mapper targets to be configured at boot time for use early in the boot process (as the root device or otherwise). It also replaces /dev/XXX calls with major:minor opportunistically. The format is dm="name uuid ro,table line 1,table line 2,...". The parser expects the comma to be safe to use as a newline substitute but, otherwise, uses the normal separator of space. Some attempt has been made to make it forgiving of additional spaces (using skip_spaces()). A mapped device created during boot will be assigned a minor of 0 and may be accessed via /dev/dm-0.
An example dm-linear root with no uuid may look like: root=/dev/dm-0 dm="lroot none ro, 0 4096 linear /dev/ubdb 0, 4096 4096 linear /dev/ubdc 0" Once udev is started, /dev/dm-0 will become /dev/mapper/lroot. Older upstream threads: http://marc.info/?l=dm-devel&m=127429492521964&w=2 http://marc.info/?l=dm-devel&m=127429499422096&w=2 http://marc.info/?l=dm-devel&m=127429493922000&w=2 Latest upstream threads: https://patchwork.kernel.org/patch/104859/ https://patchwork.kernel.org/patch/104860/ https://patchwork.kernel.org/patch/104861/ Bug: 27175947 Signed-off-by: Will Drewry Review URL: http://codereview.chromium.org/2020011 Change-Id: I92bd53432a11241228d2e5ac89a3b20d19b05a31 --- Documentation/device-mapper/boot.txt | 42 +++ Documentation/kernel-parameters.txt | 6 + drivers/md/dm-ioctl.c | 39 +++ drivers/md/dm-table.c | 1 + include/linux/device-mapper.h | 6 + init/Makefile | 1 + init/do_mounts.c | 1 + init/do_mounts.h | 10 + init/do_mounts_dm.c | 410 +++++++++++++++++++++++++++ 9 files changed, 516 insertions(+) create mode 100644 Documentation/device-mapper/boot.txt create mode 100644 init/do_mounts_dm.c diff --git a/Documentation/device-mapper/boot.txt b/Documentation/device-mapper/boot.txt new file mode 100644 index 000000000000..adcaad5e5e32 --- /dev/null +++ b/Documentation/device-mapper/boot.txt @@ -0,0 +1,42 @@ +Boot time creation of mapped devices +=================================== + +It is possible to configure a device mapper device to act as the root +device for your system in two ways. + +The first is to build an initial ramdisk which boots to a minimal +userspace which configures the device, then pivot_root(8) into it. + +For simple device mapper configurations, it is possible to boot directly +using the following kernel command line: + +dm="<name> <uuid> <ro>,table line 1,...,table line n" + +name = the name to associate with the device + after boot, udev, if used, will use that name to label + the device node. +uuid = may be 'none' or the UUID desired for the device. +ro = may be "ro" or "rw". If "ro", the device and device table will be + marked read-only. + +Each table line may be as normal when using the dmsetup tool except for +two variations: +1. Any use of commas will be interpreted as a newline +2. Quotation marks cannot be escaped and cannot be used without + terminating the dm= argument. + +Unless renamed by udev, the device node created will be dm-0 as the +first minor number for the device-mapper is used during early creation. + +Example +======= + +- Booting to a linear array made up of user-mode linux block devices: + + dm="lroot none 0, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" \ + root=/dev/dm-0 + +Will boot to a rw dm-linear target of 8192 sectors split across two +block devices identified by their major:minor numbers. After boot, udev +will rename this target to /dev/mapper/lroot (depending on the rules). +No uuid was assigned. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 742f69d18fc8..b02f027cba72 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -56,6 +56,7 @@ parameter is applicable: BLACKFIN Blackfin architecture is enabled. CLK Common clock infrastructure is enabled. CMA Contiguous Memory Area support is enabled. + DM Device mapper support is enabled. DRM Direct Rendering Management support is enabled. DYNAMIC_DEBUG Build in debug messages and enable them at runtime EDD BIOS Enhanced Disk Drive Services (EDD) is enabled @@ -915,6 +916,11 @@ bytes respectively.
Such letter suffixes can also be entirely omitted. dis_ucode_ldr [X86] Disable the microcode loader. + dm= [DM] Allows early creation of a device-mapper device. + See Documentation/device-mapper/boot.txt. + + dmasound= [HW,OSS] Sound subsystem buff + dma_debug=off If the kernel is compiled with DMA_API_DEBUG support, this option disables the debugging code at boot. diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 80a439543259..bc5e9a5b1f30 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1923,6 +1923,45 @@ void dm_interface_exit(void) dm_hash_exit(); } + +/** + * dm_ioctl_export - Permanently export a mapped device via the ioctl interface + * @md: Pointer to mapped_device + * @name: Buffer (size DM_NAME_LEN) for name + * @uuid: Buffer (size DM_UUID_LEN) for uuid or NULL if not desired + */ +int dm_ioctl_export(struct mapped_device *md, const char *name, + const char *uuid) +{ + int r = 0; + struct hash_cell *hc; + + if (!md) { + r = -ENXIO; + goto out; + } + + /* The name and uuid can only be set once. */ + mutex_lock(&dm_hash_cells_mutex); + hc = dm_get_mdptr(md); + mutex_unlock(&dm_hash_cells_mutex); + if (hc) { + DMERR("%s: already exported", dm_device_name(md)); + r = -ENXIO; + goto out; + } + + r = dm_hash_insert(name, uuid, md); + if (r) { + DMERR("%s: could not bind to '%s'", dm_device_name(md), name); + goto out; + } + + /* Let udev know we've changed. */ + dm_kobject_uevent(md, KOBJ_CHANGE, dm_get_event_nr(md)); +out: + return r; +} /** * dm_copy_name_and_uuid - Copy mapped device name & uuid into supplied buffers * @md: Pointer to mapped_device diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 061152a43730..80ede7a70761 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index ec1c61c87d89..87afa0552398 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -380,6 +380,12 @@ void dm_put(struct mapped_device *md); void dm_set_mdptr(struct mapped_device *md, void *ptr); void *dm_get_mdptr(struct mapped_device *md); +/* + * Export the device via the ioctl interface (uses mdptr). + */ +int dm_ioctl_export(struct mapped_device *md, const char *name, + const char *uuid); + /* * A device can still be used while suspended, but I/O is deferred. 
*/ diff --git a/init/Makefile b/init/Makefile index 692b91f1c1d4..243f61de2cba 100644 --- a/init/Makefile +++ b/init/Makefile @@ -15,6 +15,7 @@ mounts-y := do_mounts.o mounts-$(CONFIG_BLK_DEV_RAM) += do_mounts_rd.o mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o mounts-$(CONFIG_BLK_DEV_MD) += do_mounts_md.o +mounts-$(CONFIG_BLK_DEV_DM) += do_mounts_dm.o # dependencies on generated files need to be listed explicitly $(obj)/version.o: include/generated/compile.h diff --git a/init/do_mounts.c b/init/do_mounts.c index dea5de95c2dd..1902a1c80831 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -566,6 +566,7 @@ void __init prepare_namespace(void) wait_for_device_probe(); md_run_setup(); + dm_run_setup(); if (saved_root_name[0]) { root_device_name = saved_root_name; diff --git a/init/do_mounts.h b/init/do_mounts.h index f5b978a9bb92..09d22862e8c3 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -74,3 +74,13 @@ void md_run_setup(void); static inline void md_run_setup(void) {} #endif + +#ifdef CONFIG_BLK_DEV_DM + +void dm_run_setup(void); + +#else + +static inline void dm_run_setup(void) {} + +#endif diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c new file mode 100644 index 000000000000..0fd3411533f3 --- /dev/null +++ b/init/do_mounts_dm.c @@ -0,0 +1,410 @@ +/* do_mounts_dm.c + * Copyright (C) 2010 The Chromium OS Authors + * All Rights Reserved. + * Based on do_mounts_md.c + * + * This file is released under the GPL. + */ +#include +#include +#include + +#include "do_mounts.h" + +#define DM_MAX_NAME 32 +#define DM_MAX_UUID 129 +#define DM_NO_UUID "none" + +#define DM_MSG_PREFIX "init" + +/* Separators used for parsing the dm= argument. */ +#define DM_FIELD_SEP ' ' +#define DM_LINE_SEP ',' + +/* + * When the device-mapper and any targets are compiled into the kernel + * (not a module), one target may be created and used as the root device at + * boot time with the parameters given with the boot line dm=... + * The code for that is here. + */ + +struct dm_setup_target { + sector_t begin; + sector_t length; + char *type; + char *params; + /* simple singly linked list */ + struct dm_setup_target *next; +}; + +static struct { + int minor; + int ro; + char name[DM_MAX_NAME]; + char uuid[DM_MAX_UUID]; + char *targets; + struct dm_setup_target *target; + int target_count; +} dm_setup_args __initdata; + +static __initdata int dm_early_setup; + +static size_t __init get_dm_option(char *str, char **next, char sep) +{ + size_t len = 0; + char *endp = NULL; + + if (!str) + return 0; + + endp = strchr(str, sep); + if (!endp) { /* act like strchrnul */ + len = strlen(str); + endp = str + len; + } else { + len = endp - str; + } + + if (endp == str) + return 0; + + if (!next) + return len; + + if (*endp == 0) { + /* Don't advance past the nul. 
*/ + *next = endp; + } else { + *next = endp + 1; + } + return len; +} + +static int __init dm_setup_args_init(void) +{ + dm_setup_args.minor = 0; + dm_setup_args.ro = 0; + dm_setup_args.target = NULL; + dm_setup_args.target_count = 0; + return 0; +} + +static int __init dm_setup_cleanup(void) +{ + struct dm_setup_target *target = dm_setup_args.target; + struct dm_setup_target *old_target = NULL; + while (target) { + kfree(target->type); + kfree(target->params); + old_target = target; + target = target->next; + kfree(old_target); + dm_setup_args.target_count--; + } + BUG_ON(dm_setup_args.target_count); + return 0; +} + +static char * __init dm_setup_parse_device_args(char *str) +{ + char *next = NULL; + size_t len = 0; + + /* Grab the logical name of the device to be exported to udev */ + len = get_dm_option(str, &next, DM_FIELD_SEP); + if (!len) { + DMERR("failed to parse device name"); + goto parse_fail; + } + len = min(len + 1, sizeof(dm_setup_args.name)); + strlcpy(dm_setup_args.name, str, len); /* includes nul */ + str = skip_spaces(next); + + /* Grab the UUID value or "none" */ + len = get_dm_option(str, &next, DM_FIELD_SEP); + if (!len) { + DMERR("failed to parse device uuid"); + goto parse_fail; + } + len = min(len + 1, sizeof(dm_setup_args.uuid)); + strlcpy(dm_setup_args.uuid, str, len); + str = skip_spaces(next); + + /* Determine if the table/device will be read only or read-write */ + if (!strncmp("ro,", str, 3)) { + dm_setup_args.ro = 1; + } else if (!strncmp("rw,", str, 3)) { + dm_setup_args.ro = 0; + } else { + DMERR("failed to parse table mode"); + goto parse_fail; + } + str = skip_spaces(str + 3); + + return str; + +parse_fail: + return NULL; +} + +static void __init dm_substitute_devices(char *str, size_t str_len) +{ + char *candidate = str; + char *candidate_end = str; + char old_char; + size_t len = 0; + dev_t dev; + + if (str_len < 3) + return; + + while (str && *str) { + candidate = strchr(str, '/'); + if (!candidate) + break; + + /* Avoid embedded slashes */ + if (candidate != str && *(candidate - 1) != DM_FIELD_SEP) { + str = strchr(candidate, DM_FIELD_SEP); + continue; + } + + len = get_dm_option(candidate, &candidate_end, DM_FIELD_SEP); + str = skip_spaces(candidate_end); + if (len < 3 || len > 37) /* name_to_dev_t max; maj:mix min */ + continue; + + /* Temporarily terminate with a nul */ + candidate_end--; + old_char = *candidate_end; + *candidate_end = '\0'; + + DMDEBUG("converting candidate device '%s' to dev_t", candidate); + /* Use the boot-time specific device naming */ + dev = name_to_dev_t(candidate); + *candidate_end = old_char; + + DMDEBUG(" -> %u", dev); + /* No suitable replacement found */ + if (!dev) + continue; + + /* Rewrite the /dev/path as a major:minor */ + len = snprintf(candidate, len, "%u:%u", MAJOR(dev), MINOR(dev)); + if (!len) { + DMERR("error substituting device major/minor."); + break; + } + candidate += len; + /* Pad out with spaces (fixing our nul) */ + while (candidate < candidate_end) + *(candidate++) = DM_FIELD_SEP; + } +} + +static int __init dm_setup_parse_targets(char *str) +{ + char *next = NULL; + size_t len = 0; + struct dm_setup_target **target = NULL; + + /* Targets are defined as per the table format but with a + * comma as a newline separator. 
*/ + target = &dm_setup_args.target; + while (str && *str) { + *target = kzalloc(sizeof(struct dm_setup_target), GFP_KERNEL); + if (!*target) { + DMERR("failed to allocate memory for target %d", + dm_setup_args.target_count); + goto parse_fail; + } + dm_setup_args.target_count++; + + (*target)->begin = simple_strtoull(str, &next, 10); + if (!next || *next != DM_FIELD_SEP) { + DMERR("failed to parse starting sector for target %d", + dm_setup_args.target_count - 1); + goto parse_fail; + } + str = skip_spaces(next + 1); + + (*target)->length = simple_strtoull(str, &next, 10); + if (!next || *next != DM_FIELD_SEP) { + DMERR("failed to parse length for target %d", + dm_setup_args.target_count - 1); + goto parse_fail; + } + str = skip_spaces(next + 1); + + len = get_dm_option(str, &next, DM_FIELD_SEP); + if (!len || + !((*target)->type = kstrndup(str, len, GFP_KERNEL))) { + DMERR("failed to parse type for target %d", + dm_setup_args.target_count - 1); + goto parse_fail; + } + str = skip_spaces(next); + + len = get_dm_option(str, &next, DM_LINE_SEP); + if (!len || + !((*target)->params = kstrndup(str, len, GFP_KERNEL))) { + DMERR("failed to parse params for target %d", + dm_setup_args.target_count - 1); + goto parse_fail; + } + str = skip_spaces(next); + + /* Before moving on, walk through the copied target and + * attempt to replace all /dev/xxx with the major:minor number. + * It may not be possible to resolve them traditionally at + * boot-time. */ + dm_substitute_devices((*target)->params, len); + + target = &((*target)->next); + } + DMDEBUG("parsed %d targets", dm_setup_args.target_count); + + return 0; + +parse_fail: + return 1; +} + +/* + * Parse the command-line parameters given our kernel, but do not + * actually try to invoke the DM device now; that is handled by + * dm_setup_drive after the low-level disk drivers have initialised. + * dm format is as follows: + * dm="name uuid fmode,[table line 1],[table line 2],..." + * May be used with root=/dev/dm-0 as it always uses the first dm minor. + */ + +static int __init dm_setup(char *str) +{ + dm_setup_args_init(); + + str = dm_setup_parse_device_args(str); + if (!str) { + DMDEBUG("str is NULL"); + goto parse_fail; + } + + /* Target parsing is delayed until we have dynamic memory */ + dm_setup_args.targets = str; + + printk(KERN_INFO "dm: will configure '%s' on dm-%d\n", + dm_setup_args.name, dm_setup_args.minor); + + dm_early_setup = 1; + return 1; + +parse_fail: + printk(KERN_WARNING "dm: Invalid arguments supplied to dm=.\n"); + return 0; +} + + +static void __init dm_setup_drive(void) +{ + struct mapped_device *md = NULL; + struct dm_table *table = NULL; + struct dm_setup_target *target; + char *uuid = dm_setup_args.uuid; + fmode_t fmode = FMODE_READ; + + /* Finish parsing the targets. */ + if (dm_setup_parse_targets(dm_setup_args.targets)) + goto parse_fail; + + if (dm_create(dm_setup_args.minor, &md)) { + DMDEBUG("failed to create the device"); + goto dm_create_fail; + } + DMDEBUG("created device '%s'", dm_device_name(md)); + + /* In addition to flagging the table below, the disk must be + * set explicitly ro/rw. 
*/ + set_disk_ro(dm_disk(md), dm_setup_args.ro); + + if (!dm_setup_args.ro) + fmode |= FMODE_WRITE; + if (dm_table_create(&table, fmode, dm_setup_args.target_count, md)) { + DMDEBUG("failed to create the table"); + goto dm_table_create_fail; + } + + target = dm_setup_args.target; + while (target) { + DMINFO("adding target '%llu %llu %s %s'", + (unsigned long long) target->begin, + (unsigned long long) target->length, target->type, + target->params); + if (dm_table_add_target(table, target->type, target->begin, + target->length, target->params)) { + DMDEBUG("failed to add the target to the table"); + goto add_target_fail; + } + target = target->next; + } + + if (dm_table_complete(table)) { + DMDEBUG("failed to complete the table"); + goto table_complete_fail; + } + + /* Suspend the device so that we can bind it to the table. */ + if (dm_suspend(md, 0)) { + DMDEBUG("failed to suspend the device pre-bind"); + goto suspend_fail; + } + + /* Bind the table to the device. This is the only way to associate + * md->map with the table and set the disk capacity directly. */ + if (dm_swap_table(md, table)) { /* should return NULL. */ + DMDEBUG("failed to bind the device to the table"); + goto table_bind_fail; + } + + /* Finally, resume and the device should be ready. */ + if (dm_resume(md)) { + DMDEBUG("failed to resume the device"); + goto resume_fail; + } + + /* Export the dm device via the ioctl interface */ + if (!strcmp(DM_NO_UUID, dm_setup_args.uuid)) + uuid = NULL; + if (dm_ioctl_export(md, dm_setup_args.name, uuid)) { + DMDEBUG("failed to export device with given name and uuid"); + goto export_fail; + } + printk(KERN_INFO "dm: dm-%d is ready\n", dm_setup_args.minor); + + dm_setup_cleanup(); + return; + +export_fail: +resume_fail: +table_bind_fail: +suspend_fail: +table_complete_fail: +add_target_fail: + dm_table_put(table); +dm_table_create_fail: + dm_put(md); +dm_create_fail: + dm_setup_cleanup(); +parse_fail: + printk(KERN_WARNING "dm: starting dm-%d (%s) failed\n", + dm_setup_args.minor, dm_setup_args.name); +} + +__setup("dm=", dm_setup); + +void __init dm_run_setup(void) +{ + if (!dm_early_setup) + return; + printk(KERN_INFO "dm: attempting early device configuration.\n"); + dm_setup_drive(); +} From 5a77db78398a7247ab6bb6ff9720bae19d1f9ff4 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 8 Feb 2016 16:47:41 -0800 Subject: [PATCH 0185/3220] ANDROID: dm: Rebase on top of 4.1 1. "dm: optimize use SRCU and RCU" removes the use of dm_table_put. 2. "dm: remove request-based logic from make_request_fn wrapper" necessitates calling dm_setup_md_queue or else the request_queue's make_request_fn pointer ends being unset. [ 7.711600] Internal error: Oops - bad mode: 0 [#1] PREEMPT SMP [ 7.717519] CPU: 1 PID: 1 Comm: swapper/0 Tainted: G W 4.1.15-02273-gb057d16-dirty #33 [ 7.726559] Hardware name: HiKey Development Board (DT) [ 7.731779] task: ffffffc005f8acc0 ti: ffffffc005f8c000 task.ti: ffffffc005f8c000 [ 7.739257] PC is at 0x0 [ 7.741787] LR is at generic_make_request+0x8c/0x108 .... 
[ 9.082931] Call trace: [ 9.085372] [< (null)>] (null) [ 9.090074] [] submit_bio+0x98/0x1e0 [ 9.095212] [] _submit_bh+0x120/0x1f0 [ 9.096165] cfg80211: Calling CRDA to update world regulatory domain [ 9.106781] [] __bread_gfp+0x94/0x114 [ 9.112004] [] ext4_fill_super+0x18c/0x2d64 [ 9.117750] [] mount_bdev+0x194/0x1c0 [ 9.122973] [] ext4_mount+0x14/0x1c [ 9.128021] [] mount_fs+0x3c/0x194 [ 9.132985] [] vfs_kern_mount+0x4c/0x134 [ 9.138467] [] do_mount+0x204/0xbbc [ 9.143514] [] SyS_mount+0x94/0xe8 [ 9.148479] [] mount_block_root+0x120/0x24c [ 9.154222] [] mount_root+0x110/0x12c [ 9.159443] [] prepare_namespace+0x170/0x1b8 [ 9.165273] [] kernel_init_freeable+0x23c/0x260 [ 9.171365] [] kernel_init+0x10/0x118 [ 9.176589] Code: bad PC value [ 9.179807] ---[ end trace 75e1bc52ba364d13 ]--- Bug: 27175947 Signed-off-by: Badhri Jagan Sridharan Change-Id: I952d86fd1475f0825f9be1386e3497b36127abd0 --- init/do_mounts_dm.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c index 0fd3411533f3..f521bc5ae248 100644 --- a/init/do_mounts_dm.c +++ b/init/do_mounts_dm.c @@ -10,6 +10,7 @@ #include #include "do_mounts.h" +#include "../drivers/md/dm.h" #define DM_MAX_NAME 32 #define DM_MAX_UUID 129 @@ -333,6 +334,7 @@ static void __init dm_setup_drive(void) goto dm_table_create_fail; } + dm_lock_md_type(md); target = dm_setup_args.target; while (target) { DMINFO("adding target '%llu %llu %s %s'", @@ -352,6 +354,17 @@ static void __init dm_setup_drive(void) goto table_complete_fail; } + if (dm_get_md_type(md) == DM_TYPE_NONE) { + dm_set_md_type(md, dm_table_get_type(table)); + if (dm_setup_md_queue(md)) { + DMWARN("unable to set up device queue for new table."); + goto setup_md_queue_fail; + } + } else if (dm_get_md_type(md) != dm_table_get_type(table)) { + DMWARN("can't change device type after initial table load."); + goto setup_md_queue_fail; + } + /* Suspend the device so that we can bind it to the table. */ if (dm_suspend(md, 0)) { DMDEBUG("failed to suspend the device pre-bind"); @@ -380,6 +393,7 @@ static void __init dm_setup_drive(void) } printk(KERN_INFO "dm: dm-%d is ready\n", dm_setup_args.minor); + dm_unlock_md_type(md); dm_setup_cleanup(); return; @@ -387,9 +401,10 @@ export_fail: resume_fail: table_bind_fail: suspend_fail: +setup_md_queue_fail: table_complete_fail: add_target_fail: - dm_table_put(table); + dm_unlock_md_type(md); dm_table_create_fail: dm_put(md); dm_create_fail: From 26cab56d9af49e78dd1b395454f484003cc85575 Mon Sep 17 00:00:00 2001 From: Jeremy Compostella Date: Mon, 2 May 2016 17:29:28 +0200 Subject: [PATCH 0186/3220] ANDROID: dm: fix dm_substitute_devices() When candidate is the last parameter, candidate_end points to the '\0' character and not the DM_FIELD_SEP character. In such a situation, we should not move the candidate_end pointer one character backward. 
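To spell out the off-by-one being fixed, a short worked illustration (names as in do_mounts_dm.c above; the example path is hypothetical):

    /*
     * params = "/dev/vdb"        last parameter, no trailing separator
     *
     * get_dm_option(candidate, &candidate_end, DM_FIELD_SEP) finds no
     * separator, so it leaves candidate_end pointing at the final '\0'
     * (for any earlier parameter it points one past the separator).
     *
     * old code: candidate_end--;          now points at 'b'
     *           *candidate_end = '\0';    lookup sees "/dev/vd", so
     *                                     name_to_dev_t() resolves the
     *                                     wrong device, or none at all
     *
     * fixed:    if (*candidate_end)       step back only when we stopped
     *                   candidate_end--;  on a real separator character
     */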
Signed-off-by: Jeremy Compostella --- init/do_mounts_dm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c index f521bc5ae248..ecda58df9a19 100644 --- a/init/do_mounts_dm.c +++ b/init/do_mounts_dm.c @@ -176,7 +176,8 @@ static void __init dm_substitute_devices(char *str, size_t str_len) continue; /* Temporarily terminate with a nul */ - candidate_end--; + if (*candidate_end) + candidate_end--; old_char = *candidate_end; *candidate_end = '\0'; From 36d01a590d035804dc100e105facf4049cf6fe43 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 14 Dec 2015 20:09:39 -0800 Subject: [PATCH 0187/3220] ANDROID: dm: Add android verity target This device-mapper target is virtually a VERITY target. This target is setup by reading the metadata contents piggybacked to the actual data blocks in the block device. The signature of the metadata contents are verified against the key included in the system keyring. Upon success, the underlying verity target is setup. BUG: 27175947 Change-Id: I7e99644a0960ac8279f02c0158ed20999510ea97 Signed-off-by: Badhri Jagan Sridharan --- drivers/md/Kconfig | 16 + drivers/md/Makefile | 4 + drivers/md/dm-android-verity.c | 771 +++++++++++++++++++++++++++++++++ drivers/md/dm-android-verity.h | 92 ++++ drivers/md/dm-verity-target.c | 12 +- drivers/md/dm-verity.h | 12 + 6 files changed, 901 insertions(+), 6 deletions(-) create mode 100644 drivers/md/dm-android-verity.c create mode 100644 drivers/md/dm-android-verity.h diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index d8b0ab6f3753..96b419b544ed 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -500,4 +500,20 @@ config DM_LOG_WRITES If unsure, say N. +config DM_ANDROID_VERITY + bool "Android verity target support" + depends on DM_VERITY + depends on X509_CERTIFICATE_PARSER + depends on SYSTEM_TRUSTED_KEYRING + depends on PUBLIC_KEY_ALGO_RSA + depends on KEYS + depends on ASYMMETRIC_KEY_TYPE + depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE + ---help--- + This device-mapper target is virtually a VERITY target. This + target is setup by reading the metadata contents piggybacked + to the actual data blocks in the block device. The signature + of the metadata contents are verified against the key included + in the system keyring. Upon success, the underlying verity + target is setup. endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 62a65764e8e0..c8fb00d8cc36 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -68,3 +68,7 @@ endif ifeq ($(CONFIG_DM_VERITY_FEC),y) dm-verity-objs += dm-verity-fec.o endif + +ifeq ($(CONFIG_DM_ANDROID_VERITY),y) +dm-verity-objs += dm-android-verity.o +endif diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c new file mode 100644 index 000000000000..c77c9fa7a962 --- /dev/null +++ b/drivers/md/dm-android-verity.c @@ -0,0 +1,771 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "dm-verity.h" +#include "dm-android-verity.h" + +static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; +static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; + +static int __init verified_boot_state_param(char *line) +{ + strlcpy(verifiedbootstate, line, sizeof(verifiedbootstate)); + return 1; +} + +__setup("androidboot.verifiedbootstate=", verified_boot_state_param); + +static int __init verity_mode_param(char *line) +{ + strlcpy(veritymode, line, sizeof(veritymode)); + return 1; +} + +__setup("androidboot.veritymode=", verity_mode_param); + +static int table_extract_mpi_array(struct public_key_signature *pks, + const void *data, size_t len) +{ + MPI mpi = mpi_read_raw_data(data, len); + + if (!mpi) { + DMERR("Error while allocating mpi array"); + return -ENOMEM; + } + + pks->mpi[0] = mpi; + pks->nr_mpi = 1; + return 0; +} + +static struct public_key_signature *table_make_digest( + enum pkey_hash_algo hash, + const void *table, + unsigned long table_len) +{ + struct public_key_signature *pks = NULL; + struct crypto_shash *tfm; + struct shash_desc *desc; + size_t digest_size, desc_size; + int ret; + + /* Allocate the hashing algorithm we're going to need and find out how + * big the hash operational data will be. + */ + tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); + if (IS_ERR(tfm)) + return ERR_CAST(tfm); + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + digest_size = crypto_shash_digestsize(tfm); + + /* We allocate the hash operational data storage on the end of out + * context data and the digest output buffer on the end of that. 
+ */ + ret = -ENOMEM; + pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL); + if (!pks) + goto error; + + pks->pkey_hash_algo = hash; + pks->digest = (u8 *)pks + sizeof(*pks) + desc_size; + pks->digest_size = digest_size; + + desc = (struct shash_desc *)(pks + 1); + desc->tfm = tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto error; + + ret = crypto_shash_finup(desc, table, table_len, pks->digest); + if (ret < 0) + goto error; + + crypto_free_shash(tfm); + return pks; + +error: + kfree(pks); + crypto_free_shash(tfm); + return ERR_PTR(ret); +} + +static int read_block_dev(struct bio_read *payload, struct block_device *bdev, + sector_t offset, int length) +{ + struct bio *bio; + int err = 0, i; + + payload->number_of_pages = DIV_ROUND_UP(length, PAGE_SIZE); + + bio = bio_alloc(GFP_KERNEL, payload->number_of_pages); + if (!bio) { + DMERR("Error while allocating bio"); + return -ENOMEM; + } + + bio->bi_bdev = bdev; + bio->bi_sector = offset; + + payload->page_io = kzalloc(sizeof(struct page *) * + payload->number_of_pages, GFP_KERNEL); + if (!payload->page_io) { + DMERR("page_io array alloc failed"); + err = -ENOMEM; + goto free_bio; + } + + for (i = 0; i < payload->number_of_pages; i++) { + payload->page_io[i] = alloc_page(GFP_KERNEL); + if (!payload->page_io[i]) { + DMERR("alloc_page failed"); + err = -ENOMEM; + goto free_pages; + } + if (!bio_add_page(bio, payload->page_io[i], PAGE_SIZE, 0)) { + DMERR("bio_add_page error"); + err = -EIO; + goto free_pages; + } + } + + if (!submit_bio_wait(READ, bio)) + /* success */ + goto free_bio; + DMERR("bio read failed"); + err = -EIO; + +free_pages: + for (i = 0; i < payload->number_of_pages; i++) + if (payload->page_io[i]) + __free_page(payload->page_io[i]); + kfree(payload->page_io); +free_bio: + bio_put(bio); + return err; +} + +static inline u64 fec_div_round_up(u64 x, u64 y) +{ + u64 remainder; + + return div64_u64_rem(x, y, &remainder) + + (remainder > 0 ? 1 : 0); +} + +static inline void populate_fec_metadata(struct fec_header *header, + struct fec_ecc_metadata *ecc) +{ + ecc->blocks = fec_div_round_up(le64_to_cpu(header->inp_size), + FEC_BLOCK_SIZE); + ecc->roots = le32_to_cpu(header->roots); + ecc->start = le64_to_cpu(header->inp_size); +} + +static inline int validate_fec_header(struct fec_header *header, u64 offset) +{ + /* move offset to make the sanity check work for backup header + * as well. */ + offset -= offset % FEC_BLOCK_SIZE; + if (le32_to_cpu(header->magic) != FEC_MAGIC || + le32_to_cpu(header->version) != FEC_VERSION || + le32_to_cpu(header->size) != sizeof(struct fec_header) || + le32_to_cpu(header->roots) == 0 || + le32_to_cpu(header->roots) >= FEC_RSM || + offset < le32_to_cpu(header->fec_size) || + offset - le32_to_cpu(header->fec_size) != + le64_to_cpu(header->inp_size)) + return -EINVAL; + + return 0; +} + +static int extract_fec_header(dev_t dev, struct fec_header *fec, + struct fec_ecc_metadata *ecc) +{ + u64 device_size; + struct bio_read payload; + int i, err = 0; + struct block_device *bdev; + + bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + + if (IS_ERR(bdev)) { + DMERR("bdev get error"); + return PTR_ERR(bdev); + } + + device_size = i_size_read(bdev->bd_inode); + + /* fec metadata size is a power of 2 and PAGE_SIZE + * is a power of 2 as well. 
+ */ + BUG_ON(FEC_BLOCK_SIZE > PAGE_SIZE); + /* 512 byte sector alignment */ + BUG_ON(((device_size - FEC_BLOCK_SIZE) % (1 << SECTOR_SHIFT)) != 0); + + err = read_block_dev(&payload, bdev, (device_size - + FEC_BLOCK_SIZE) / (1 << SECTOR_SHIFT), FEC_BLOCK_SIZE); + if (err) { + DMERR("Error while reading verity metadata"); + goto error; + } + + BUG_ON(sizeof(struct fec_header) > PAGE_SIZE); + memcpy(fec, page_address(payload.page_io[0]), + sizeof(*fec)); + + ecc->valid = true; + if (validate_fec_header(fec, device_size - FEC_BLOCK_SIZE)) { + /* Try the backup header */ + memcpy(fec, page_address(payload.page_io[0]) + FEC_BLOCK_SIZE + - sizeof(*fec) , + sizeof(*fec)); + if (validate_fec_header(fec, device_size - + sizeof(struct fec_header))) + ecc->valid = false; + } + + if (ecc->valid) + populate_fec_metadata(fec, ecc); + + for (i = 0; i < payload.number_of_pages; i++) + __free_page(payload.page_io[i]); + kfree(payload.page_io); + +error: + blkdev_put(bdev, FMODE_READ); + return err; +} +static void find_metadata_offset(struct fec_header *fec, + struct block_device *bdev, u64 *metadata_offset) +{ + u64 device_size; + + device_size = i_size_read(bdev->bd_inode); + + if (le32_to_cpu(fec->magic) == FEC_MAGIC) + *metadata_offset = le64_to_cpu(fec->inp_size) - + VERITY_METADATA_SIZE; + else + *metadata_offset = device_size - VERITY_METADATA_SIZE; +} + +static struct android_metadata *extract_metadata(dev_t dev, + struct fec_header *fec) +{ + struct block_device *bdev; + struct android_metadata_header *header; + struct android_metadata *uninitialized_var(metadata); + int i; + u32 table_length, copy_length, offset; + u64 metadata_offset; + struct bio_read payload; + int err = 0; + + bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + + if (IS_ERR(bdev)) { + DMERR("blkdev_get_by_dev failed"); + return ERR_CAST(bdev); + } + + find_metadata_offset(fec, bdev, &metadata_offset); + + /* Verity metadata size is a power of 2 and PAGE_SIZE + * is a power of 2 as well. + * PAGE_SIZE is also a multiple of 512 bytes. 
+ */ + if (VERITY_METADATA_SIZE > PAGE_SIZE) + BUG_ON(VERITY_METADATA_SIZE % PAGE_SIZE != 0); + /* 512 byte sector alignment */ + BUG_ON(metadata_offset % (1 << SECTOR_SHIFT) != 0); + + err = read_block_dev(&payload, bdev, metadata_offset / + (1 << SECTOR_SHIFT), VERITY_METADATA_SIZE); + if (err) { + DMERR("Error while reading verity metadata"); + metadata = ERR_PTR(err); + goto blkdev_release; + } + + header = kzalloc(sizeof(*header), GFP_KERNEL); + if (!header) { + DMERR("kzalloc failed for header"); + err = -ENOMEM; + goto free_payload; + } + + memcpy(header, page_address(payload.page_io[0]), + sizeof(*header)); + + DMINFO("bio magic_number:%u protocol_version:%d table_length:%u", + le32_to_cpu(header->magic_number), + le32_to_cpu(header->protocol_version), + le32_to_cpu(header->table_length)); + + metadata = kzalloc(sizeof(*metadata), GFP_KERNEL); + if (!metadata) { + DMERR("kzalloc for metadata failed"); + err = -ENOMEM; + goto free_header; + } + + metadata->header = header; + table_length = le32_to_cpu(header->table_length); + + if (table_length == 0 || + table_length > (VERITY_METADATA_SIZE - + sizeof(struct android_metadata_header))) + goto free_metadata; + + metadata->verity_table = kzalloc(table_length + 1, GFP_KERNEL); + + if (!metadata->verity_table) { + DMERR("kzalloc verity_table failed"); + err = -ENOMEM; + goto free_metadata; + } + + if (sizeof(struct android_metadata_header) + + table_length <= PAGE_SIZE) { + memcpy(metadata->verity_table, page_address(payload.page_io[0]) + + sizeof(struct android_metadata_header), + table_length); + } else { + copy_length = PAGE_SIZE - + sizeof(struct android_metadata_header); + memcpy(metadata->verity_table, page_address(payload.page_io[0]) + + sizeof(struct android_metadata_header), + copy_length); + table_length -= copy_length; + offset = copy_length; + i = 1; + while (table_length != 0) { + if (table_length > PAGE_SIZE) { + memcpy(metadata->verity_table + offset, + page_address(payload.page_io[i]), + PAGE_SIZE); + offset += PAGE_SIZE; + table_length -= PAGE_SIZE; + } else { + memcpy(metadata->verity_table + offset, + page_address(payload.page_io[i]), + table_length); + table_length = 0; + } + i++; + } + } + metadata->verity_table[table_length] = '\0'; + + goto free_payload; + +free_metadata: + kfree(metadata); +free_header: + kfree(header); + metadata = ERR_PTR(err); +free_payload: + for (i = 0; i < payload.number_of_pages; i++) + if (payload.page_io[i]) + __free_page(payload.page_io[i]); + kfree(payload.page_io); + + DMINFO("verity_table: %s", metadata->verity_table); +blkdev_release: + blkdev_put(bdev, FMODE_READ); + return metadata; +} + +/* helper functions to extract properties from dts */ +const char *find_dt_value(const char *name) +{ + struct device_node *firmware; + const char *value; + + firmware = of_find_node_by_path("/firmware/android"); + if (!firmware) + return NULL; + value = of_get_property(firmware, name, NULL); + of_node_put(firmware); + + return value; +} + +static bool is_unlocked(void) +{ + static const char unlocked[] = "orange"; + static const char verified_boot_prop[] = "verifiedbootstate"; + const char *value; + + value = find_dt_value(verified_boot_prop); + if (!value) + value = verifiedbootstate; + + return !strncmp(value, unlocked, sizeof(unlocked) - 1); +} + +static int verity_mode(void) +{ + static const char enforcing[] = "enforcing"; + static const char verified_mode_prop[] = "veritymode"; + const char *value; + + value = find_dt_value(verified_mode_prop); + if (!value) + value = veritymode; + if 
(!strncmp(value, enforcing, sizeof(enforcing) - 1)) + return DM_VERITY_MODE_RESTART; + + return DM_VERITY_MODE_EIO; +} + +static int verify_header(struct android_metadata_header *header) +{ + int retval = -EINVAL; + + if (is_unlocked() && le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE) { + retval = VERITY_STATE_DISABLE; + return retval; + } + + if (!(le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_NUMBER) || + (le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE)) { + DMERR("Incorrect magic number"); + return retval; + } + + if (le32_to_cpu(header->protocol_version) != + VERITY_METADATA_VERSION) { + DMERR("Unsupported version %u", + le32_to_cpu(header->protocol_version)); + return retval; + } + + return 0; +} + +static int verify_verity_signature(char *key_id, + struct android_metadata *metadata) +{ + key_ref_t key_ref; + struct key *key; + struct public_key_signature *pks = NULL; + int retval = -EINVAL; + + key_ref = keyring_search(make_key_ref(system_trusted_keyring, 1), + &key_type_asymmetric, key_id); + + if (IS_ERR(key_ref)) { + DMERR("keyring: key not found"); + return -ENOKEY; + } + + key = key_ref_to_ptr(key_ref); + + pks = table_make_digest(PKEY_HASH_SHA256, + (const void *)metadata->verity_table, + le32_to_cpu(metadata->header->table_length)); + + if (IS_ERR(pks)) { + DMERR("hashing failed"); + goto error; + } + + retval = table_extract_mpi_array(pks, &metadata->header->signature[0], + RSANUMBYTES); + if (retval < 0) { + DMERR("Error extracting mpi %d", retval); + goto error; + } + + retval = verify_signature(key, pks); + mpi_free(pks->rsa.s); +error: + kfree(pks); + key_put(key); + + return retval; +} + +static void handle_error(void) +{ + int mode = verity_mode(); + if (mode == DM_VERITY_MODE_RESTART) { + DMERR("triggering restart"); + kernel_restart("dm-verity device corrupted"); + } else { + DMERR("Mounting verity root failed"); + } +} + +static inline bool test_mult_overflow(sector_t a, u32 b) +{ + sector_t r = (sector_t)~0ULL; + + sector_div(r, b); + return a > r; +} + +/* + * Target parameters: + * Key id of the public key in the system keyring. + * Verity metadata's signature would be verified against + * this. If the key id contains spaces, replace them + * with '#'. + * The block device for which dm-verity is being setup. 
+ */ +static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + dev_t uninitialized_var(dev); + struct android_metadata *uninitialized_var(metadata); + int err = 0, i, mode; + char *key_id, *table_ptr, dummy, + *verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS]; + /* One for specifying number of opt args and one for mode */ + sector_t data_sectors; + u32 data_block_size; + unsigned int major, minor, + no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; + struct fec_header fec; + struct fec_ecc_metadata uninitialized_var(ecc); + char buf[FEC_ARG_LENGTH], *buf_ptr; + unsigned long long tmpll; + + if (argc != 2) { + DMERR("Incorrect number of arguments"); + handle_error(); + return -EINVAL; + } + + /* should come as one of the arguments for the verity target */ + key_id = argv[0]; + strreplace(argv[0], '#', ' '); + + if (sscanf(argv[1], "%u:%u%c", &major, &minor, &dummy) == 2) { + dev = MKDEV(major, minor); + if (MAJOR(dev) != major || MINOR(dev) != minor) { + DMERR("Incorrect bdev major minor number"); + handle_error(); + return -EOVERFLOW; + } + } + + DMINFO("key:%s dev:%s", argv[0], argv[1]); + + if (extract_fec_header(dev, &fec, &ecc)) { + DMERR("Error while extracting fec header"); + handle_error(); + return -EINVAL; + } + + metadata = extract_metadata(dev, &fec); + + if (IS_ERR(metadata)) { + DMERR("Error while extracting metadata"); + handle_error(); + return -EINVAL; + } + + err = verify_header(metadata->header); + + if (err == VERITY_STATE_DISABLE) { + DMERR("Mounting root with verity disabled"); + return -EINVAL; + } else if (err) { + DMERR("Verity header handle error"); + handle_error(); + goto free_metadata; + } + + err = verify_verity_signature(key_id, metadata); + + if (err) { + DMERR("Signature verification failed"); + handle_error(); + goto free_metadata; + } else + DMINFO("Signature verification success"); + + table_ptr = metadata->verity_table; + + for (i = 0; i < VERITY_TABLE_ARGS; i++) { + verity_table_args[i] = strsep(&table_ptr, " "); + if (verity_table_args[i] == NULL) + break; + } + + if (i != VERITY_TABLE_ARGS) { + DMERR("Verity table not in the expected format"); + err = -EINVAL; + handle_error(); + goto free_metadata; + } + + if (sscanf(verity_table_args[5], "%llu%c", &tmpll, &dummy) + != 1) { + DMERR("Verity table not in the expected format"); + handle_error(); + err = -EINVAL; + goto free_metadata; + } + + if (tmpll > ULONG_MAX) { + DMERR(" too large. Forgot to turn on CONFIG_LBDAF?"); + handle_error(); + err = -EINVAL; + goto free_metadata; + } + + data_sectors = tmpll; + + if (sscanf(verity_table_args[3], "%u%c", &data_block_size, &dummy) + != 1) { + DMERR("Verity table not in the expected format"); + handle_error(); + err = -EINVAL; + goto free_metadata; + } + + if (test_mult_overflow(data_sectors, data_block_size >> + SECTOR_SHIFT)) { + DMERR("data_sectors too large"); + handle_error(); + err = -EOVERFLOW; + goto free_metadata; + } + + data_sectors *= data_block_size >> SECTOR_SHIFT; + DMINFO("Data sectors %llu", (unsigned long long)data_sectors); + + /* update target length */ + ti->len = data_sectors; + + /*substitute data_dev and hash_dev*/ + verity_table_args[1] = argv[1]; + verity_table_args[2] = argv[1]; + + mode = verity_mode(); + + if (ecc.valid && IS_BUILTIN(CONFIG_DM_VERITY_FEC)) { + if (mode) { + err = snprintf(buf, FEC_ARG_LENGTH, + "%u %s " VERITY_TABLE_OPT_FEC_FORMAT, + 1 + VERITY_TABLE_OPT_FEC_ARGS, + mode == DM_VERITY_MODE_RESTART ? 
+ VERITY_TABLE_OPT_RESTART : VERITY_TABLE_OPT_LOGGING, + argv[1], ecc.start / FEC_BLOCK_SIZE, ecc.blocks, + ecc.roots); + } else { + err = snprintf(buf, FEC_ARG_LENGTH, + "%u " VERITY_TABLE_OPT_FEC_FORMAT, + VERITY_TABLE_OPT_FEC_ARGS, argv[1], + ecc.start / FEC_BLOCK_SIZE, ecc.blocks, ecc.roots); + } + } else if (mode) { + err = snprintf(buf, FEC_ARG_LENGTH, + "2 " VERITY_TABLE_OPT_IGNZERO " %s", + mode == DM_VERITY_MODE_RESTART ? + VERITY_TABLE_OPT_RESTART : VERITY_TABLE_OPT_LOGGING); + } else { + err = snprintf(buf, FEC_ARG_LENGTH, "1 %s", + "ignore_zero_blocks"); + } + + if (err < 0 || err >= FEC_ARG_LENGTH) + goto free_metadata; + + buf_ptr = buf; + + for (i = VERITY_TABLE_ARGS; i < (VERITY_TABLE_ARGS + + VERITY_TABLE_OPT_FEC_ARGS + 2); i++) { + verity_table_args[i] = strsep(&buf_ptr, " "); + if (verity_table_args[i] == NULL) { + no_of_args = i; + break; + } + } + + err = verity_ctr(ti, no_of_args, verity_table_args); + +free_metadata: + kfree(metadata->header); + kfree(metadata->verity_table); + kfree(metadata); + return err; +} + +static struct target_type android_verity_target = { + .name = "android-verity", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = android_verity_ctr, + .dtr = verity_dtr, + .map = verity_map, + .status = verity_status, + .ioctl = verity_ioctl, + .merge = verity_merge, + .iterate_devices = verity_iterate_devices, + .io_hints = verity_io_hints, +}; + +static int __init dm_android_verity_init(void) +{ + int r; + + r = dm_register_target(&android_verity_target); + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void __exit dm_android_verity_exit(void) +{ + dm_unregister_target(&android_verity_target); +} + +module_init(dm_android_verity_init); +module_exit(dm_android_verity_exit); diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h new file mode 100644 index 000000000000..11477ffd2243 --- /dev/null +++ b/drivers/md/dm-android-verity.h @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef DM_ANDROID_VERITY_H +#define DM_ANDROID_VERITY_H + +#include + +#define RSANUMBYTES 256 +#define VERITY_METADATA_MAGIC_NUMBER 0xb001b001 +#define VERITY_METADATA_MAGIC_DISABLE 0x46464f56 +#define VERITY_METADATA_VERSION 0 +#define VERITY_STATE_DISABLE 1 +#define DATA_BLOCK_SIZE (4 * 1024) +#define VERITY_METADATA_SIZE (8 * DATA_BLOCK_SIZE) +#define VERITY_TABLE_ARGS 10 +#define VERITY_COMMANDLINE_PARAM_LENGTH 20 + +#define FEC_MAGIC 0xFECFECFE +#define FEC_BLOCK_SIZE (4 * 1024) +#define FEC_VERSION 0 +#define FEC_RSM 255 +#define FEC_ARG_LENGTH 300 + +#define VERITY_TABLE_OPT_RESTART "restart_on_corruption" +#define VERITY_TABLE_OPT_LOGGING "ignore_corruption" +#define VERITY_TABLE_OPT_IGNZERO "ignore_zero_blocks" + +#define VERITY_TABLE_OPT_FEC_FORMAT \ + "use_fec_from_device %s fec_start %llu fec_blocks %llu fec_roots %u ignore_zero_blocks" +#define VERITY_TABLE_OPT_FEC_ARGS 9 + +#define VERITY_DEBUG 0 + +#define DM_MSG_PREFIX "android-verity" +/* + * There can be two formats. 
+ * if fec is present + * + * if fec is not present + * + */ +/* TODO: rearrange structure to reduce memory holes + * depends on userspace change. + */ +struct fec_header { + __le32 magic; + __le32 version; + __le32 size; + __le32 roots; + __le32 fec_size; + __le64 inp_size; + u8 hash[SHA256_DIGEST_SIZE]; +}; + +struct android_metadata_header { + __le32 magic_number; + __le32 protocol_version; + char signature[RSANUMBYTES]; + __le32 table_length; +}; + +struct android_metadata { + struct android_metadata_header *header; + char *verity_table; +}; + +struct fec_ecc_metadata { + bool valid; + u32 roots; + u64 blocks; + u64 rounds; + u64 start; +}; + +struct bio_read { + struct page **page_io; + int number_of_pages; +}; + +#endif /* DM_ANDROID_VERITY_H */ diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 5c5d30cb6ec5..65835f15a116 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -551,7 +551,7 @@ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io) * Bio map function. It allocates dm_verity_io structure and bio vector and * fills them. Then it issues prefetches and the I/O. */ -static int verity_map(struct dm_target *ti, struct bio *bio) +int verity_map(struct dm_target *ti, struct bio *bio) { struct dm_verity *v = ti->private; struct dm_verity_io *io; @@ -596,7 +596,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) /* * Status: V (valid) or C (corruption found) */ -static void verity_status(struct dm_target *ti, status_type_t type, +void verity_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { struct dm_verity *v = ti->private; @@ -669,7 +669,7 @@ static int verity_prepare_ioctl(struct dm_target *ti, return 0; } -static int verity_iterate_devices(struct dm_target *ti, +int verity_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct dm_verity *v = ti->private; @@ -677,7 +677,7 @@ static int verity_iterate_devices(struct dm_target *ti, return fn(ti, v->data_dev, v->data_start, ti->len, data); } -static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) +void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct dm_verity *v = ti->private; @@ -690,7 +690,7 @@ static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, limits->logical_block_size); } -static void verity_dtr(struct dm_target *ti) +void verity_dtr(struct dm_target *ti) { struct dm_verity *v = ti->private; @@ -817,7 +817,7 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) * * Hex string or "-" if no salt. 
*/ -static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) +int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) { struct dm_verity *v; struct dm_arg_set as; diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index fb419f422d73..d9cf5e4939eb 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -126,4 +126,16 @@ extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, sector_t block, u8 *digest, bool *is_zero); +extern void verity_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen); +extern int verity_ioctl(struct dm_target *ti, unsigned cmd, + unsigned long arg); +extern int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size); +extern int verity_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data); +extern void verity_io_hints(struct dm_target *ti, struct queue_limits *limits); +extern void verity_dtr(struct dm_target *ti); +extern int verity_ctr(struct dm_target *ti, unsigned argc, char **argv); +extern int verity_map(struct dm_target *ti, struct bio *bio); #endif /* DM_VERITY_H */ From f42f971b7b59257a8610608b7f45a046c7c57b5d Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 8 Feb 2016 16:28:43 -0800 Subject: [PATCH 0188/3220] ANDROID: dm-android-verity: Rebase on top of 4.1 Following CLs in upstream causes minor changes to dm-android-verity target. 1. keys: change asymmetric keys to use common hash definitions 2. block: Abstract out bvec iterator Rebase dm-android-verity on top of these changes. Bug: 27175947 Signed-off-by: Badhri Jagan Sridharan Change-Id: Icfdc3e7b3ead5de335a059cade1aca70414db415 --- drivers/md/dm-android-verity.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index c77c9fa7a962..aeb5045830d9 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -75,7 +75,7 @@ static int table_extract_mpi_array(struct public_key_signature *pks, } static struct public_key_signature *table_make_digest( - enum pkey_hash_algo hash, + enum hash_algo hash, const void *table, unsigned long table_len) { @@ -88,7 +88,7 @@ static struct public_key_signature *table_make_digest( /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be. 
*/ - tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); + tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0); if (IS_ERR(tfm)) return ERR_CAST(tfm); @@ -143,7 +143,7 @@ static int read_block_dev(struct bio_read *payload, struct block_device *bdev, } bio->bi_bdev = bdev; - bio->bi_sector = offset; + bio->bi_iter.bi_sector = offset; payload->page_io = kzalloc(sizeof(struct page *) * payload->number_of_pages, GFP_KERNEL); @@ -505,7 +505,7 @@ static int verify_verity_signature(char *key_id, key = key_ref_to_ptr(key_ref); - pks = table_make_digest(PKEY_HASH_SHA256, + pks = table_make_digest(HASH_ALGO_SHA256, (const void *)metadata->verity_table, le32_to_cpu(metadata->header->table_length)); @@ -569,7 +569,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) u32 data_block_size; unsigned int major, minor, no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; - struct fec_header fec; + struct fec_header uninitialized_var(fec); struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; From 67ce481897e0656404fe0af42d563101a5629bbe Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 21 Mar 2016 10:55:23 -0700 Subject: [PATCH 0189/3220] ANDROID: dm: Mounting root as linear device when verity disabled This CL makes the android-verity target be added as a linear dm device when the bootloader is unlocked and verity is disabled. Bug: 27175947 Change-Id: Ic41ca4b8908fb2777263799cf3a3e25934d70f18 Signed-off-by: Badhri Jagan Sridharan --- drivers/md/dm-android-verity.c | 131 +++++++++++++++++++++++++++------ drivers/md/dm-android-verity.h | 3 + drivers/md/dm-linear.c | 2 +- 3 files changed, 113 insertions(+), 23 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index aeb5045830d9..db4ddf789a39 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include @@ -43,6 +44,25 @@ static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; +static bool target_added; +static bool verity_enabled = true; +struct dentry *debug_dir; +static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv); + +static struct target_type android_verity_target = { + .name = "android-verity", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = android_verity_ctr, + .dtr = verity_dtr, + .map = verity_map, + .status = verity_status, + .ioctl = verity_ioctl, + .merge = verity_merge, + .iterate_devices = verity_iterate_devices, + .io_hints = verity_io_hints, +}; + static int __init verified_boot_state_param(char *line) { strlcpy(verifiedbootstate, line, sizeof(verifiedbootstate)); @@ -549,6 +569,35 @@ static inline bool test_mult_overflow(sector_t a, u32 b) return a > r; } +static int add_as_linear_device(struct dm_target *ti, char *dev) +{ + /*Move to linear mapping defines*/ + char *linear_table_args[DM_LINEAR_ARGS]; + char offset[] = "0"; + int err = 0; + + linear_table_args[0] = dev; + linear_table_args[1] = offset; + + android_verity_target.dtr = linear_target.dtr, + android_verity_target.map = linear_target.map, + android_verity_target.status = linear_target.status, + android_verity_target.ioctl = linear_target.ioctl, + android_verity_target.merge = linear_target.merge, + android_verity_target.iterate_devices = linear_target.iterate_devices, + android_verity_target.io_hints = NULL; + + err = linear_target.ctr(ti, 
DM_LINEAR_ARGS, linear_table_args); + + if (!err) { + DMINFO("Added android-verity as a linear target"); + target_added = true; + } else + DMERR("Failed to add android-verity as linear target"); + + return err; +} + /* * Target parameters: * Key id of the public key in the system keyring. @@ -613,21 +662,27 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (err == VERITY_STATE_DISABLE) { DMERR("Mounting root with verity disabled"); - return -EINVAL; + verity_enabled = false; + /* we would still have to parse the args to figure out + * the data blocks size. Or may be could map the entire + * partition similar to mounting the device. + */ } else if (err) { DMERR("Verity header handle error"); handle_error(); goto free_metadata; } - err = verify_verity_signature(key_id, metadata); + if (!verity_enabled) { + err = verify_verity_signature(key_id, metadata); - if (err) { - DMERR("Signature verification failed"); - handle_error(); - goto free_metadata; - } else - DMINFO("Signature verification success"); + if (err) { + DMERR("Signature verification failed"); + handle_error(); + goto free_metadata; + } else + DMINFO("Signature verification success"); + } table_ptr = metadata->verity_table; @@ -683,6 +738,12 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) /* update target length */ ti->len = data_sectors; + /* Setup linear target and free */ + if (!verity_enabled) { + err = add_as_linear_device(ti, argv[1]); + goto free_metadata; + } + /*substitute data_dev and hash_dev*/ verity_table_args[1] = argv[1]; verity_table_args[2] = argv[1]; @@ -730,6 +791,13 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) err = verity_ctr(ti, no_of_args, verity_table_args); + if (err) + DMERR("android-verity failed to mount as verity target"); + else { + target_added = true; + DMINFO("android-verity mounted as verity target"); + } + free_metadata: kfree(metadata->header); kfree(metadata->verity_table); @@ -737,33 +805,52 @@ free_metadata: return err; } -static struct target_type android_verity_target = { - .name = "android-verity", - .version = {1, 0, 0}, - .module = THIS_MODULE, - .ctr = android_verity_ctr, - .dtr = verity_dtr, - .map = verity_map, - .status = verity_status, - .ioctl = verity_ioctl, - .merge = verity_merge, - .iterate_devices = verity_iterate_devices, - .io_hints = verity_io_hints, -}; - static int __init dm_android_verity_init(void) { int r; + struct dentry *file; r = dm_register_target(&android_verity_target); if (r < 0) DMERR("register failed %d", r); + /* Tracks the status of the last added target */ + debug_dir = debugfs_create_dir("android_verity", NULL); + + if (IS_ERR_OR_NULL(debug_dir)) { + DMERR("Cannot create android_verity debugfs directory: %ld", + PTR_ERR(debug_dir)); + goto end; + } + + file = debugfs_create_bool("target_added", S_IRUGO, debug_dir, + (u32 *)&target_added); + + if (IS_ERR_OR_NULL(file)) { + DMERR("Cannot create android_verity debugfs directory: %ld", + PTR_ERR(debug_dir)); + debugfs_remove_recursive(debug_dir); + goto end; + } + + file = debugfs_create_bool("verity_enabled", S_IRUGO, debug_dir, + (u32 *)&verity_enabled); + + if (IS_ERR_OR_NULL(file)) { + DMERR("Cannot create android_verity debugfs directory: %ld", + PTR_ERR(debug_dir)); + debugfs_remove_recursive(debug_dir); + } + +end: return r; } static void __exit dm_android_verity_exit(void) { + if (!IS_ERR_OR_NULL(debug_dir)) + debugfs_remove_recursive(debug_dir); + dm_unregister_target(&android_verity_target); } 
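For reference, the two flags registered above surface as boolean files that read back "Y" or "N" once debugfs is mounted. A minimal userspace sketch for checking them (the /sys/kernel/debug mount point is an assumption; the file names follow from the debugfs_create_bool() calls above):

	#include <stdio.h>

	/* returns 1 if the flag reads 'Y', 0 if 'N', -1 if the file is missing */
	static int read_debugfs_bool(const char *path)
	{
		FILE *f = fopen(path, "r");
		char c = 0;

		if (!f)
			return -1;
		if (fread(&c, 1, 1, f) != 1)
			c = 0;
		fclose(f);
		return c == 'Y';
	}

	int main(void)
	{
		printf("target_added: %d\n", read_debugfs_bool(
			"/sys/kernel/debug/android_verity/target_added"));
		printf("verity_enabled: %d\n", read_debugfs_bool(
			"/sys/kernel/debug/android_verity/verity_enabled"));
		return 0;
	}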
diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 11477ffd2243..2cf7de1b7910 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -44,6 +44,8 @@ #define VERITY_DEBUG 0 #define DM_MSG_PREFIX "android-verity" + +#define DM_LINEAR_ARGS 2 /* * There can be two formats. * if fec is present @@ -89,4 +91,5 @@ struct bio_read { int number_of_pages; }; +extern struct target_type linear_target; #endif /* DM_ANDROID_VERITY_H */ diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 05c35aacb3aa..d8bf876e7bed 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -141,7 +141,7 @@ static int linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } -static struct target_type linear_target = { +struct target_type linear_target = { .name = "linear", .version = {1, 2, 1}, .module = THIS_MODULE, From 438e162621ac7e80ec9c81bea3f0eaec0b801ff9 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 28 Mar 2016 14:41:21 -0700 Subject: [PATCH 0190/3220] ANDROID: dm: Minor cleanup Compacts the linear device arguments removing the unnecessary variables. Bug: 27175947 Change-Id: I157170eebe3c0f89a68ae05870a1060f188d0da0 Signed-off-by: Badhri Jagan Sridharan --- drivers/md/dm-android-verity.c | 7 ++----- drivers/md/dm-android-verity.h | 2 ++ 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index db4ddf789a39..f6ddbee5e2d3 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -572,13 +572,10 @@ static inline bool test_mult_overflow(sector_t a, u32 b) static int add_as_linear_device(struct dm_target *ti, char *dev) { /*Move to linear mapping defines*/ - char *linear_table_args[DM_LINEAR_ARGS]; - char offset[] = "0"; + char *linear_table_args[DM_LINEAR_ARGS] = {dev, + DM_LINEAR_TARGET_OFFSET}; int err = 0; - linear_table_args[0] = dev; - linear_table_args[1] = offset; - android_verity_target.dtr = linear_target.dtr, android_verity_target.map = linear_target.map, android_verity_target.status = linear_target.status, diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 2cf7de1b7910..fe53863c664b 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -46,6 +46,8 @@ #define DM_MSG_PREFIX "android-verity" #define DM_LINEAR_ARGS 2 +#define DM_LINEAR_TARGET_OFFSET "0" + /* * There can be two formats. 
* if fec is present From 86fd82659f745b5fea48090fe320cb817e199805 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 5 Apr 2016 11:18:16 -0700 Subject: [PATCH 0191/3220] ANDROID: dm: rename dm-linear methods for dm-android-verity This keeps linear_target as static variable and just exposes the linear target methods for android-verity Cherry-picked: https://android-review.googlesource.com/#/c/212858 Change-Id: I4a377e417b00afd9ecccdb3e605fea31a7df112e Signed-off-by: Badhri Jagan Sridharan (cherry picked from commit a6d1b091f40b25d97849487e29ec097bc5f568dd) --- drivers/md/dm-android-verity.c | 14 +++++++------- drivers/md/dm-android-verity.h | 12 ++++++++++++ drivers/md/dm-linear.c | 26 +++++++++++++------------- 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index f6ddbee5e2d3..b7e059595f75 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -576,15 +576,15 @@ static int add_as_linear_device(struct dm_target *ti, char *dev) DM_LINEAR_TARGET_OFFSET}; int err = 0; - android_verity_target.dtr = linear_target.dtr, - android_verity_target.map = linear_target.map, - android_verity_target.status = linear_target.status, - android_verity_target.ioctl = linear_target.ioctl, - android_verity_target.merge = linear_target.merge, - android_verity_target.iterate_devices = linear_target.iterate_devices, + android_verity_target.dtr = dm_linear_dtr, + android_verity_target.map = dm_linear_map, + android_verity_target.status = dm_linear_status, + android_verity_target.ioctl = dm_linear_ioctl, + android_verity_target.merge = dm_linear_merge, + android_verity_target.iterate_devices = dm_linear_iterate_devices, android_verity_target.io_hints = NULL; - err = linear_target.ctr(ti, DM_LINEAR_ARGS, linear_table_args); + err = dm_linear_ctr(ti, DM_LINEAR_ARGS, linear_table_args); if (!err) { DMINFO("Added android-verity as a linear target"); diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index fe53863c664b..efb796524896 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -94,4 +94,16 @@ struct bio_read { }; extern struct target_type linear_target; + +extern void dm_linear_dtr(struct dm_target *ti); +extern int dm_linear_map(struct dm_target *ti, struct bio *bio); +extern void dm_linear_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen); +extern int dm_linear_ioctl(struct dm_target *ti, unsigned int cmd, + unsigned long arg); +extern int dm_linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size); +extern int dm_linear_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data); +extern int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv); #endif /* DM_ANDROID_VERITY_H */ diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index d8bf876e7bed..74caca2888a6 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -25,7 +25,7 @@ struct linear_c { /* * Construct a linear mapping: */ -static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) +int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct linear_c *lc; unsigned long long tmp; @@ -67,7 +67,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) return ret; } -static void linear_dtr(struct dm_target *ti) +void dm_linear_dtr(struct dm_target *ti) { struct 
linear_c *lc = (struct linear_c *) ti->private; @@ -92,14 +92,14 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) linear_map_sector(ti, bio->bi_iter.bi_sector); } -static int linear_map(struct dm_target *ti, struct bio *bio) +int dm_linear_map(struct dm_target *ti, struct bio *bio) { linear_map_bio(ti, bio); return DM_MAPIO_REMAPPED; } -static void linear_status(struct dm_target *ti, status_type_t type, +void dm_linear_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { struct linear_c *lc = (struct linear_c *) ti->private; @@ -116,7 +116,7 @@ static void linear_status(struct dm_target *ti, status_type_t type, } } -static int linear_prepare_ioctl(struct dm_target *ti, +static int dm_linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) { struct linear_c *lc = (struct linear_c *) ti->private; @@ -133,7 +133,7 @@ static int linear_prepare_ioctl(struct dm_target *ti, return 0; } -static int linear_iterate_devices(struct dm_target *ti, +int dm_linear_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct linear_c *lc = ti->private; @@ -141,16 +141,16 @@ static int linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } -struct target_type linear_target = { +static struct target_type linear_target = { .name = "linear", .version = {1, 2, 1}, .module = THIS_MODULE, - .ctr = linear_ctr, - .dtr = linear_dtr, - .map = linear_map, - .status = linear_status, - .prepare_ioctl = linear_prepare_ioctl, - .iterate_devices = linear_iterate_devices, + .ctr = dm_linear_ctr, + .dtr = dm_linear_dtr, + .map = dm_linear_map, + .status = dm_linear_status, + .prepare_ioctl = dm_linear_prepare_ioctl, + .iterate_devices = dm_linear_iterate_devices, }; int __init dm_linear_init(void) From a517817c1708dd9d762d37d7dd475d4728a4b34e Mon Sep 17 00:00:00 2001 From: Jeremy Compostella Date: Fri, 15 Apr 2016 13:32:54 +0200 Subject: [PATCH 0192/3220] ANDROID: dm: use name_to_dev_t This patch makes android_verity_ctr() parse its block device string parameter with name_to_dev_t(). It allows the use of less hardware-specific block device references, such as PARTUUID. 
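Since name_to_dev_t() also understands references such as "PARTUUID=", a boot line can now name the partition by UUID instead of a device node; for instance (the UUID is a made-up placeholder, and note that at this point in the series the key id still precedes the device — patch 0194 below reverses that order):

	dm="system none ro,0 1 android-verity Android:#7e4333f9bba00adfe0ede979e28ed1920492b40f PARTUUID=00112233-4455-6677-8899-aabbccddeeff"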
Change-Id: Idb84453e70cc11abd5ef3a0adfbb16f8b5feaf07 Signed-off-by: Jeremy Compostella --- drivers/md/dm-android-verity.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index b7e059595f75..9c26cbb5f179 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -613,8 +613,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) /* One for specifying number of opt args and one for mode */ sector_t data_sectors; u32 data_block_size; - unsigned int major, minor, - no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; + unsigned int no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; struct fec_header uninitialized_var(fec); struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; @@ -630,13 +629,11 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) key_id = argv[0]; strreplace(argv[0], '#', ' '); - if (sscanf(argv[1], "%u:%u%c", &major, &minor, &dummy) == 2) { - dev = MKDEV(major, minor); - if (MAJOR(dev) != major || MINOR(dev) != minor) { - DMERR("Incorrect bdev major minor number"); - handle_error(); - return -EOVERFLOW; - } + dev = name_to_dev_t(argv[1]); + if (!dev) { + DMERR("no dev found for %s", argv[1]); + handle_error(); + return -EINVAL; } DMINFO("key:%s dev:%s", argv[0], argv[1]); From 9c43aca47bbdf652fc0d05d8e94ee43aaeea2458 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 20 May 2016 16:44:19 -0700 Subject: [PATCH 0193/3220] ANDROID: dm: fix signature verification flag The bug was that the signature verification was only happening when verity was disabled. It should always happen when verity is enabled. Signed-off-by: Badhri Jagan Sridharan Change-Id: I2d9354e240d36ea06fc68c2d18d8e87b823a4c2f (cherry picked from commit 5364b5ca0b1a12a58283b51408e43fc36d4e4fe7) --- drivers/md/dm-android-verity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 9c26cbb5f179..00275a986d03 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -667,7 +667,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto free_metadata; } - if (!verity_enabled) { + if (verity_enabled) { err = verify_verity_signature(key_id, metadata); if (err) { From 051d4706c621a73d3622f0b0d91a444ab37a8f1a Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 20 May 2016 16:45:45 -0700 Subject: [PATCH 0194/3220] ANDROID: dm: use default verity public key If the dm-android-verity target arguments do not provide a key id, try using the default public key from the system keyring. The default verity keyid is passed as a kernel command line argument veritykeyid=. The order of the dm-android-verity params has been reversed to facilitate the change. Old format example: dm="system none ro,0 1 android-verity Android:#7e4333f9bba00adfe0ede979e28ed1920492b40f /dev/mmcblk0p43" New formats supported: dm="system none ro,0 1 android-verity /dev/mmcblk0p43 Android:#7e4333f9bba00adfe0ede979e28ed1920492b40f" (or) dm="system none ro,0 1 android-verity /dev/mmcblk0p43" when veritykeyid= is set in the kernel command line. 
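For example, with the key id supplied once via the command line parameter added in this patch (key id reused from the examples above), the target argument list reduces to the device alone:

	veritykeyid=Android:#7e4333f9bba00adfe0ede979e28ed1920492b40f dm="system none ro,0 1 android-verity /dev/mmcblk0p43"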
BUG: 28384658 Signed-off-by: Badhri Jagan Sridharan Change-Id: I506c89b053d835ab579e703eef2bc1f8487250de (cherry picked from commit c5c74d0327729f35b576564976885596c6d0e7fb) --- drivers/md/dm-android-verity.c | 67 ++++++++++++++++++++++++---------- drivers/md/dm-android-verity.h | 16 ++++++++ 2 files changed, 63 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 00275a986d03..097fb2b1de89 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -43,6 +43,7 @@ static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; +static char veritykeyid[VERITY_DEFAULT_KEY_ID_LENGTH]; static bool target_added; static bool verity_enabled = true; @@ -79,6 +80,19 @@ static int __init verity_mode_param(char *line) __setup("androidboot.veritymode=", verity_mode_param); +static int __init verity_keyid_param(char *line) +{ + strlcpy(veritykeyid, line, sizeof(veritykeyid)); + return 1; +} + +__setup("veritykeyid=", verity_keyid_param); + +static inline bool default_verity_key_id(void) +{ + return veritykeyid[0] != '\0'; +} + static int table_extract_mpi_array(struct public_key_signature *pks, const void *data, size_t len) { @@ -608,7 +622,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) dev_t uninitialized_var(dev); struct android_metadata *uninitialized_var(metadata); int err = 0, i, mode; - char *key_id, *table_ptr, dummy, + char *key_id, *table_ptr, dummy, *target_device, *verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS]; /* One for specifying number of opt args and one for mode */ sector_t data_sectors; @@ -619,24 +633,34 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; - if (argc != 2) { + if (argc == 1) { + /* Use the default keyid */ + if (default_verity_key_id()) + key_id = veritykeyid; + else { + DMERR("veritykeyid= is not set"); + handle_error(); + return -EINVAL; + } + } else if (argc == 2) + key_id = argv[1]; + else { DMERR("Incorrect number of arguments"); handle_error(); return -EINVAL; } - /* should come as one of the arguments for the verity target */ - key_id = argv[0]; - strreplace(argv[0], '#', ' '); + strreplace(key_id, '#', ' '); + target_device = argv[0]; - dev = name_to_dev_t(argv[1]); + dev = name_to_dev_t(target_device); if (!dev) { - DMERR("no dev found for %s", argv[1]); + DMERR("no dev found for %s", target_device); handle_error(); return -EINVAL; } - DMINFO("key:%s dev:%s", argv[0], argv[1]); + DMINFO("key:%s dev:%s", key_id, target_device); if (extract_fec_header(dev, &fec, &ecc)) { DMERR("Error while extracting fec header"); @@ -734,30 +758,33 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) /* Setup linear target and free */ if (!verity_enabled) { - err = add_as_linear_device(ti, argv[1]); + err = add_as_linear_device(ti, target_device); goto free_metadata; } /*substitute data_dev and hash_dev*/ - verity_table_args[1] = argv[1]; - verity_table_args[2] = argv[1]; + verity_table_args[1] = target_device; + verity_table_args[2] = target_device; mode = verity_mode(); if (ecc.valid && IS_BUILTIN(CONFIG_DM_VERITY_FEC)) { if (mode) { err = snprintf(buf, FEC_ARG_LENGTH, - "%u %s " VERITY_TABLE_OPT_FEC_FORMAT, - 1 + VERITY_TABLE_OPT_FEC_ARGS, - mode == DM_VERITY_MODE_RESTART ? 
- VERITY_TABLE_OPT_RESTART : VERITY_TABLE_OPT_LOGGING, - argv[1], ecc.start / FEC_BLOCK_SIZE, ecc.blocks, - ecc.roots); + "%u %s " VERITY_TABLE_OPT_FEC_FORMAT, + 1 + VERITY_TABLE_OPT_FEC_ARGS, + mode == DM_VERITY_MODE_RESTART ? + VERITY_TABLE_OPT_RESTART : + VERITY_TABLE_OPT_LOGGING, + target_device, + ecc.start / FEC_BLOCK_SIZE, ecc.blocks, + ecc.roots); } else { err = snprintf(buf, FEC_ARG_LENGTH, - "%u " VERITY_TABLE_OPT_FEC_FORMAT, - VERITY_TABLE_OPT_FEC_ARGS, argv[1], - ecc.start / FEC_BLOCK_SIZE, ecc.blocks, ecc.roots); + "%u " VERITY_TABLE_OPT_FEC_FORMAT, + VERITY_TABLE_OPT_FEC_ARGS, target_device, + ecc.start / FEC_BLOCK_SIZE, ecc.blocks, + ecc.roots); } } else if (mode) { err = snprintf(buf, FEC_ARG_LENGTH, diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index efb796524896..43655ee0f813 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -27,6 +27,22 @@ #define VERITY_TABLE_ARGS 10 #define VERITY_COMMANDLINE_PARAM_LENGTH 20 +/* + * : is the format for the identifier. + * subject can either be the Common Name(CN) + Organization Name(O) or + * just the CN if the it is prefixed with O + * From https://tools.ietf.org/html/rfc5280#appendix-A + * ub-organization-name-length INTEGER ::= 64 + * ub-common-name-length INTEGER ::= 64 + * + * http://lxr.free-electrons.com/source/crypto/asymmetric_keys/x509_cert_parser.c?v=3.9#L278 + * ctx->o_size + 2 + ctx->cn_size + 1 + * + 41 characters for ":" and sha1 id + * 64 + 2 + 64 + 1 + 1 + 40 (172) + * setting VERITY_DEFAULT_KEY_ID_LENGTH to 200 characters. + */ +#define VERITY_DEFAULT_KEY_ID_LENGTH 200 + #define FEC_MAGIC 0xFECFECFE #define FEC_BLOCK_SIZE (4 * 1024) #define FEC_VERSION 0 From 58bae772a7018ebfa735c91af22e26474a1dfcc7 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 17 Jun 2016 18:54:35 -0700 Subject: [PATCH 0195/3220] ANDROID: dm: mount as linear target if eng build eng builds don't have verity enabled, i.e. they do not even have verity metadata appended to the partition. Therefore, add rootdev as a linear device and map the entire partition if the build variant is "eng". 
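The build variant reaches the driver through the kernel command line (see the __setup("buildvariant=", ...) handler in the diff below), so an eng device boots with something along the lines of:

	buildvariant=eng dm="system none ro,0 1 android-verity /dev/mmcblk0p43"

in which case the constructor skips all metadata and signature handling and sets up the equivalent of the two-argument linear table "<dev> 0" over the whole partition, as the diff below shows.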
(Cherry-picked based on https://partner-android-review.git.corp.google.com/#/c/618690/) BUG: 29276559 Signed-off-by: Badhri Jagan Sridharan Change-Id: I8f5c2289b842b820ca04f5773525e5449bb3f355 --- drivers/md/dm-android-verity.c | 62 +++++++++++++++++++++++++++++++--- drivers/md/dm-android-verity.h | 1 + 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 097fb2b1de89..e1a8e284e7e4 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -44,6 +44,7 @@ static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritykeyid[VERITY_DEFAULT_KEY_ID_LENGTH]; +static char buildvariant[BUILD_VARIANT]; static bool target_added; static bool verity_enabled = true; @@ -88,11 +89,26 @@ static int __init verity_keyid_param(char *line) __setup("veritykeyid=", verity_keyid_param); +static int __init verity_buildvariant(char *line) +{ + strlcpy(buildvariant, line, sizeof(buildvariant)); + return 1; +} + +__setup("buildvariant=", verity_buildvariant); + static inline bool default_verity_key_id(void) { return veritykeyid[0] != '\0'; } +static inline bool is_eng(void) +{ + static const char typeeng[] = "eng"; + + return !strncmp(buildvariant, typeeng, sizeof(typeeng)); +} + static int table_extract_mpi_array(struct public_key_signature *pks, const void *data, size_t len) { @@ -262,7 +278,7 @@ static int extract_fec_header(dev_t dev, struct fec_header *fec, bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); - if (IS_ERR(bdev)) { + if (IS_ERR_OR_NULL(bdev)) { DMERR("bdev get error"); return PTR_ERR(bdev); } @@ -323,6 +339,24 @@ static void find_metadata_offset(struct fec_header *fec, *metadata_offset = device_size - VERITY_METADATA_SIZE; } +static int find_size(dev_t dev, u64 *device_size) +{ + struct block_device *bdev; + + bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + if (IS_ERR_OR_NULL(bdev)) { + DMERR("blkdev_get_by_dev failed"); + return PTR_ERR(bdev); + } + + *device_size = i_size_read(bdev->bd_inode); + *device_size >>= SECTOR_SHIFT; + + DMINFO("blkdev size in sectors: %llu", *device_size); + blkdev_put(bdev, FMODE_READ); + return 0; +} + static struct android_metadata *extract_metadata(dev_t dev, struct fec_header *fec) { @@ -337,7 +371,7 @@ static struct android_metadata *extract_metadata(dev_t dev, bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); - if (IS_ERR(bdev)) { + if (IS_ERR_OR_NULL(bdev)) { DMERR("blkdev_get_by_dev failed"); return ERR_CAST(bdev); } @@ -632,12 +666,13 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; + u64 device_size; if (argc == 1) { /* Use the default keyid */ if (default_verity_key_id()) key_id = veritykeyid; - else { + else if (!is_eng()) { DMERR("veritykeyid= is not set"); handle_error(); return -EINVAL; @@ -650,7 +685,6 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } - strreplace(key_id, '#', ' '); target_device = argv[0]; dev = name_to_dev_t(target_device); @@ -660,6 +694,26 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } + if (is_eng()) { + err = find_size(dev, &device_size); + if (err) { + DMERR("error finding bdev size"); + handle_error(); + return err; + } + + ti->len = device_size; + err = add_as_linear_device(ti, target_device); + if (err) 
{ + handle_error(); + return err; + } + verity_enabled = false; + return 0; + } + + strreplace(key_id, '#', ' '); + DMINFO("key:%s dev:%s", key_id, target_device); if (extract_fec_header(dev, &fec, &ecc)) { diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 43655ee0f813..782e1c815c67 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -26,6 +26,7 @@ #define VERITY_METADATA_SIZE (8 * DATA_BLOCK_SIZE) #define VERITY_TABLE_ARGS 10 #define VERITY_COMMANDLINE_PARAM_LENGTH 20 +#define BUILD_VARIANT 20 /* * : is the format for the identifier. From f74284f6c25dca79c052c3a1e1bd3fc58de4d4a0 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 27 Jun 2016 16:25:55 -0700 Subject: [PATCH 0196/3220] ANDROID: dm: allow adb disable-verity only in userdebug adb disable-verity was allowed when the phone is in the unlocked state. Since the driver is now aware of the build variant, honor "adb disable-verity" only in userdebug builds. (Cherry-picked from https://partner-android-review.git.corp.google.com/#/c/622117) BUG: 29276559 Signed-off-by: Badhri Jagan Sridharan Change-Id: I7ce9f38d8c7a62361392c5a8ccebb288f8a3a2ea --- drivers/md/dm-android-verity.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index e1a8e284e7e4..999e75bf2ba0 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -109,6 +109,14 @@ static inline bool is_eng(void) return !strncmp(buildvariant, typeeng, sizeof(typeeng)); } +static inline bool is_userdebug(void) +{ + static const char typeuserdebug[] = "userdebug"; + + return !strncmp(buildvariant, typeuserdebug, sizeof(typeuserdebug)); +} + + static int table_extract_mpi_array(struct public_key_signature *pks, const void *data, size_t len) { @@ -499,19 +507,6 @@ const char *find_dt_value(const char *name) return value; } -static bool is_unlocked(void) -{ - static const char unlocked[] = "orange"; - static const char verified_boot_prop[] = "verifiedbootstate"; - const char *value; - - value = find_dt_value(verified_boot_prop); - if (!value) - value = verifiedbootstate; - - return !strncmp(value, unlocked, sizeof(unlocked) - 1); -} - static int verity_mode(void) { static const char enforcing[] = "enforcing"; @@ -531,7 +526,7 @@ static int verify_header(struct android_metadata_header *header) { int retval = -EINVAL; - if (is_unlocked() && le32_to_cpu(header->magic_number) == + if (is_userdebug() && le32_to_cpu(header->magic_number) == VERITY_METADATA_MAGIC_DISABLE) { retval = VERITY_STATE_DISABLE; return retval; From ad2f6cf0be78107a2bf435b8a21f801658935719 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Wed, 6 Jul 2016 17:16:19 -0700 Subject: [PATCH 0197/3220] ANDROID: dm: android-verity: Verify header before fetching table Move header validation logic before reading the verity_table as an invalid header implies the table is invalid as well. 
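A condensed control-flow sketch of the reordered extract_metadata() (not a standalone program; verify_header() and VERITY_STATE_DISABLE are the real symbols from the diff below, while the comments paraphrase the elided steps):

	/* the metadata region and its header are read first ... */
	err = verify_header(header);		/* validate before parsing */
	if (err == VERITY_STATE_DISABLE) {
		*verity_enabled = false;	/* table is still read for sizing */
		err = 0;
	} else if (err)
		goto free_header;		/* invalid header: the table is never parsed */
	/* ... only now is the verity table allocated and copied out */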
(Cherry-picked from: https://partner-android-review.git.corp.google.com/#/c/625203) BUG: 29940612 Signed-off-by: Badhri Jagan Sridharan Change-Id: Ib34d25c0854202f3e70df0a6d0ef1d96f0250c8e --- drivers/md/dm-android-verity.c | 140 +++++++++++++++++---------------- 1 file changed, 71 insertions(+), 69 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 999e75bf2ba0..1f4eb099209d 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -365,12 +365,38 @@ static int find_size(dev_t dev, u64 *device_size) return 0; } -static struct android_metadata *extract_metadata(dev_t dev, - struct fec_header *fec) +static int verify_header(struct android_metadata_header *header) +{ + int retval = -EINVAL; + + if (is_userdebug() && le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE) + return VERITY_STATE_DISABLE; + + if (!(le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_NUMBER) || + (le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE)) { + DMERR("Incorrect magic number"); + return retval; + } + + if (le32_to_cpu(header->protocol_version) != + VERITY_METADATA_VERSION) { + DMERR("Unsupported version %u", + le32_to_cpu(header->protocol_version)); + return retval; + } + + return 0; +} + +static int extract_metadata(dev_t dev, struct fec_header *fec, + struct android_metadata **metadata, + bool *verity_enabled) { struct block_device *bdev; struct android_metadata_header *header; - struct android_metadata *uninitialized_var(metadata); int i; u32 table_length, copy_length, offset; u64 metadata_offset; @@ -381,7 +407,7 @@ static struct android_metadata *extract_metadata(dev_t dev, if (IS_ERR_OR_NULL(bdev)) { DMERR("blkdev_get_by_dev failed"); - return ERR_CAST(bdev); + return -ENODEV; } find_metadata_offset(fec, bdev, &metadata_offset); @@ -399,7 +425,6 @@ static struct android_metadata *extract_metadata(dev_t dev, (1 << SECTOR_SHIFT), VERITY_METADATA_SIZE); if (err) { DMERR("Error while reading verity metadata"); - metadata = ERR_PTR(err); goto blkdev_release; } @@ -418,24 +443,42 @@ static struct android_metadata *extract_metadata(dev_t dev, le32_to_cpu(header->protocol_version), le32_to_cpu(header->table_length)); - metadata = kzalloc(sizeof(*metadata), GFP_KERNEL); - if (!metadata) { + err = verify_header(header); + + if (err == VERITY_STATE_DISABLE) { + DMERR("Mounting root with verity disabled"); + *verity_enabled = false; + /* we would still have to read the metadata to figure out + * the data blocks size. Or may be could map the entire + * partition similar to mounting the device. + * + * Reset error as well as the verity_enabled flag is changed. 
+ */ + err = 0; + } else if (err) + goto free_header; + + *metadata = kzalloc(sizeof(**metadata), GFP_KERNEL); + if (!*metadata) { DMERR("kzalloc for metadata failed"); err = -ENOMEM; goto free_header; } - metadata->header = header; + (*metadata)->header = header; table_length = le32_to_cpu(header->table_length); if (table_length == 0 || table_length > (VERITY_METADATA_SIZE - - sizeof(struct android_metadata_header))) + sizeof(struct android_metadata_header))) { + DMERR("table_length too long"); + err = -EINVAL; goto free_metadata; + } - metadata->verity_table = kzalloc(table_length + 1, GFP_KERNEL); + (*metadata)->verity_table = kzalloc(table_length + 1, GFP_KERNEL); - if (!metadata->verity_table) { + if (!(*metadata)->verity_table) { DMERR("kzalloc verity_table failed"); err = -ENOMEM; goto free_metadata; @@ -443,13 +486,15 @@ static struct android_metadata *extract_metadata(dev_t dev, if (sizeof(struct android_metadata_header) + table_length <= PAGE_SIZE) { - memcpy(metadata->verity_table, page_address(payload.page_io[0]) + memcpy((*metadata)->verity_table, + page_address(payload.page_io[0]) + sizeof(struct android_metadata_header), table_length); } else { copy_length = PAGE_SIZE - sizeof(struct android_metadata_header); - memcpy(metadata->verity_table, page_address(payload.page_io[0]) + memcpy((*metadata)->verity_table, + page_address(payload.page_io[0]) + sizeof(struct android_metadata_header), copy_length); table_length -= copy_length; @@ -457,13 +502,13 @@ static struct android_metadata *extract_metadata(dev_t dev, i = 1; while (table_length != 0) { if (table_length > PAGE_SIZE) { - memcpy(metadata->verity_table + offset, + memcpy((*metadata)->verity_table + offset, page_address(payload.page_io[i]), PAGE_SIZE); offset += PAGE_SIZE; table_length -= PAGE_SIZE; } else { - memcpy(metadata->verity_table + offset, + memcpy((*metadata)->verity_table + offset, page_address(payload.page_io[i]), table_length); table_length = 0; @@ -471,25 +516,23 @@ static struct android_metadata *extract_metadata(dev_t dev, i++; } } - metadata->verity_table[table_length] = '\0'; + (*metadata)->verity_table[table_length] = '\0'; + DMINFO("verity_table: %s", (*metadata)->verity_table); goto free_payload; free_metadata: - kfree(metadata); + kfree(*metadata); free_header: kfree(header); - metadata = ERR_PTR(err); free_payload: for (i = 0; i < payload.number_of_pages; i++) if (payload.page_io[i]) __free_page(payload.page_io[i]); kfree(payload.page_io); - - DMINFO("verity_table: %s", metadata->verity_table); blkdev_release: blkdev_put(bdev, FMODE_READ); - return metadata; + return err; } /* helper functions to extract properties from dts */ @@ -522,34 +565,6 @@ static int verity_mode(void) return DM_VERITY_MODE_EIO; } -static int verify_header(struct android_metadata_header *header) -{ - int retval = -EINVAL; - - if (is_userdebug() && le32_to_cpu(header->magic_number) == - VERITY_METADATA_MAGIC_DISABLE) { - retval = VERITY_STATE_DISABLE; - return retval; - } - - if (!(le32_to_cpu(header->magic_number) == - VERITY_METADATA_MAGIC_NUMBER) || - (le32_to_cpu(header->magic_number) == - VERITY_METADATA_MAGIC_DISABLE)) { - DMERR("Incorrect magic number"); - return retval; - } - - if (le32_to_cpu(header->protocol_version) != - VERITY_METADATA_VERSION) { - DMERR("Unsupported version %u", - le32_to_cpu(header->protocol_version)); - return retval; - } - - return 0; -} - static int verify_verity_signature(char *key_id, struct android_metadata *metadata) { @@ -649,7 +664,7 @@ static int add_as_linear_device(struct dm_target 
*ti, char *dev) static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) { dev_t uninitialized_var(dev); - struct android_metadata *uninitialized_var(metadata); + struct android_metadata *metadata = NULL; int err = 0, i, mode; char *key_id, *table_ptr, dummy, *target_device, *verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS]; @@ -717,26 +732,11 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } - metadata = extract_metadata(dev, &fec); + err = extract_metadata(dev, &fec, &metadata, &verity_enabled); - if (IS_ERR(metadata)) { + if (err) { DMERR("Error while extracting metadata"); handle_error(); - return -EINVAL; - } - - err = verify_header(metadata->header); - - if (err == VERITY_STATE_DISABLE) { - DMERR("Mounting root with verity disabled"); - verity_enabled = false; - /* we would still have to parse the args to figure out - * the data blocks size. Or may be could map the entire - * partition similar to mounting the device. - */ - } else if (err) { - DMERR("Verity header handle error"); - handle_error(); goto free_metadata; } @@ -869,8 +869,10 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) } free_metadata: - kfree(metadata->header); - kfree(metadata->verity_table); + if (metadata) { + kfree(metadata->header); + kfree(metadata->verity_table); + } kfree(metadata); return err; } From edb97738f20194cb7eea8eeb81bac21f4a8cfb83 Mon Sep 17 00:00:00 2001 From: Jeremy Compostella Date: Tue, 10 May 2016 13:10:20 +0200 Subject: [PATCH 0198/3220] ANDROID: dm verity fec: pack the fec_header structure The fec_header structure is generated at build time and stored on disk. The fec_header might be built on a 64-bit machine while it is read by a 32-bit device, or the other way around. In such situations, the fec_header fields are not aligned as the device expects, and it fails to read the fec_header structure. This patch makes the fec_header packed. Change-Id: Idb84453e70cc11abd5ef3a0adfbb16f8b5feaf06 Signed-off-by: Jeremy Compostella --- drivers/md/dm-android-verity.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 782e1c815c67..f43b02fbb475 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -72,9 +72,6 @@ * if fec is not present * */ -/* TODO: rearrange structure to reduce memory holes - * depends on userspace change. - */ struct fec_header { __le32 magic; __le32 version; @@ -83,7 +80,7 @@ struct fec_header { __le32 fec_size; __le64 inp_size; u8 hash[SHA256_DIGEST_SIZE]; -}; +} __attribute__((packed)); struct android_metadata_header { __le32 magic_number; From 3a400abdc57e047786d0b35d39617a794e2bb97e Mon Sep 17 00:00:00 2001 From: Joseph Lo Date: Mon, 22 Apr 2013 14:39:18 +0800 Subject: [PATCH 0199/3220] CHROMIUM: sched: update the average of nr_running Doing an exponential moving average per nr_running++/-- does not guarantee a fixed sample rate, which induces errors if there are lots of threads being enqueued/dequeued from the rq (Linpack mt). Instead of keeping track of the avg, the scheduler now keeps track of the integral of nr_running and allows the readers to perform filtering on top. 
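A sketch of the reader-side filtering this enables: the mean number of runnable tasks over any window falls out of two samples of the integral. The helper below is hypothetical; it assumes the FSHIFT fixed-point scaling applied by NR_AVE_SCALE() in the diff and timestamps taken from rq->clock_task:

	#include <linux/math64.h>

	/*
	 * Average nr_running over the window [then, now] (nanoseconds),
	 * recovered from two samples of nr_running_integral().
	 */
	static u64 ave_nr_running(u64 integral_then, u64 integral_now,
				  u64 then, u64 now)
	{
		u64 delta = now - then;

		if (!delta)
			return 0;
		/* undo the << FSHIFT scaling applied by NR_AVE_SCALE() */
		return div64_u64(integral_now - integral_then, delta) >> FSHIFT;
	}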
Original-author: Sai Charan Gurrappadi Change-Id: Id946654f32fa8be0eaf9d8fa7c9a8039b5ef9fab Signed-off-by: Joseph Lo Signed-off-by: Andrew Bresticker Reviewed-on: https://chromium-review.googlesource.com/174694 Reviewed-on: https://chromium-review.googlesource.com/272853 [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- include/linux/sched.h | 3 +++ kernel/sched/core.c | 30 ++++++++++++++++++++++++++ kernel/sched/sched.h | 49 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 80 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b9b11b2dbabd..ee59abcab30d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -173,6 +173,9 @@ extern bool single_task_running(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(int cpu); extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); +#ifdef CONFIG_CPU_QUIET +extern u64 nr_running_integral(unsigned int cpu); +#endif extern void calc_global_load(unsigned long ticks); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9e25e68747d7..42c45ccc1f62 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2768,6 +2768,36 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&this->nr_iowait); } +#ifdef CONFIG_CPU_QUIET +u64 nr_running_integral(unsigned int cpu) +{ + unsigned int seqcnt; + u64 integral; + struct rq *q; + + if (cpu >= nr_cpu_ids) + return 0; + + q = cpu_rq(cpu); + + /* + * Update average to avoid reading stalled value if there were + * no run-queue changes for a long time. On the other hand if + * the changes are happening right now, just read current value + * directly. + */ + + seqcnt = read_seqcount_begin(&q->ave_seqcnt); + integral = do_nr_running_integral(q); + if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) { + read_seqcount_begin(&q->ave_seqcnt); + integral = q->nr_running_integral; + } + + return integral; +} +#endif + void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) { struct rq *rq = this_rq(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index db3f3db9849c..5713513adb9d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -594,6 +594,14 @@ struct rq { #ifdef CONFIG_NO_HZ_FULL unsigned long last_sched_tick; #endif + +#ifdef CONFIG_CPU_QUIET + /* time-based average load */ + u64 nr_last_stamp; + u64 nr_running_integral; + seqcount_t ave_seqcnt; +#endif + /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -1353,7 +1361,7 @@ extern void init_entity_runnable_average(struct sched_entity *se); extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc); -static inline void add_nr_running(struct rq *rq, unsigned count) +static inline void __add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; @@ -1381,11 +1389,48 @@ static inline void add_nr_running(struct rq *rq, unsigned count) } } -static inline void sub_nr_running(struct rq *rq, unsigned count) +static inline void __sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; } +#ifdef CONFIG_CPU_QUIET +#define NR_AVE_SCALE(x) ((x) << FSHIFT) +static inline u64 do_nr_running_integral(struct rq *rq) +{ + s64 nr, deltax; + u64 nr_running_integral = rq->nr_running_integral; + + deltax = rq->clock_task - rq->nr_last_stamp; + nr = NR_AVE_SCALE(rq->nr_running); + + nr_running_integral += nr * deltax; + + return nr_running_integral; +} + +static inline void add_nr_running(struct rq *rq, unsigned count) +{ + 
write_seqcount_begin(&rq->ave_seqcnt); + rq->nr_running_integral = do_nr_running_integral(rq); + rq->nr_last_stamp = rq->clock_task; + __add_nr_running(rq, count); + write_seqcount_end(&rq->ave_seqcnt); +} + +static inline void sub_nr_running(struct rq *rq, unsigned count) +{ + write_seqcount_begin(&rq->ave_seqcnt); + rq->nr_running_integral = do_nr_running_integral(rq); + rq->nr_last_stamp = rq->clock_task; + __sub_nr_running(rq, count); + write_seqcount_end(&rq->ave_seqcnt); +} +#else +#define add_nr_running __add_nr_running +#define sub_nr_running __sub_nr_running +#endif + static inline void rq_last_tick_reset(struct rq *rq) { #ifdef CONFIG_NO_HZ_FULL
From 687fa2dcd285d3344fe89cd6a3c992fcaba13b6f Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 17:16:41 +0000 Subject: [PATCH 0200/3220] arm: topology: Define TC2 energy and provide it to the scheduler
This patch is only here to be able to test provisioning of energy related data from an arch topology shim layer to the scheduler. Since there is no code today which deals with extracting energy related data from the dtb or acpi, and processing it in the topology shim layer, the contents of the sched_group_energy structures as well as the idle_state and capacity_state arrays are hard-coded here.
This patch defines the sched_group_energy structure as well as the idle_state and capacity_state array for the cluster (relates to sched groups (sgs) in DIE sched domain level) and for the core (relates to sgs in MC sd level) for a Cortex A7 as well as for a Cortex A15. It further provides related implementations of the sched_domain_energy_f functions (cpu_cluster_energy() and cpu_core_energy()).
To be able to propagate this information from the topology shim layer to the scheduler, the elements of the arm_topology[] table have been provisioned with the appropriate sched_domain_energy_f functions.
Change-Id: I8c014bbd04f6a1d57892be9bfa16affe07948dcf cc: Russell King Signed-off-by: Dietmar Eggemann --- arch/arm/kernel/topology.c | 126 ++++++++++++++++++++++++++++++++++++- 1 file changed, 123 insertions(+), 3 deletions(-)
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 0308342def8c..f5941004efba 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -287,6 +287,127 @@ void store_cpu_topology(unsigned int cpuid) cpu_topology[cpuid].socket_id, mpidr); } +/* + * ARM TC2 specific energy cost model data. There are no unit requirements for + * the data. Data can be normalized to any reference point, but the + * normalization must be consistent. That is, one bogo-joule/watt must be the + * same quantity for all data, but we don't care what it is.
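+ * + * For example, the A7 core entry { .cap = 430, .power = 1024 } and the A15 + * core entry { .cap = 1024, .power = 6997 } below only need to share the + * same bogo-watt scale; their absolute values carry no unit.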
+ */ +static struct idle_state idle_states_cluster_a7[] = { + { .power = 25 }, /* arch_cpu_idle() (active idle) = WFI */ + { .power = 25 }, /* WFI */ + { .power = 10 }, /* cluster-sleep-l */ + }; + +static struct idle_state idle_states_cluster_a15[] = { + { .power = 70 }, /* arch_cpu_idle() (active idle) = WFI */ + { .power = 70 }, /* WFI */ + { .power = 25 }, /* cluster-sleep-b */ + }; + +static struct capacity_state cap_states_cluster_a7[] = { + /* Cluster only power */ + { .cap = 150, .power = 2967, }, /* 350 MHz */ + { .cap = 172, .power = 2792, }, /* 400 MHz */ + { .cap = 215, .power = 2810, }, /* 500 MHz */ + { .cap = 258, .power = 2815, }, /* 600 MHz */ + { .cap = 301, .power = 2919, }, /* 700 MHz */ + { .cap = 344, .power = 2847, }, /* 800 MHz */ + { .cap = 387, .power = 3917, }, /* 900 MHz */ + { .cap = 430, .power = 4905, }, /* 1000 MHz */ + }; + +static struct capacity_state cap_states_cluster_a15[] = { + /* Cluster only power */ + { .cap = 426, .power = 7920, }, /* 500 MHz */ + { .cap = 512, .power = 8165, }, /* 600 MHz */ + { .cap = 597, .power = 8172, }, /* 700 MHz */ + { .cap = 682, .power = 8195, }, /* 800 MHz */ + { .cap = 768, .power = 8265, }, /* 900 MHz */ + { .cap = 853, .power = 8446, }, /* 1000 MHz */ + { .cap = 938, .power = 11426, }, /* 1100 MHz */ + { .cap = 1024, .power = 15200, }, /* 1200 MHz */ + }; + +static struct sched_group_energy energy_cluster_a7 = { + .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a7), + .idle_states = idle_states_cluster_a7, + .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a7), + .cap_states = cap_states_cluster_a7, +}; + +static struct sched_group_energy energy_cluster_a15 = { + .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a15), + .idle_states = idle_states_cluster_a15, + .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a15), + .cap_states = cap_states_cluster_a15, +}; + +static struct idle_state idle_states_core_a7[] = { + { .power = 0 }, /* arch_cpu_idle (active idle) = WFI */ + { .power = 0 }, /* WFI */ + { .power = 0 }, /* cluster-sleep-l */ + }; + +static struct idle_state idle_states_core_a15[] = { + { .power = 0 }, /* arch_cpu_idle (active idle) = WFI */ + { .power = 0 }, /* WFI */ + { .power = 0 }, /* cluster-sleep-b */ + }; + +static struct capacity_state cap_states_core_a7[] = { + /* Power per cpu */ + { .cap = 150, .power = 187, }, /* 350 MHz */ + { .cap = 172, .power = 275, }, /* 400 MHz */ + { .cap = 215, .power = 334, }, /* 500 MHz */ + { .cap = 258, .power = 407, }, /* 600 MHz */ + { .cap = 301, .power = 447, }, /* 700 MHz */ + { .cap = 344, .power = 549, }, /* 800 MHz */ + { .cap = 387, .power = 761, }, /* 900 MHz */ + { .cap = 430, .power = 1024, }, /* 1000 MHz */ + }; + +static struct capacity_state cap_states_core_a15[] = { + /* Power per cpu */ + { .cap = 426, .power = 2021, }, /* 500 MHz */ + { .cap = 512, .power = 2312, }, /* 600 MHz */ + { .cap = 597, .power = 2756, }, /* 700 MHz */ + { .cap = 682, .power = 3125, }, /* 800 MHz */ + { .cap = 768, .power = 3524, }, /* 900 MHz */ + { .cap = 853, .power = 3846, }, /* 1000 MHz */ + { .cap = 938, .power = 5177, }, /* 1100 MHz */ + { .cap = 1024, .power = 6997, }, /* 1200 MHz */ + }; + +static struct sched_group_energy energy_core_a7 = { + .nr_idle_states = ARRAY_SIZE(idle_states_core_a7), + .idle_states = idle_states_core_a7, + .nr_cap_states = ARRAY_SIZE(cap_states_core_a7), + .cap_states = cap_states_core_a7, +}; + +static struct sched_group_energy energy_core_a15 = { + .nr_idle_states = ARRAY_SIZE(idle_states_core_a15), + .idle_states = idle_states_core_a15, + 
.nr_cap_states = ARRAY_SIZE(cap_states_core_a15), + .cap_states = cap_states_core_a15, +}; + +/* sd energy functions */ +static inline +const struct sched_group_energy * const cpu_cluster_energy(int cpu) +{ + return cpu_topology[cpu].socket_id ? &energy_cluster_a7 : + &energy_cluster_a15; +} + +static inline +const struct sched_group_energy * const cpu_core_energy(int cpu) +{ + return cpu_topology[cpu].socket_id ? &energy_core_a7 : + &energy_core_a15; +} + static inline int cpu_corepower_flags(void) { return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \ @@ -295,10 +416,9 @@ static inline int cpu_corepower_flags(void) static struct sched_domain_topology_level arm_topology[] = { #ifdef CONFIG_SCHED_MC - { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) }, - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) }, #endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) }, { NULL, }, }; From 962b7c10ab3a09cc30114d5ddf01abd9e6979278 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:43:37 +0000 Subject: [PATCH 0201/3220] DEBUG: sched/tune: add tracepoint for task boost signal Change-Id: I545d3bf5569fc41c0fa70f51dff9a19c11d532ee Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 30 ++++++++++++++++++++++++++++++ kernel/sched/fair.c | 2 ++ 2 files changed, 32 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 7ec9dcfc701a..606509500446 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -764,6 +764,36 @@ TRACE_EVENT(sched_tune_boostgroup_update, __entry->cpu, __entry->variation, __entry->max_boost) ); +/* + * Tracepoint for accounting task boosted utilization + */ +TRACE_EVENT(sched_boost_task, + + TP_PROTO(struct task_struct *tsk, unsigned long util, unsigned long margin), + + TP_ARGS(tsk, util, margin), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( unsigned long, util ) + __field( unsigned long, margin ) + + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->util = util; + __entry->margin = margin; + ), + + TP_printk("comm=%s pid=%d util=%lu margin=%lu", + __entry->comm, __entry->pid, + __entry->util, + __entry->margin) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3bc56e0ea1ab..3a1fd4d3a860 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5283,6 +5283,8 @@ boosted_task_util(struct task_struct *task) unsigned long util = task_util(task); unsigned long margin = schedtune_task_margin(task); + trace_sched_boost_task(task, util, margin); + return util + margin; } From 4525aa343e8b252dd365e603ad141639989d547c Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:47:21 +0000 Subject: [PATCH 0202/3220] DEBUG: sched/tune: add tracepoint for energy_diff() values Change-Id: Id8fafbd85f6d81248f322e073ee790a7ceec0bf7 Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 57 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 11 ++++++- 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 606509500446..13fb31cebf62 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -794,6 +794,63 @@ 
TRACE_EVENT(sched_boost_task, __entry->margin) ); +/* + * Tracepoint for accounting sched group energy + */ +TRACE_EVENT(sched_energy_diff, + + TP_PROTO(struct task_struct *tsk, int scpu, int dcpu, int udelta, + int nrgb, int nrga, int nrgd, int capb, int capa, int capd, + int nrgn, int nrgp), + + TP_ARGS(tsk, scpu, dcpu, udelta, + nrgb, nrga, nrgd, capb, capa, capd, + nrgn, nrgp), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, scpu ) + __field( int, dcpu ) + __field( int, udelta ) + __field( int, nrgb ) + __field( int, nrga ) + __field( int, nrgd ) + __field( int, capb ) + __field( int, capa ) + __field( int, capd ) + __field( int, nrgn ) + __field( int, nrgp ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->scpu = scpu; + __entry->dcpu = dcpu; + __entry->udelta = udelta; + __entry->nrgb = nrgb; + __entry->nrga = nrga; + __entry->nrgd = nrgd; + __entry->capb = capb; + __entry->capa = capa; + __entry->capd = capd; + __entry->nrgn = nrgn; + __entry->nrgp = nrgp; + ), + + TP_printk("pid=%d comm=%s " + "src_cpu=%d dst_cpu=%d usage_delta=%d " + "nrg_before=%d nrg_after=%d nrg_diff=%d " + "cap_before=%d cap_after=%d cap_delta=%d " + "nrg_delta=%d nrg_payoff=%d", + __entry->pid, __entry->comm, + __entry->scpu, __entry->dcpu, __entry->udelta, + __entry->nrgb, __entry->nrga, __entry->nrgd, + __entry->capb, __entry->capa, __entry->capd, + __entry->nrgn, __entry->nrgp) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3a1fd4d3a860..69cb6fb6793e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4998,6 +4998,7 @@ static int energy_diff(struct energy_env *eenv) struct sched_domain *sd; struct sched_group *sg; int sd_cpu = -1, energy_before = 0, energy_after = 0; + int result; struct energy_env eenv_before = { .util_delta = 0, @@ -5041,7 +5042,15 @@ static int energy_diff(struct energy_env *eenv) eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; eenv->payoff = 0; - return energy_diff_evaluate(eenv); + result = energy_diff_evaluate(eenv); + + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + eenv->nrg.delta, eenv->payoff); + + return result; } /* From ae54c7741f30a963d84134e92fd452b8252946a8 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 20 Jan 2016 14:06:05 +0000 Subject: [PATCH 0203/3220] DEBUG: sched/tune: add tracepoint on P-E space filtering Change-Id: I31dfed67c0486713b88efb75df767329f2802e06 Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 35 +++++++++++++++++++++++++++++++++++ kernel/sched/tune.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 13fb31cebf62..cb082a68cdd6 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -851,6 +851,41 @@ TRACE_EVENT(sched_energy_diff, __entry->nrgn, __entry->nrgp) ); +/* + * Tracepoint for schedtune_tasks_update + */ +TRACE_EVENT(sched_tune_filter, + + TP_PROTO(int nrg_delta, int cap_delta, + int nrg_gain, int cap_gain, + int payoff, int region), + + TP_ARGS(nrg_delta, cap_delta, nrg_gain, cap_gain, payoff, region), + + TP_STRUCT__entry( + __field( int, nrg_delta ) + __field( int, cap_delta ) + __field( int, nrg_gain ) + __field( 
int, cap_gain ) + __field( int, payoff ) + __field( int, region ) + ), + + TP_fast_assign( + __entry->nrg_delta = nrg_delta; + __entry->cap_delta = cap_delta; + __entry->nrg_gain = nrg_gain; + __entry->cap_gain = cap_gain; + __entry->payoff = payoff; + __entry->region = region; + ), + + TP_printk("nrg_delta=%d cap_delta=%d nrg_gain=%d cap_gain=%d payoff=%d region=%d", + __entry->nrg_delta, __entry->cap_delta, + __entry->nrg_gain, __entry->cap_gain, + __entry->payoff, __entry->region) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 7a434f2394e7..b40d40dc3c49 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -70,6 +70,13 @@ __schedtune_accept_deltas(int nrg_delta, int cap_delta, */ payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain; payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain; + + trace_sched_tune_filter( + nrg_delta, cap_delta, + threshold_gains[perf_boost_idx].nrg_gain, + threshold_gains[perf_boost_idx].cap_gain, + payoff, 8); + return payoff; } @@ -84,6 +91,13 @@ __schedtune_accept_deltas(int nrg_delta, int cap_delta, */ payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain; payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain; + + trace_sched_tune_filter( + nrg_delta, cap_delta, + threshold_gains[perf_constrain_idx].nrg_gain, + threshold_gains[perf_constrain_idx].cap_gain, + payoff, 6); + return payoff; } @@ -155,12 +169,16 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta, int perf_constrain_idx; /* Optimal (O) region */ - if (nrg_delta < 0 && cap_delta > 0) + if (nrg_delta < 0 && cap_delta > 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0); return INT_MAX; + } /* Suboptimal (S) region */ - if (nrg_delta > 0 && cap_delta < 0) + if (nrg_delta > 0 && cap_delta < 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5); return -INT_MAX; + } /* Get task specific perf Boost/Constraints indexes */ rcu_read_lock(); @@ -531,12 +549,16 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta, struct task_struct *task) { /* Optimal (O) region */ - if (nrg_delta < 0 && cap_delta > 0) + if (nrg_delta < 0 && cap_delta > 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0); return INT_MAX; + } /* Suboptimal (S) region */ - if (nrg_delta > 0 && cap_delta < 0) + if (nrg_delta > 0 && cap_delta < 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5); return -INT_MAX; + } return __schedtune_accept_deltas(nrg_delta, cap_delta, perf_boost_idx, perf_constrain_idx);
From 765c2ab363a02a71f9ef4c55a7f598e3dd401e0b Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 22 Jul 2016 11:35:59 +0100 Subject: [PATCH 0204/3220] FIXUP: sched: fix build for non-SMP target
Currently the build for a single-core (e.g. user-mode) Linux is broken and this configuration is required (at least) to run some network tests. The main issues with the current code on single-core systems are: 1. {se,rq}::sched_avg is not available nor maintained for !SMP systems This means that load and utilisation signals are NOT available in single core systems. All the EAS code depends on these signals. 2. sched_group_energy is also SMP dependent. Again this means that all the EAS setup and preparation code (energy model initialization) has to be properly guarded/disabled for !SMP systems. 3. SchedFreq depends on the utilization signal, which is not available on !SMP systems. 4.
SchedTune is useless on unicore systems if SchedFreq is not available. 5. WALT machinery is not required on single-core systems. This patch addresses all these issues by enforcing some constraints for single-core systems: a) WALT, SchedTune and SchedFreq are now dependent on SMP b) The default governor for !SMP systems is INTERACTIVE c) The energy model initialisation/build functions are built only for SMP systems d) Other minor code re-arrangements and CONFIG_SMP guarding to enable single core builds.
Signed-off-by: Patrick Bellasi --- drivers/cpufreq/Kconfig | 1 + include/linux/sched_energy.h | 8 ++++++++ include/trace/events/sched.h | 4 ++++ init/Kconfig | 1 + kernel/sched/Makefile | 4 ++-- kernel/sched/fair.c | 33 +++++++++++++++++++++++++++++---- kernel/sched/sched.h | 3 +-- 7 files changed, 46 insertions(+), 8 deletions(-)
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index af523c539af0..a2d63d86bb70 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -195,6 +195,7 @@ config CPU_FREQ_GOV_CONSERVATIVE config CPU_FREQ_GOV_SCHED bool "'sched' cpufreq governor" depends on CPU_FREQ + depends on SMP select CPU_FREQ_GOV_COMMON help 'sched' - this governor scales cpu frequency from the
diff --git a/include/linux/sched_energy.h b/include/linux/sched_energy.h index a3f1627ac609..1daf3e1f98a7 100644 --- a/include/linux/sched_energy.h +++ b/include/linux/sched_energy.h @@ -29,8 +29,16 @@ #define for_each_possible_sd_level(level) \ for (level = 0; level < NR_SD_LEVELS; level++) +#ifdef CONFIG_SMP + extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS]; void init_sched_energy_costs(void); +#else + +#define init_sched_energy_costs() do { } while (0) + +#endif /* CONFIG_SMP */ + #endif
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index cb082a68cdd6..07ed295d0f23 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -587,6 +587,8 @@ TRACE_EVENT(sched_contrib_scale_f, __entry->cpu_scale_factor) ); +#ifdef CONFIG_SMP + /* * Tracepoint for accounting sched averages for tasks. */ @@ -886,6 +888,8 @@ TRACE_EVENT(sched_tune_filter, __entry->payoff, __entry->region) ); +#endif /* CONFIG_SMP */ + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */
diff --git a/init/Kconfig b/init/Kconfig index 5d9097e2b805..facb2b587d5b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1256,6 +1256,7 @@ config SCHED_AUTOGROUP config SCHED_TUNE bool "Boosting for CFS tasks (EXPERIMENTAL)" + depends on SMP help This option enables the system-wide support for task boosting.
When this support is enabled a new sysctl interface is exposed to diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index c6a85f813dfd..174762d8695b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -12,9 +12,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o energy.o +obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 69cb6fb6793e..1b12b14756ec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4150,8 +4150,14 @@ static inline void hrtick_update(struct rq *rq) } #endif +#ifdef CONFIG_SMP +static bool cpu_overutilized(int cpu); static inline unsigned long boosted_cpu_util(int cpu); +#else +#define boosted_cpu_util(cpu) cpu_util(cpu) +#endif +#ifdef CONFIG_SMP static void update_capacity_of(int cpu) { unsigned long req_cap; @@ -4164,8 +4170,7 @@ static void update_capacity_of(int cpu) req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, req_cap); } - -static bool cpu_overutilized(int cpu); +#endif /* * The enqueue_task method is called before nr_running is @@ -4177,8 +4182,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; +#ifdef CONFIG_SMP int task_new = flags & ENQUEUE_WAKEUP_NEW; int task_wakeup = flags & ENQUEUE_WAKEUP; +#endif for_each_sched_entity(se) { if (se->on_rq) @@ -4210,8 +4217,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) { + if (!se) add_nr_running(rq, 1); + +#ifdef CONFIG_SMP + + if (!se) { if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; @@ -4228,6 +4239,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (task_new || task_wakeup) update_capacity_of(cpu_of(rq)); } +#endif /* CONFIG_SMP */ + hrtick_update(rq); } @@ -4285,8 +4298,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) { + if (!se) sub_nr_running(rq, 1); + +#ifdef CONFIG_SMP + + if (!se) { schedtune_dequeue_task(p, cpu_of(rq)); /* @@ -4304,6 +4321,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) set_cfs_cpu_capacity(cpu_of(rq), false, 0); } } + +#endif /* CONFIG_SMP */ + hrtick_update(rq); } @@ -5692,6 +5712,8 @@ static void task_dead_fair(struct task_struct *p) { remove_entity_load_avg(&p->se); } +#else +#define task_fits_max(p, cpu) true #endif /* CONFIG_SMP */ static unsigned long @@ -8716,10 +8738,13 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); +#ifdef CONFIG_SMP if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) rq->rd->overutilized = true; rq->misfit_task = !task_fits_max(curr, rq->cpu); +#endif + } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5713513adb9d..a06c19c48057 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1277,6 +1277,7 @@ extern const struct sched_class idle_sched_class; #ifdef CONFIG_SMP +extern void 
init_max_cpu_capacity(struct max_cpu_capacity *mcc); extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); @@ -1359,8 +1360,6 @@ unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); -extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc); - static inline void __add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; From d753e92e19660a6adcc3b8e0076a6c6078e5f59d Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Mon, 1 Aug 2016 11:34:05 +0100 Subject: [PATCH 0205/3220] sched/cpufreq_sched: Consolidated update Contains: sched/cpufreq_sched: use shorter throttle for raising OPP Avoid cases where a brief drop in load causes a change to a low OPP for the full throttle period. Use a shorter throttle period for raising OPP than for lowering OPP. sched-freq: Fix handling of max/min frequency This reverts commit 9726142608f5b3bf5df4280243c9d324e692a510. Change-Id: Ia78095354f7ad9492f00deb509a2b45112361eda sched/cpufreq: Increasing throttle_down_nsec to 50ms Change-Id: I2d8969cf2a64fa719b9dd86f43f9dd14b1ff84fe sched-freq: make throttle times tunable Change-Id: I127879645367425b273441d7f0306bb15d5633cb Signed-off-by: Srinath Sridharan Signed-off-by: Todd Kjos Signed-off-by: Juri Lelli [jstultz: Fwdported to 4.4] Signed-off-by: John Stultz --- drivers/cpufreq/Kconfig | 2 +- kernel/sched/cpufreq_sched.c | 175 +++++++++++++++++++++++++++++++---- 2 files changed, 160 insertions(+), 17 deletions(-) diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index a2d63d86bb70..d423f51d3276 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -105,7 +105,7 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE config CPU_FREQ_DEFAULT_GOV_SCHED bool "sched" - select CPU_FREQ_GOV_SCHED + select CPU_FREQ_GOV_INTERACTIVE help Use the CPUfreq governor 'sched' as default. This scales cpu frequency using CPU utilization estimates from the diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index e1d208e101ed..3f8c67a3ea0f 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -19,7 +19,8 @@ #include "sched.h" -#define THROTTLE_NSEC 50000000 /* 50ms default */ +#define THROTTLE_DOWN_NSEC 50000000 /* 50ms default */ +#define THROTTLE_UP_NSEC 500000 /* 500us default */ struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE; static bool __read_mostly cpufreq_driver_slow; @@ -33,8 +34,10 @@ DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); /** * gov_data - per-policy data internal to the governor - * @throttle: next throttling period expiry. Derived from throttle_nsec - * @throttle_nsec: throttle period length in nanoseconds + * @up_throttle: next throttling period expiry if increasing OPP + * @down_throttle: next throttling period expiry if decreasing OPP + * @up_throttle_nsec: throttle period length in nanoseconds if increasing OPP + * @down_throttle_nsec: throttle period length in nanoseconds if decreasing OPP * @task: worker thread for dvfs transition that may block/sleep * @irq_work: callback used to wake up worker thread * @requested_freq: last frequency requested by the sched governor @@ -48,11 +51,14 @@ DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); * call down_write(policy->rwsem). 
*/ struct gov_data { - ktime_t throttle; - unsigned int throttle_nsec; + ktime_t up_throttle; + ktime_t down_throttle; + unsigned int up_throttle_nsec; + unsigned int down_throttle_nsec; struct task_struct *task; struct irq_work irq_work; unsigned int requested_freq; + int max; }; static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, @@ -66,25 +72,29 @@ static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); - gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec); + gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec); + gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec); up_write(&policy->rwsem); } -static bool finish_last_request(struct gov_data *gd) +static bool finish_last_request(struct gov_data *gd, unsigned int cur_freq) { ktime_t now = ktime_get(); - if (ktime_after(now, gd->throttle)) + ktime_t throttle = gd->requested_freq < cur_freq ? + gd->down_throttle : gd->up_throttle; + + if (ktime_after(now, throttle)) return false; while (1) { - int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now)); + int usec_left = ktime_to_ns(ktime_sub(throttle, now)); usec_left /= NSEC_PER_USEC; trace_cpufreq_sched_throttled(usec_left); usleep_range(usec_left, usec_left + 100); now = ktime_get(); - if (ktime_after(now, gd->throttle)) + if (ktime_after(now, throttle)) return true; } } @@ -128,7 +138,7 @@ static int cpufreq_sched_thread(void *data) * if the frequency thread sleeps while waiting to be * unthrottled, start over to check for a newer request */ - if (finish_last_request(gd)) + if (finish_last_request(gd, policy->cur)) continue; last_request = new_request; cpufreq_sched_try_driver_target(policy, new_request); @@ -183,16 +193,21 @@ static void update_fdomain_capacity_request(int cpu) } /* Convert the new maximum capacity request into a cpu frequency */ - freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; + freq_new = capacity * gd->max >> SCHED_CAPACITY_SHIFT; if (cpufreq_frequency_table_target(policy, policy->freq_table, freq_new, CPUFREQ_RELATION_L, &index_new)) goto out; freq_new = policy->freq_table[index_new].frequency; + if (freq_new > policy->max) + freq_new = policy->max; + + if (freq_new < policy->min) + freq_new = policy->min; + trace_cpufreq_sched_request_opp(cpu, capacity, freq_new, gd->requested_freq); - if (freq_new == gd->requested_freq) goto out; @@ -246,10 +261,17 @@ static inline void clear_sched_freq(void) static_key_slow_dec(&__sched_freq); } +static struct attribute_group sched_attr_group_gov_pol; +static struct attribute_group *get_sysfs_attr(void) +{ + return &sched_attr_group_gov_pol; +} + static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) { struct gov_data *gd; int cpu; + int rc; for_each_cpu(cpu, policy->cpus) memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0, @@ -259,11 +281,20 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) if (!gd) return -ENOMEM; - gd->throttle_nsec = policy->cpuinfo.transition_latency ? + gd->up_throttle_nsec = policy->cpuinfo.transition_latency ? 
policy->cpuinfo.transition_latency : - THROTTLE_NSEC; + THROTTLE_UP_NSEC; + gd->down_throttle_nsec = THROTTLE_DOWN_NSEC; pr_debug("%s: throttle threshold = %u [ns]\n", - __func__, gd->throttle_nsec); + __func__, gd->up_throttle_nsec); + + gd->max = policy->max; + + rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + if (rc) { + pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc); + goto err; + } if (cpufreq_driver_is_slow()) { cpufreq_driver_slow = true; @@ -301,6 +332,8 @@ static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) put_task_struct(gd->task); } + sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + policy->governor_data = NULL; kfree(gd); @@ -317,6 +350,32 @@ static int cpufreq_sched_start(struct cpufreq_policy *policy) return 0; } +static void cpufreq_sched_limits(struct cpufreq_policy *policy) +{ + struct gov_data *gd; + + pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n", + policy->cpu, policy->min, policy->max, + policy->cur); + + if (!down_write_trylock(&policy->rwsem)) + return; + /* + * Need to keep track of highest max frequency for + * capacity calculations + */ + gd = policy->governor_data; + if (gd->max < policy->max) + gd->max = policy->max; + + if (policy->max < policy->cur) + __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); + else if (policy->min > policy->cur) + __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); + + up_write(&policy->rwsem); +} + static int cpufreq_sched_stop(struct cpufreq_policy *policy) { int cpu; @@ -340,11 +399,95 @@ static int cpufreq_sched_setup(struct cpufreq_policy *policy, case CPUFREQ_GOV_STOP: return cpufreq_sched_stop(policy); case CPUFREQ_GOV_LIMITS: + cpufreq_sched_limits(policy); break; } return 0; } +/* Tunables */ +static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf) +{ + return sprintf(buf, "%u\n", gd->up_throttle_nsec); +} + +static ssize_t store_up_throttle_nsec(struct gov_data *gd, + const char *buf, size_t count) +{ + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + gd->up_throttle_nsec = val; + return count; +} + +static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf) +{ + return sprintf(buf, "%u\n", gd->down_throttle_nsec); +} + +static ssize_t store_down_throttle_nsec(struct gov_data *gd, + const char *buf, size_t count) +{ + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + gd->down_throttle_nsec = val; + return count; +} + +/* + * Create show/store routines + * - sys: One governor instance for complete SYSTEM + * - pol: One governor instance per struct cpufreq_policy + */ +#define show_gov_pol_sys(file_name) \ +static ssize_t show_##file_name##_gov_pol \ +(struct cpufreq_policy *policy, char *buf) \ +{ \ + return show_##file_name(policy->governor_data, buf); \ +} + +#define store_gov_pol_sys(file_name) \ +static ssize_t store_##file_name##_gov_pol \ +(struct cpufreq_policy *policy, const char *buf, size_t count) \ +{ \ + return store_##file_name(policy->governor_data, buf, count); \ +} + +#define gov_pol_attr_rw(_name) \ + static struct freq_attr _name##_gov_pol = \ + __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol) + +#define show_store_gov_pol_sys(file_name) \ + show_gov_pol_sys(file_name); \ + store_gov_pol_sys(file_name) +#define tunable_handlers(file_name) \ + show_gov_pol_sys(file_name); \ + store_gov_pol_sys(file_name); \ + 
gov_pol_attr_rw(file_name) + +tunable_handlers(down_throttle_nsec); +tunable_handlers(up_throttle_nsec); + +/* Per policy governor instance */ +static struct attribute *sched_attributes_gov_pol[] = { + &up_throttle_nsec_gov_pol.attr, + &down_throttle_nsec_gov_pol.attr, + NULL, +}; + +static struct attribute_group sched_attr_group_gov_pol = { + .attrs = sched_attributes_gov_pol, + .name = "sched", +}; + #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED static #endif From 2e9abbc942e6211f9ac4dadb14debc39bb9d1418 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Thu, 14 Jul 2016 09:57:29 +0100 Subject: [PATCH 0206/3220] sched: EAS: take cstate into account when selecting idle core Introduce a new sysctl for this option, 'sched_cstate_aware'. When this is enabled, select_idle_sibling in CFS is modified to choose the idle CPU in the sibling group which has the lowest idle state index - idle state indexes are assumed to increase as sleep depth and hence wakeup latency increase. In this way, we attempt to minimise wakeup latency when an idle CPU is required. Signed-off-by: Srinath Sridharan Includes: sched: EAS: fix select_idle_sibling when sysctl_sched_cstate_aware is enabled, best_idle cpu will not be chosen in the original flow because it will goto done directly Bug: 30107557 Change-Id: Ie09c2e3960cafbb976f8d472747faefab3b4d6ac Signed-off-by: martin_liu --- include/linux/sched/sysctl.h | 1 + kernel/sched/fair.c | 55 +++++++++++++++++++++++++++--------- kernel/sysctl.c | 7 +++++ 3 files changed, 50 insertions(+), 13 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 4479e48c7712..7d021393b0da 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -39,6 +39,7 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; +extern unsigned int sysctl_sched_cstate_aware; enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1b12b14756ec..14c8527d7bcb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -51,6 +51,7 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_cstate_aware = 1; /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -5468,15 +5469,20 @@ static int select_idle_sibling(struct task_struct *p, int target) struct sched_domain *sd; struct sched_group *sg; int i = task_cpu(p); + int best_idle = -1; + int best_idle_cstate = -1; + int best_idle_capacity = INT_MAX; - if (idle_cpu(target)) - return target; + if (!sysctl_sched_cstate_aware) { + if (idle_cpu(target)) + return target; - /* - * If the prevous cpu is cache affine and idle, don't be stupid. - */ - if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) - return i; + /* + * If the prevous cpu is cache affine and idle, don't be stupid. + */ + if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) + return i; + } /* * Otherwise, iterate the domains and find an elegible idle cpu. 
@@ -5489,18 +5495,41 @@ static int select_idle_sibling(struct task_struct *p, int target) tsk_cpus_allowed(p))) goto next; - for_each_cpu(i, sched_group_cpus(sg)) { - if (i == target || !idle_cpu(i)) - goto next; - } + if (sysctl_sched_cstate_aware) { + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + struct rq *rq = cpu_rq(i); + int idle_idx = idle_get_state_idx(rq); + unsigned long new_usage = boosted_task_util(p); + unsigned long capacity_orig = capacity_orig_of(i); + if (new_usage > capacity_orig || !idle_cpu(i)) + goto next; - target = cpumask_first_and(sched_group_cpus(sg), + if (i == target && new_usage <= capacity_curr_of(target)) + return target; + + if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) { + best_idle = i; + best_idle_cstate = idle_idx; + best_idle_capacity = capacity_orig; + } + } + } else { + for_each_cpu(i, sched_group_cpus(sg)) { + if (i == target || !idle_cpu(i)) + goto next; + } + + target = cpumask_first_and(sched_group_cpus(sg), tsk_cpus_allowed(p)); - goto done; + goto done; + } next: sg = sg->next; } while (sg != sd->groups); } + if (best_idle > 0) + target = best_idle; + done: return target; }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 98c0f6ef01f1..cc0bf3ecdbda 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -303,6 +303,13 @@ static struct ctl_table kern_table[] = { .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, + { + .procname = "sched_cstate_aware", + .data = &sysctl_sched_cstate_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_wakeup_granularity_ns", .data = &sysctl_sched_wakeup_granularity,
From 4a5e890ec60d2e341fa560d8149997f5f0c48d67 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 29 Jul 2016 14:04:11 +0100 Subject: [PATCH 0207/3220] sched/fair: add tunable to force selection at cpu granularity
EAS assumes that clusters with smaller capacity cores are more energy-efficient. This may not be true on non-big-little devices, so EAS can make incorrect cluster selections when finding a CPU to wake. The "sched_is_big_little" hint can be used to cause a cpu-based selection instead of cluster-based selection. This change also incorporates the sync hint enable patch: EAS did not honour synchronous wakeup hints, so a new sysctl is created to ask EAS to use this information when selecting a CPU. The control is called "sched_sync_hint_enable".
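A minimal usage sketch (illustrative only; both controls are plain proc_dointvec entries in kern_table, so they should surface under /proc/sys/kernel/):

  # non-big-little device: pick CPUs at cpu rather than cluster granularity
  echo 0 > /proc/sys/kernel/sched_is_big_little
  # let EAS honour synchronous wakeup hints
  echo 1 > /proc/sys/kernel/sched_sync_hint_enable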
Also contains: EAS: sched/fair: for SMP bias toward idle core with capacity For SMP devices, on wakeup bias towards idle cores that have capacity vs busy devices that need a higher OPP eas: favor idle cpus for boosted tasks BUG: 29533997 BUG: 29512132 Change-Id: I0cc9a1b1b88fb52916f18bf2d25715bdc3634f9c Signed-off-by: Juri Lelli Signed-off-by: Srinath Sridharan eas/sched/fair: Favoring busy cpus with low OPPs BUG: 29533997 BUG: 29512132 Change-Id: I9305b3239698d64278db715a2e277ea0bb4ece79 Signed-off-by: Juri Lelli --- include/linux/sched/sysctl.h | 2 + kernel/sched/fair.c | 191 +++++++++++++++++++++++++++-------- kernel/sysctl.c | 14 +++ 3 files changed, 167 insertions(+), 40 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 7d021393b0da..4883dcf3e1a9 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -39,6 +39,8 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; +extern unsigned int sysctl_sched_is_big_little; +extern unsigned int sysctl_sched_sync_hint_enable; extern unsigned int sysctl_sched_cstate_aware; enum sched_tunable_scaling { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 14c8527d7bcb..a22df661aaa6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -51,7 +51,10 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_is_big_little = 0; +unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_cstate_aware = 1; + /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -5534,7 +5537,97 @@ done: return target; } -static int energy_aware_wake_cpu(struct task_struct *p, int target) +static inline int find_best_target(struct task_struct *p) +{ + int i, boosted; + int target_cpu = -1; + int target_capacity = 0; + int backup_capacity = 0; + int idle_cpu = -1; + int best_idle_cstate = INT_MAX; + int backup_cpu = -1; + unsigned long task_util_boosted, new_util; + + /* + * Favor 1) busy cpu with most capacity at current OPP + * 2) idle_cpu with capacity at current OPP + * 3) busy cpu with capacity at higher OPP + */ +#ifdef CONFIG_CGROUP_SCHEDTUNE + boosted = schedtune_task_boost(p); +#else + boosted = 0; +#endif + task_util_boosted = boosted_task_util(p); + for_each_cpu(i, tsk_cpus_allowed(p)) { + int cur_capacity = capacity_curr_of(i); + struct rq *rq = cpu_rq(i); + int idle_idx = idle_get_state_idx(rq); + + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + new_util = cpu_util(i) + task_util_boosted; + + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ + + if (new_util > capacity_orig_of(i)) + continue; + + /* + * For boosted tasks we favor idle cpus unconditionally to + * improve latency. 
+ */ + if (idle_idx >= 0 && boosted) { + if (idle_cpu < 0 || + (sysctl_sched_cstate_aware && + best_idle_cstate > idle_idx)) { + best_idle_cstate = idle_idx; + idle_cpu = i; + } + continue; + } + + if (new_util < cur_capacity) { + if (cpu_rq(i)->nr_running) { + if (target_capacity == 0 || + target_capacity > cur_capacity) { + /* busy CPU with most capacity at current OPP */ + target_cpu = i; + target_capacity = cur_capacity; + } + } else if (!boosted) { + if (idle_cpu < 0 || + (sysctl_sched_cstate_aware && + best_idle_cstate > idle_idx)) { + best_idle_cstate = idle_idx; + idle_cpu = i; + } + } + } else if (backup_capacity == 0 || + backup_capacity > cur_capacity) { + /* first busy CPU with capacity at higher OPP */ + backup_capacity = cur_capacity; + backup_cpu = i; + } + } + + if (!boosted && target_cpu < 0) { + target_cpu = idle_cpu >= 0 ? idle_cpu : backup_cpu; + } + + if (boosted && idle_cpu >= 0) + target_cpu = idle_cpu; + return target_cpu; +} + +static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) { struct sched_domain *sd; struct sched_group *sg, *sg_target; @@ -5542,6 +5635,14 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) int target_cpu = task_cpu(p); int i; + if (sysctl_sched_sync_hint_enable && sync) { + int cpu = smp_processor_id(); + cpumask_t search_cpus; + cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask); + if (cpumask_test_cpu(cpu, &search_cpus)) + return cpu; + } + sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p))); if (!sd) @@ -5550,50 +5651,60 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) sg = sd->groups; sg_target = sg; - /* - * Find group with sufficient capacity. We only get here if no cpu is - * overutilized. We may end up overutilizing a cpu by adding the task, - * but that should not be any worse than select_idle_sibling(). - * load_balance() should sort it out later as we get above the tipping - * point. - */ - do { - /* Assuming all cpus are the same in group */ - int max_cap_cpu = group_first_cpu(sg); + if (sysctl_sched_is_big_little) { /* - * Assume smaller max capacity means more energy-efficient. - * Ideally we should query the energy model for the right - * answer but it easily ends up in an exhaustive search. + * Find group with sufficient capacity. We only get here if no cpu is + * overutilized. We may end up overutilizing a cpu by adding the task, + * but that should not be any worse than select_idle_sibling(). + * load_balance() should sort it out later as we get above the tipping + * point. */ - if (capacity_of(max_cap_cpu) < target_max_cap && - task_fits_max(p, max_cap_cpu)) { - sg_target = sg; - target_max_cap = capacity_of(max_cap_cpu); - } - } while (sg = sg->next, sg != sd->groups); + do { + /* Assuming all cpus are the same in group */ + int max_cap_cpu = group_first_cpu(sg); - /* Find cpu with sufficient capacity */ - for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { + /* + * Assume smaller max capacity means more energy-efficient. + * Ideally we should query the energy model for the right + * answer but it easily ends up in an exhaustive search. 
+ */ + if (capacity_of(max_cap_cpu) < target_max_cap && + task_fits_max(p, max_cap_cpu)) { + sg_target = sg; + target_max_cap = capacity_of(max_cap_cpu); + } + } while (sg = sg->next, sg != sd->groups); + + /* Find cpu with sufficient capacity */ + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + int new_util = cpu_util(i) + boosted_task_util(p); + + if (new_util > capacity_orig_of(i)) + continue; + + if (new_util < capacity_curr_of(i)) { + target_cpu = i; + if (cpu_rq(i)->nr_running) + break; + } + + /* cpu has capacity at higher OPP, keep it as fallback */ + if (target_cpu == task_cpu(p)) + target_cpu = i; + } + } else { /* - * p's blocked utilization is still accounted for on prev_cpu - * so prev_cpu will receive a negative bias due to the double - * accounting. However, the blocked utilization may be zero. + * Find a cpu with sufficient capacity */ - int new_util = cpu_util(i) + boosted_task_util(p); - - if (new_util > capacity_orig_of(i)) - continue; - - if (new_util < capacity_curr_of(i)) { - target_cpu = i; - if (cpu_rq(i)->nr_running) - break; - } - - /* cpu has capacity at higher OPP, keep it as fallback */ - if (target_cpu == task_cpu(p)) - target_cpu = i; + int tmp_target = find_best_target(p); + if (tmp_target >= 0) + target_cpu = tmp_target; } if (target_cpu != task_cpu(p)) { @@ -5670,7 +5781,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (!sd) { if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) - new_cpu = energy_aware_wake_cpu(p, prev_cpu); + new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync); else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, new_cpu);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cc0bf3ecdbda..4671761ae3c5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -303,6 +303,20 @@ static struct ctl_table kern_table[] = { .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, + { + .procname = "sched_is_big_little", + .data = &sysctl_sched_is_big_little, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_sync_hint_enable", + .data = &sysctl_sched_sync_hint_enable, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_cstate_aware", .data = &sysctl_sched_cstate_aware,
From c50cc2299cb1a9ae769c955f5dd09a9648b1600e Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 11 Mar 2016 16:44:16 -0800 Subject: [PATCH 0208/3220] sched/fair: add tunable to set initial task load
The choice of initial task load upon fork has a large influence on CPU and OPP selection when scheduler-driven DVFS is in use. Make this tuneable by adding a new sysctl "sched_initial_task_util". If the sched governor is not used, the default remains at SCHED_LOAD_SCALE. Otherwise, the value from the sysctl is used. This defaults to 0.
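A minimal usage sketch (illustrative only; the control is a plain proc_dointvec entry in kern_table, so it should surface under /proc/sys/kernel/, and it is only consulted when the sched governor is active):

  # start newly forked tasks at half capacity instead of the default 0
  # (assuming SCHED_LOAD_SCALE = 1024)
  echo 512 > /proc/sys/kernel/sched_initial_task_util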
Signed-off-by: "Todd Kjos " --- include/linux/sched/sysctl.h | 1 + kernel/sched/fair.c | 5 ++++- kernel/sysctl.c | 7 +++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 4883dcf3e1a9..2834841c507e 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -41,6 +41,7 @@ extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_is_big_little; extern unsigned int sysctl_sched_sync_hint_enable; +extern unsigned int sysctl_sched_initial_task_util; extern unsigned int sysctl_sched_cstate_aware; enum sched_tunable_scaling { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a22df661aaa6..88aca75a6659 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -53,6 +53,7 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL; unsigned int sysctl_sched_is_big_little = 0; unsigned int sysctl_sched_sync_hint_enable = 1; +unsigned int sysctl_sched_initial_task_util = 0; unsigned int sysctl_sched_cstate_aware = 1; /* @@ -687,7 +688,9 @@ void init_entity_runnable_average(struct sched_entity *se) sa->period_contrib = 1023; sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); + sa->util_avg = sched_freq() ? + sysctl_sched_initial_task_util : + scale_load_down(SCHED_LOAD_SCALE); sa->util_sum = sa->util_avg * LOAD_AVG_MAX; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4671761ae3c5..4457e107ec20 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -317,6 +317,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "sched_initial_task_util", + .data = &sysctl_sched_initial_task_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_cstate_aware", .data = &sysctl_sched_cstate_aware, From 28e8cb961c2f71a89e391335a9304c3dd8b38d8f Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 16:39:27 +0100 Subject: [PATCH 0209/3220] FIX: sched/tune: update usage of boosted task utilisation on CPU selection A boosted task needs to be scheduled on a CPU which can grant a minimum capacity which is higher than its utilization. However, a task can be allocated on a CPU which already provides an utilization which is higher than the task boosted utilization itself. Moreover, with the previous approach a task 100% boosted is not fitting any CPU. This patch makes use of the boosted task utilization just as a threashold which defines the minimum capacity should be available on a CPU to host that task. 
Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 88aca75a6659..9e1935b6e340 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5636,6 +5636,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) struct sched_group *sg, *sg_target; int target_max_cap = INT_MAX; int target_cpu = task_cpu(p); + unsigned long task_util_boosted, new_util; int i; if (sysctl_sched_sync_hint_enable && sync) { @@ -5679,6 +5680,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) } } while (sg = sg->next, sg != sd->groups); + task_util_boosted = boosted_task_util(p); /* Find cpu with sufficient capacity */ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { /* @@ -5686,8 +5688,13 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. */ - int new_util = cpu_util(i) + boosted_task_util(p); + new_util = cpu_util(i) + task_util_boosted; + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ if (new_util > capacity_orig_of(i)) continue;
From 6ba071d89dd72b080b9f0e4abf587cad99d5320b Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 29 Jul 2016 15:45:57 +0100 Subject: [PATCH 0210/3220] FIX: sched/tune: move schedtune_normalize_energy into fair.c
The energy normalization function is required to get the proper values for the P-E space filtering function to work. That normalization is part of the hot wakeup path and is currently implemented with a function call. Moving the normalization function into fair.c allows the compiler to further optimize that code by reducing overheads in the wakeup hot path.
Signed-off-by: Patrick Bellasi [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- kernel/sched/fair.c | 121 ++++++++++++++++++++++++++++---------------- kernel/sched/tune.c | 42 +--------------- kernel/sched/tune.h | 12 ++++- 3 files changed, 91 insertions(+), 84 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9e1935b6e340..2b23dfefe6f1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4975,44 +4975,6 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } -#ifdef CONFIG_SCHED_TUNE -static int energy_diff_evaluate(struct energy_env *eenv) -{ - unsigned int boost; - int nrg_delta; - - /* Return energy diff when boost margin is 0 */ -#ifdef CONFIG_CGROUP_SCHEDTUNE - boost = schedtune_task_boost(eenv->task); -#else - boost = get_sysctl_sched_cfs_boost(); -#endif - if (boost == 0) - return eenv->nrg.diff; - - /* Compute normalized energy diff */ - nrg_delta = schedtune_normalize_energy(eenv->nrg.diff); - eenv->nrg.delta = nrg_delta; - - eenv->payoff = schedtune_accept_deltas( - eenv->nrg.delta, - eenv->cap.delta, - eenv->task); - - /* - * When SchedTune is enabled, the energy_diff() function will return - * the computed energy payoff value.
Since the energy_diff() return - * value is expected to be negative by its callers, this evaluation - * function return a negative value each time the evaluation return a - * positive payoff, which is the condition for the acceptance of - * a scheduling decision - */ - return -eenv->payoff; -} -#else /* CONFIG_SCHED_TUNE */ -#define energy_diff_evaluate(eenv) eenv->nrg.diff -#endif - /* * energy_diff(): Estimate the energy impact of changing the utilization * distribution. eenv specifies the change: utilisation amount, source, and @@ -5020,12 +4982,11 @@ static int energy_diff_evaluate(struct energy_env *eenv) * utilization is removed from or added to the system (e.g. task wake-up). If * both are specified, the utilization is migrated. */ -static int energy_diff(struct energy_env *eenv) +static inline int __energy_diff(struct energy_env *eenv) { struct sched_domain *sd; struct sched_group *sg; int sd_cpu = -1, energy_before = 0, energy_after = 0; - int result; struct energy_env eenv_before = { .util_delta = 0, @@ -5069,17 +5030,91 @@ static int energy_diff(struct energy_env *eenv) eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; eenv->payoff = 0; - result = energy_diff_evaluate(eenv); - trace_sched_energy_diff(eenv->task, eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, eenv->cap.before, eenv->cap.after, eenv->cap.delta, eenv->nrg.delta, eenv->payoff); - return result; + return eenv->nrg.diff; } +#ifdef CONFIG_SCHED_TUNE + +struct target_nrg schedtune_target_nrg; + +/* + * System energy normalization + * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], + * corresponding to the specified energy variation. + */ +static inline int +normalize_energy(int energy_diff) +{ + u32 normalized_nrg; +#ifdef CONFIG_SCHED_DEBUG + int max_delta; + + /* Check for boundaries */ + max_delta = schedtune_target_nrg.max_power; + max_delta -= schedtune_target_nrg.min_power; + WARN_ON(abs(energy_diff) >= max_delta); +#endif + + /* Do scaling using positive numbers to increase the range */ + normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; + + /* Scale by energy magnitude */ + normalized_nrg <<= SCHED_LOAD_SHIFT; + + /* Normalize on max energy for target platform */ + normalized_nrg = reciprocal_divide( + normalized_nrg, schedtune_target_nrg.rdiv); + + return (energy_diff < 0) ? -normalized_nrg : normalized_nrg; +} + +static inline int +energy_diff(struct energy_env *eenv) +{ + unsigned int boost; + int nrg_delta; + + /* Conpute "absolute" energy diff */ + __energy_diff(eenv); + + /* Return energy diff when boost margin is 0 */ +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_task_boost(eenv->task); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif + if (boost == 0) + return eenv->nrg.diff; + + /* Compute normalized energy diff */ + nrg_delta = normalize_energy(eenv->nrg.diff); + eenv->nrg.delta = nrg_delta; + + eenv->payoff = schedtune_accept_deltas( + eenv->nrg.delta, + eenv->cap.delta, + eenv->task); + + /* + * When SchedTune is enabled, the energy_diff() function will return + * the computed energy payoff value. 
Since the energy_diff() return + * value is expected to be negative by its callers, this evaluation + * function return a negative value each time the evaluation return a + * positive payoff, which is the condition for the acceptance of + * a scheduling decision + */ + return -eenv->payoff; +} +#else /* CONFIG_SCHED_TUNE */ +#define energy_diff(eenv) __energy_diff(eenv) +#endif + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index b40d40dc3c49..8ca8db2de818 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -3,24 +3,17 @@ #include #include #include -#include #include #include #include #include "sched.h" +#include "tune.h" unsigned int sysctl_sched_cfs_boost __read_mostly; -/* - * System energy normalization constants - */ -static struct target_nrg { - unsigned long min_power; - unsigned long max_power; - struct reciprocal_value rdiv; -} schedtune_target_nrg; +extern struct target_nrg schedtune_target_nrg; /* Performance Boost region (B) threshold params */ static int perf_boost_idx; @@ -587,37 +580,6 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, return 0; } -/* - * System energy normalization - * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], - * corresponding to the specified energy variation. - */ -int -schedtune_normalize_energy(int energy_diff) -{ - u32 normalized_nrg; - int max_delta; - -#ifdef CONFIG_SCHED_DEBUG - /* Check for boundaries */ - max_delta = schedtune_target_nrg.max_power; - max_delta -= schedtune_target_nrg.min_power; - WARN_ON(abs(energy_diff) >= max_delta); -#endif - - /* Do scaling using positive numbers to increase the range */ - normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; - - /* Scale by energy magnitude */ - normalized_nrg <<= SCHED_LOAD_SHIFT; - - /* Normalize on max energy for target platform */ - normalized_nrg = reciprocal_divide( - normalized_nrg, schedtune_target_nrg.rdiv); - - return (energy_diff < 0) ? -normalized_nrg : normalized_nrg; -} - #ifdef CONFIG_SCHED_DEBUG static void schedtune_test_nrg(unsigned long delta_pwr) diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index f7273a5d994a..7d2aa7951554 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -1,6 +1,17 @@ #ifdef CONFIG_SCHED_TUNE +#include + +/* + * System energy normalization constants + */ +struct target_nrg { + unsigned long min_power; + unsigned long max_power; + struct reciprocal_value rdiv; +}; + #ifdef CONFIG_CGROUP_SCHEDTUNE int schedtune_cpu_boost(int cpu); @@ -25,7 +36,6 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta, #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) -#define schedtune_normalize_energy(energy) energy #define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta #endif /* CONFIG_SCHED_TUNE */ From 00aae8d5d5cd6f28d7603e0c1c4ac5cf91cb4aa3 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Thu, 28 Jul 2016 17:28:55 +0100 Subject: [PATCH 0211/3220] sched/tune: Add support for negative boost values Change-Id: I164ee04ba98c3a776605f18cb65ee61b3e917939 Contains also: eas/stune: schedtune cpu boost_max must be non-negative. This is to avoid under-accounting cpu capacity which may cause task stacking and frequency spikes. 
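[editor's note: to make the negative-boost arithmetic above concrete, here is a minimal standalone C sketch of the signal proportional compensation (SPC) rule implemented by schedtune_margin(). It is illustrative only, not part of the patch: SCHED_LOAD_SCALE is assumed to be 1024 and spc_margin() is a hypothetical name. A positive boost adds a fraction of the idle headroom (SCHED_LOAD_SCALE - S); a negative boost removes the same fraction of the signal itself.]

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024LL

static long long spc_margin(unsigned long signal, long boost)
{
	long long margin;

	if (boost >= 0)
		margin = (SCHED_LOAD_SCALE - (long long)signal) * boost;
	else
		margin = -((long long)signal) * boost;	/* positive magnitude */

	/* Fast integer division by 100, as in the patch: x * 1311 >> 17 */
	margin *= 1311;
	margin >>= 17;

	return boost < 0 ? -margin : margin;
}

int main(void)
{
	/* With util 512 out of 1024: +50% boost adds 256, -50% removes 256 */
	printf("%lld\n", spc_margin(512, 50));	/* 256 */
	printf("%lld\n", spc_margin(512, -50));	/* -256 */
	return 0;
}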
Change-Id: Ie1c1cbd52a6edb77b4c15a830030aa748dff6f29 --- include/trace/events/sched.h | 20 +++++++++---------- kernel/sched/fair.c | 37 ++++++++++++++++++++---------------- kernel/sched/tune.c | 25 +++++++++++++++--------- 3 files changed, 47 insertions(+), 35 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 07ed295d0f23..7dc39dd384de 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -682,14 +682,14 @@ TRACE_EVENT(sched_tune_config, */ TRACE_EVENT(sched_boost_cpu, - TP_PROTO(int cpu, unsigned long util, unsigned long margin), + TP_PROTO(int cpu, unsigned long util, long margin), TP_ARGS(cpu, util, margin), TP_STRUCT__entry( __field( int, cpu ) __field( unsigned long, util ) - __field( unsigned long, margin ) + __field(long, margin ) ), TP_fast_assign( @@ -698,7 +698,7 @@ TRACE_EVENT(sched_boost_cpu, __entry->margin = margin; ), - TP_printk("cpu=%d util=%lu margin=%lu", + TP_printk("cpu=%d util=%lu margin=%ld", __entry->cpu, __entry->util, __entry->margin) @@ -710,7 +710,7 @@ TRACE_EVENT(sched_boost_cpu, TRACE_EVENT(sched_tune_tasks_update, TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx, - unsigned int boost, unsigned int max_boost), + int boost, int max_boost), TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost), @@ -720,8 +720,8 @@ TRACE_EVENT(sched_tune_tasks_update, __field( int, cpu ) __field( int, tasks ) __field( int, idx ) - __field( unsigned int, boost ) - __field( unsigned int, max_boost ) + __field( int, boost ) + __field( int, max_boost ) ), TP_fast_assign( @@ -735,7 +735,7 @@ TRACE_EVENT(sched_tune_tasks_update, ), TP_printk("pid=%d comm=%s " - "cpu=%d tasks=%d idx=%d boost=%u max_boost=%u", + "cpu=%d tasks=%d idx=%d boost=%d max_boost=%d", __entry->pid, __entry->comm, __entry->cpu, __entry->tasks, __entry->idx, __entry->boost, __entry->max_boost) @@ -771,7 +771,7 @@ TRACE_EVENT(sched_tune_boostgroup_update, */ TRACE_EVENT(sched_boost_task, - TP_PROTO(struct task_struct *tsk, unsigned long util, unsigned long margin), + TP_PROTO(struct task_struct *tsk, unsigned long util, long margin), TP_ARGS(tsk, util, margin), @@ -779,7 +779,7 @@ TRACE_EVENT(sched_boost_task, __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( unsigned long, util ) - __field( unsigned long, margin ) + __field( long, margin ) ), @@ -790,7 +790,7 @@ TRACE_EVENT(sched_boost_task, __entry->margin = margin; ), - TP_printk("comm=%s pid=%d util=%lu margin=%lu", + TP_printk("comm=%s pid=%d util=%lu margin=%ld", __entry->comm, __entry->pid, __entry->util, __entry->margin) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2b23dfefe6f1..aef41d9acd83 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5250,22 +5250,25 @@ static bool cpu_overutilized(int cpu) #ifdef CONFIG_SCHED_TUNE -static unsigned long -schedtune_margin(unsigned long signal, unsigned long boost) +static long +schedtune_margin(unsigned long signal, long boost) { - unsigned long long margin = 0; + long long margin = 0; /* * Signal proportional compensation (SPC) * * The Boost (B) value is used to compute a Margin (M) which is * proportional to the complement of the original Signal (S): - * M = B * (SCHED_LOAD_SCALE - S) + * M = B * (SCHED_LOAD_SCALE - S), if B is positive + * M = B * S, if B is negative * The obtained M could be used by the caller to "boost" S. 
*/ - margin = SCHED_LOAD_SCALE - signal; - margin *= boost; - + if (boost >= 0) { + margin = SCHED_LOAD_SCALE - signal; + margin *= boost; + } else + margin = -signal * boost; /* * Fast integer division by constant: * Constant : (C) = 100 @@ -5281,13 +5284,15 @@ schedtune_margin(unsigned long signal, unsigned long boost) margin *= 1311; margin >>= 17; + if (boost < 0) + margin *= -1; return margin; } -static inline unsigned int +static inline int schedtune_cpu_margin(unsigned long util, int cpu) { - unsigned int boost; + int boost; #ifdef CONFIG_CGROUP_SCHEDTUNE boost = schedtune_cpu_boost(cpu); @@ -5300,12 +5305,12 @@ schedtune_cpu_margin(unsigned long util, int cpu) return schedtune_margin(util, boost); } -static inline unsigned long +static inline long schedtune_task_margin(struct task_struct *task) { - unsigned int boost; + int boost; unsigned long util; - unsigned long margin; + long margin; #ifdef CONFIG_CGROUP_SCHEDTUNE boost = schedtune_task_boost(task); @@ -5323,13 +5328,13 @@ schedtune_task_margin(struct task_struct *task) #else /* CONFIG_SCHED_TUNE */ -static inline unsigned int +static inline int schedtune_cpu_margin(unsigned long util, int cpu) { return 0; } -static inline unsigned int +static inline int schedtune_task_margin(struct task_struct *task) { return 0; @@ -5341,7 +5346,7 @@ static inline unsigned long boosted_cpu_util(int cpu) { unsigned long util = cpu_util(cpu); - unsigned long margin = schedtune_cpu_margin(util, cpu); + long margin = schedtune_cpu_margin(util, cpu); trace_sched_boost_cpu(cpu, util, margin); @@ -5352,7 +5357,7 @@ static inline unsigned long boosted_task_util(struct task_struct *task) { unsigned long util = task_util(task); - unsigned long margin = schedtune_task_margin(task); + long margin = schedtune_task_margin(task); trace_sched_boost_task(task, util, margin); diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 8ca8db2de818..afc4a7747161 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -213,10 +213,11 @@ static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { */ struct boost_groups { /* Maximum boost value for all RUNNABLE tasks on a CPU */ - unsigned boost_max; + bool idle; + int boost_max; struct { /* The boost for tasks on that boost group */ - unsigned boost; + int boost; /* Count of RUNNABLE tasks on that boost group */ unsigned tasks; } group[BOOSTGROUPS_COUNT]; @@ -229,7 +230,7 @@ static void schedtune_cpu_update(int cpu) { struct boost_groups *bg; - unsigned boost_max; + int boost_max; int idx; bg = &per_cpu(cpu_boost_groups, cpu); @@ -243,9 +244,13 @@ schedtune_cpu_update(int cpu) */ if (bg->group[idx].tasks == 0) continue; + boost_max = max(boost_max, bg->group[idx].boost); } - + /* Ensures boost_max is non-negative when all cgroup boost values + * are neagtive. 
Avoids under-accounting of cpu capacity which may cause + * task stacking and frequency spikes.*/ + boost_max = max(boost_max, 0); bg->boost_max = boost_max; } @@ -391,7 +396,7 @@ int schedtune_task_boost(struct task_struct *p) return task_boost; } -static u64 +static s64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct schedtune *st = css_st(css); @@ -401,11 +406,13 @@ boost_read(struct cgroup_subsys_state *css, struct cftype *cft) static int boost_write(struct cgroup_subsys_state *css, struct cftype *cft, - u64 boost) + s64 boost) { struct schedtune *st = css_st(css); + unsigned threshold_idx; + int boost_pct; - if (boost < 0 || boost > 100) + if (boost < -100 || boost > 100) return -EINVAL; st->boost = boost; @@ -423,8 +430,8 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, static struct cftype files[] = { { .name = "boost", - .read_u64 = boost_read, - .write_u64 = boost_write, + .read_s64 = boost_read, + .write_s64 = boost_write, }, { } /* terminate */ }; From 254f5095a8143f0b852c5db7672957cf3b4d65d3 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 17:38:25 +0100 Subject: [PATCH 0212/3220] FIXUP: sched/tune: fix payoff calculation for boost region The definition of the acceptance regions as well as the translation of these regions into a payoff value was both wrong which turned out in: a) a wrong definition of payoff for the performance boost region b) a correct "by chance" definition of the payoff for the performance constraint region (i.e. two sign errors together fixing the formula) This patch provides a better description of the cut regions as well as a fixed version of the payoff computations, which are now reduced to a single formula usable for both cases. Reported-by: Leo Yan Reviewed-by: Leo Yan Signed-off-by: Leo Yan Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 79 +++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index afc4a7747161..6d5fbde9c70e 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -51,50 +51,51 @@ __schedtune_accept_deltas(int nrg_delta, int cap_delta, int perf_boost_idx, int perf_constrain_idx) { int payoff = -INT_MAX; + int gain_idx = -1; /* Performance Boost (B) region */ - if (nrg_delta > 0 && cap_delta > 0) { - /* - * Evaluate "Performance Boost" vs "Energy Increase" - * payoff criteria: - * cap_delta / nrg_delta < cap_gain / nrg_gain - * which is: - * nrg_delta * cap_gain > cap_delta * nrg_gain - */ - payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain; - payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain; - - trace_sched_tune_filter( - nrg_delta, cap_delta, - threshold_gains[perf_boost_idx].nrg_gain, - threshold_gains[perf_boost_idx].cap_gain, - payoff, 8); - - return payoff; - } - + if (nrg_delta >= 0 && cap_delta > 0) + gain_idx = perf_boost_idx; /* Performance Constraint (C) region */ - if (nrg_delta < 0 && cap_delta < 0) { - /* - * Evaluate "Performance Boost" vs "Energy Increase" - * payoff criteria: - * cap_delta / nrg_delta > cap_gain / nrg_gain - * which is: - * cap_delta * nrg_gain > nrg_delta * cap_gain - */ - payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain; - payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain; - - trace_sched_tune_filter( - nrg_delta, cap_delta, - threshold_gains[perf_constrain_idx].nrg_gain, - threshold_gains[perf_constrain_idx].cap_gain, - payoff, 6); - - return payoff; - } + else if 
(nrg_delta < 0 && cap_delta <= 0) + gain_idx = perf_constrain_idx; /* Default: reject schedule candidate */ + if (gain_idx == -1) + return payoff; + + /* + * Evaluate "Performance Boost" vs "Energy Increase" + * + * - Performance Boost (B) region + * + * Condition: nrg_delta > 0 && cap_delta > 0 + * Payoff criteria: + * cap_gain / nrg_gain < cap_delta / nrg_delta = + * cap_gain * nrg_delta < cap_delta * nrg_gain + * Note that since both nrg_gain and nrg_delta are positive, the + * inequality does not change. Thus: + * + * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta) + * + * - Performance Constraint (C) region + * + * Condition: nrg_delta < 0 && cap_delta < 0 + * payoff criteria: + * cap_gain / nrg_gain > cap_delta / nrg_delta = + * cap_gain * nrg_delta < cap_delta * nrg_gain + * Note that since nrg_gain > 0 while nrg_delta < 0, the + * inequality change. Thus: + * + * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta) + * + * This means that, in case of same positive defined {cap,nrg}_gain + * for both the B and C regions, we can use the same payoff formula + * where a positive value represents the accept condition. + */ + payoff = cap_delta * threshold_gains[gain_idx].nrg_gain; + payoff -= nrg_delta * threshold_gains[gain_idx].cap_gain; + return payoff; } From 274bbcfbe44127c5575ed847d48d201a66cc29ce Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 17:42:36 +0100 Subject: [PATCH 0213/3220] sched/{fair,tune}: simplify fair.c code The usage of conditional compiled code is discouraged in fair.c. This patch clean up a bit fair.c by moving schedtune_{cpu.task}_boost definitions into tune.h. Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 21 +++------------------ kernel/sched/tune.h | 6 ++++++ 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aef41d9acd83..6390b4cb916c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5077,18 +5077,13 @@ normalize_energy(int energy_diff) static inline int energy_diff(struct energy_env *eenv) { - unsigned int boost; + int boost = schedtune_task_boost(eenv->task); int nrg_delta; /* Conpute "absolute" energy diff */ __energy_diff(eenv); /* Return energy diff when boost margin is 0 */ -#ifdef CONFIG_CGROUP_SCHEDTUNE - boost = schedtune_task_boost(eenv->task); -#else - boost = get_sysctl_sched_cfs_boost(); -#endif if (boost == 0) return eenv->nrg.diff; @@ -5292,13 +5287,8 @@ schedtune_margin(unsigned long signal, long boost) static inline int schedtune_cpu_margin(unsigned long util, int cpu) { - int boost; + int boost = schedtune_cpu_boost(cpu); -#ifdef CONFIG_CGROUP_SCHEDTUNE - boost = schedtune_cpu_boost(cpu); -#else - boost = get_sysctl_sched_cfs_boost(); -#endif if (boost == 0) return 0; @@ -5308,15 +5298,10 @@ schedtune_cpu_margin(unsigned long util, int cpu) static inline long schedtune_task_margin(struct task_struct *task) { - int boost; + int boost = schedtune_task_boost(task); unsigned long util; long margin; -#ifdef CONFIG_CGROUP_SCHEDTUNE - boost = schedtune_task_boost(task); -#else - boost = get_sysctl_sched_cfs_boost(); -#endif if (boost == 0) return 0; diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 7d2aa7951554..99637758a8af 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -22,6 +22,9 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #else /* CONFIG_CGROUP_SCHEDTUNE */ +#define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost() +#define schedtune_task_boost(tsk) 
get_sysctl_sched_cfs_boost() + #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) @@ -33,6 +36,9 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta, #else /* CONFIG_SCHED_TUNE */ +#define schedtune_cpu_boost(cpu) 0 +#define schedtune_task_boost(tsk) 0 + #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) From 7f8f24a0eafe312899818cc07af8e9f1b33d9c39 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 29 Jul 2016 15:19:41 +0100 Subject: [PATCH 0214/3220] sched/tune: use a single initialisation function With the introduction of initialization function required to compute the energy normalization constants from DTB at boot time, we have now a late_initcall which is already used by SchedTune. This patch consolidate within that function the other initialization bits which was previously deferred to the first CGroup creation. Signed-off-by: Patrick Bellasi [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- kernel/sched/tune.c | 50 +++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 6d5fbde9c70e..a691b8db2888 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -410,8 +410,6 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, s64 boost) { struct schedtune *st = css_st(css); - unsigned threshold_idx; - int boost_pct; if (boost < -100 || boost > 100) return -EINVAL; @@ -456,33 +454,14 @@ schedtune_boostgroup_init(struct schedtune *st) return 0; } -static int -schedtune_init(void) -{ - struct boost_groups *bg; - int cpu; - - /* Initialize the per CPU boost groups */ - for_each_possible_cpu(cpu) { - bg = &per_cpu(cpu_boost_groups, cpu); - memset(bg, 0, sizeof(struct boost_groups)); - } - - pr_info(" schedtune configured to support %d boost groups\n", - BOOSTGROUPS_COUNT); - return 0; -} - static struct cgroup_subsys_state * schedtune_css_alloc(struct cgroup_subsys_state *parent_css) { struct schedtune *st; int idx; - if (!parent_css) { - schedtune_init(); + if (!parent_css) return &root_schedtune.css; - } /* Allow only single level hierachies */ if (parent_css != &root_schedtune.css) { @@ -543,6 +522,22 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .early_init = 1, }; +static inline void +schedtune_init_cgroups(void) +{ + struct boost_groups *bg; + int cpu; + + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + memset(bg, 0, sizeof(struct boost_groups)); + } + + pr_info("schedtune: configured to support %d boost groups\n", + BOOSTGROUPS_COUNT); +} + #else /* CONFIG_CGROUP_SCHEDTUNE */ int @@ -690,7 +685,7 @@ schedtune_add_cluster_nrg( * that bind the EM to the topology information. 
*/ static int -schedtune_init_late(void) +schedtune_init(void) { struct target_nrg *ste = &schedtune_target_nrg; unsigned long delta_pwr = 0; @@ -730,10 +725,17 @@ schedtune_init_late(void) ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2); schedtune_test_nrg(delta_pwr); + +#ifdef CONFIG_CGROUP_SCHEDTUNE + schedtune_init_cgroups(); +#else + pr_info("schedtune: configured to support global boosting only\n"); +#endif + return 0; nodata: rcu_read_unlock(); return -EINVAL; } -late_initcall(schedtune_init_late); +late_initcall(schedtune_init); From af14760e19ebce67e8913fc0efd32cf839282a95 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 18:44:40 +0100 Subject: [PATCH 0215/3220] FIXUP: sched/tune: fix accounting for runnable tasks Contains: sched/tune: fix accounting for runnable tasks (1/5) The accounting for tasks into boost groups of different CPUs is currently broken mainly because: a) we do not properly track the change of boost group of a RUNNABLE task b) there are race conditions between migration code and accounting code This patch provides fixes to ensure enqueue/dequeue accounting also for throttled tasks. Without this patch it can happen that a task is enqueued into a throttled RQ and is thus not accounted for in the boosting of the corresponding RQ. We could argue that a throttled task should not boost a CPU, however: a) properly implementing CPU boosting while considering throttled tasks would significantly increase the complexity of the solution b) it's not easy to quantify the benefits of such a more complex solution Since task throttling requires the use of the CFS bandwidth controller, which is not widely used on mobile systems (at least not by Android kernels so far), for the time being we go for the simple solution and boost also for throttled RQs. sched/tune: fix accounting for runnable tasks (2/5) This patch provides the code required to enforce proper locking. A per boost group spinlock has been added to guarantee atomic accounting of tasks as well as to serialise enqueue/dequeue operations, triggered by task migrations, with the cgroup attach/detach operations. sched/tune: fix accounting for runnable tasks (3/5) This patch adds the cgroup {allow,can,cancel}_attach callbacks. Since a task can be migrated between boost groups while it's running, the cgroup attach callbacks have been added to properly migrate boost contributions of RUNNABLE tasks. The RQ's lock is used to serialise enqueue/dequeue operations, triggered by task migrations, with the cgroup attach/detach operations, while the SchedTune per-CPU lock is used to guarantee atomicity of the accounting within the CPU. NOTE: the current implementation does not allow a concurrent CPU migration and cgroup change. sched/tune: fix accounting for runnable tasks (4/5) This fixes accounting for exiting tasks by adding a dedicated call early in the do_exit() syscall, which disables SchedTune accounting as soon as a task is flagged PF_EXITING. This flag is set before the multiple dequeue/enqueue dance triggered by cgroup_exit(), which only injects pointless task movements and thus increases the chances of race conditions with the migration code. The schedtune_exit_task() call does the last dequeue of a task from its current boost group. This is a solution more aligned with what happens in mainline kernels (>v4.4), where the cgroup exit path no longer moves a dying task to the root control group.
sched/tune: fix accounting for runnable tasks (5/5) To avoid accounting issues at startup, this patch disable the SchedTune accounting until the required data structures have been properly initialized. Signed-off-by: Patrick Bellasi [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- kernel/exit.c | 5 ++ kernel/sched/core.c | 12 +++ kernel/sched/fair.c | 11 ++- kernel/sched/sched.h | 3 + kernel/sched/tune.c | 182 ++++++++++++++++++++++++++++++++++++++----- kernel/sched/tune.h | 6 ++ 6 files changed, 195 insertions(+), 24 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 07110c6020a0..92ff63200287 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -54,6 +54,8 @@ #include #include +#include "sched/tune.h" + #include #include #include @@ -699,6 +701,9 @@ void do_exit(long code) } exit_signals(tsk); /* sets PF_EXITING */ + + schedtune_exit_task(tsk); + /* * tsk->flags are checked in the futex code to protect against * an exiting task cleaning up the robust pi futexes. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 42c45ccc1f62..f56a528c8ae7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -287,6 +287,18 @@ int sysctl_sched_rt_runtime = 950000; /* cpus with isolated domains */ cpumask_var_t cpu_isolated_map; +struct rq * +lock_rq_of(struct task_struct *p, unsigned long *flags) +{ + return task_rq_lock(p, flags); +} + +void +unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags) +{ + task_rq_unlock(rq, p, flags); +} + /* * this_rq_lock - lock this runqueue and disable interrupts. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6390b4cb916c..a26f40cb7fd0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4234,8 +4234,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; - schedtune_enqueue_task(p, cpu_of(rq)); - /* * We want to potentially trigger a freq switch * request only for tasks that are waking up; this is @@ -4246,6 +4244,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (task_new || task_wakeup) update_capacity_of(cpu_of(rq)); } + + /* Update SchedTune accouting */ + schedtune_enqueue_task(p, cpu_of(rq)); + #endif /* CONFIG_SMP */ hrtick_update(rq); @@ -4311,7 +4313,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP if (!se) { - schedtune_dequeue_task(p, cpu_of(rq)); /* * We want to potentially trigger a freq switch @@ -4329,6 +4330,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } } + /* Update SchedTune accouting */ + schedtune_dequeue_task(p, cpu_of(rq)); + #endif /* CONFIG_SMP */ hrtick_update(rq); @@ -5604,7 +5608,6 @@ static inline int find_best_target(struct task_struct *p) * The target CPU can be already at a capacity level higher * than the one required to boost the task. 
*/ - if (new_util > capacity_orig_of(i)) continue; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a06c19c48057..144d4782a455 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1701,6 +1701,9 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } +extern struct rq *lock_rq_of(struct task_struct *p, unsigned long *flags); +extern void unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags); + #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index a691b8db2888..4c77cc23e65b 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -11,6 +11,10 @@ #include "sched.h" #include "tune.h" +#ifdef CONFIG_CGROUP_SCHEDTUNE +static bool schedtune_initialized = false; +#endif + unsigned int sysctl_sched_cfs_boost __read_mostly; extern struct target_nrg schedtune_target_nrg; @@ -222,6 +226,8 @@ struct boost_groups { /* Count of RUNNABLE tasks on that boost group */ unsigned tasks; } group[BOOSTGROUPS_COUNT]; + /* CPU's boost group locking */ + raw_spinlock_t lock; }; /* Boost groups affecting each CPU in the system */ @@ -298,28 +304,24 @@ schedtune_boostgroup_update(int idx, int boost) return 0; } +#define ENQUEUE_TASK 1 +#define DEQUEUE_TASK -1 + static inline void schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) { - struct boost_groups *bg; - int tasks; - - bg = &per_cpu(cpu_boost_groups, cpu); + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + int tasks = bg->group[idx].tasks + task_count; /* Update boosted tasks count while avoiding to make it negative */ - if (task_count < 0 && bg->group[idx].tasks <= -task_count) - bg->group[idx].tasks = 0; - else - bg->group[idx].tasks += task_count; - - /* Boost group activation or deactivation on that RQ */ - tasks = bg->group[idx].tasks; - if (tasks == 1 || tasks == 0) - schedtune_cpu_update(cpu); + bg->group[idx].tasks = max(0, tasks); trace_sched_tune_tasks_update(p, cpu, tasks, idx, bg->group[idx].boost, bg->boost_max); + /* Boost group activation or deactivation on that RQ */ + if (tasks == 1 || tasks == 0) + schedtune_cpu_update(cpu); } /* @@ -327,9 +329,14 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) */ void schedtune_enqueue_task(struct task_struct *p, int cpu) { + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + unsigned long irq_flags; struct schedtune *st; int idx; + if (!unlikely(schedtune_initialized)) + return; + /* * When a task is marked PF_EXITING by do_exit() it's going to be * dequeued and enqueued multiple times in the exit path. @@ -339,13 +346,109 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu) if (p->flags & PF_EXITING) return; - /* Get task boost group */ + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions for example on + * do_exit()::cgroup_exit() and task migration. 
+ */ + raw_spin_lock_irqsave(&bg->lock, irq_flags); rcu_read_lock(); + st = task_schedtune(p); idx = st->idx; - rcu_read_unlock(); - schedtune_tasks_update(p, cpu, idx, 1); + schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK); + + rcu_read_unlock(); + raw_spin_unlock_irqrestore(&bg->lock, irq_flags); +} + +int schedtune_allow_attach(struct cgroup_taskset *tset) +{ + /* We always allows tasks to be moved between existing CGroups */ + return 0; +} + +int schedtune_can_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + struct boost_groups *bg; + unsigned long irq_flags; + unsigned int cpu; + struct rq *rq; + int src_bg; /* Source boost group index */ + int dst_bg; /* Destination boost group index */ + int tasks; + + if (!unlikely(schedtune_initialized)) + return 0; + + + cgroup_taskset_for_each(task, css, tset) { + + /* + * Lock the CPU's RQ the task is enqueued to avoid race + * conditions with migration code while the task is being + * accounted + */ + rq = lock_rq_of(task, &irq_flags); + + if (!task->on_rq) { + unlock_rq_of(rq, task, &irq_flags); + continue; + } + + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions on... + */ + cpu = cpu_of(rq); + bg = &per_cpu(cpu_boost_groups, cpu); + raw_spin_lock(&bg->lock); + + dst_bg = css_st(css)->idx; + src_bg = task_schedtune(task)->idx; + + /* + * Current task is not changing boostgroup, which can + * happen when the new hierarchy is in use. + */ + if (unlikely(dst_bg == src_bg)) { + raw_spin_unlock(&bg->lock); + unlock_rq_of(rq, task, &irq_flags); + continue; + } + + /* + * This is the case of a RUNNABLE task which is switching its + * current boost group. + */ + + /* Move task from src to dst boost group */ + tasks = bg->group[src_bg].tasks - 1; + bg->group[src_bg].tasks = max(0, tasks); + bg->group[dst_bg].tasks += 1; + + raw_spin_unlock(&bg->lock); + unlock_rq_of(rq, task, &irq_flags); + + /* Update CPU boost group */ + if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1) + schedtune_cpu_update(task_cpu(task)); + + } + + return 0; +} + +void schedtune_cancel_attach(struct cgroup_taskset *tset) +{ + /* This can happen only if SchedTune controller is mounted with + * other hierarchies ane one of them fails. Since usually SchedTune is + * mouted on its own hierarcy, for the time being we do not implement + * a proper rollback mechanism */ + WARN(1, "SchedTune cancel attach not implemented"); } /* @@ -353,26 +456,62 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu) */ void schedtune_dequeue_task(struct task_struct *p, int cpu) { + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + unsigned long irq_flags; struct schedtune *st; int idx; + if (!unlikely(schedtune_initialized)) + return; + /* * When a task is marked PF_EXITING by do_exit() it's going to be * dequeued and enqueued multiple times in the exit path. * Thus we avoid any further update, since we do not want to change * CPU boosting while the task is exiting. - * The last dequeue will be done by cgroup exit() callback. + * The last dequeue is already enforce by the do_exit() code path + * via schedtune_exit_task(). */ if (p->flags & PF_EXITING) return; - /* Get task boost group */ + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions on... 
+ */ + raw_spin_lock_irqsave(&bg->lock, irq_flags); rcu_read_lock(); + st = task_schedtune(p); idx = st->idx; - rcu_read_unlock(); - schedtune_tasks_update(p, cpu, idx, -1); + schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK); + + rcu_read_unlock(); + raw_spin_unlock_irqrestore(&bg->lock, irq_flags); +} + +void schedtune_exit_task(struct task_struct *tsk) +{ + struct schedtune *st; + unsigned long irq_flags; + unsigned int cpu; + struct rq *rq; + int idx; + + if (!unlikely(schedtune_initialized)) + return; + + rq = lock_rq_of(tsk, &irq_flags); + rcu_read_lock(); + + cpu = cpu_of(rq); + st = task_schedtune(tsk); + idx = st->idx; + schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK); + + rcu_read_unlock(); + unlock_rq_of(rq, tsk, &irq_flags); } int schedtune_cpu_boost(int cpu) @@ -518,6 +657,9 @@ schedtune_css_free(struct cgroup_subsys_state *css) struct cgroup_subsys schedtune_cgrp_subsys = { .css_alloc = schedtune_css_alloc, .css_free = schedtune_css_free, +// .allow_attach = schedtune_allow_attach, + .can_attach = schedtune_can_attach, + .cancel_attach = schedtune_cancel_attach, .legacy_cftypes = files, .early_init = 1, }; diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 99637758a8af..be1785eb1c5b 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -17,6 +17,8 @@ struct target_nrg { int schedtune_cpu_boost(int cpu); int schedtune_task_boost(struct task_struct *tsk); +void schedtune_exit_task(struct task_struct *tsk); + void schedtune_enqueue_task(struct task_struct *p, int cpu); void schedtune_dequeue_task(struct task_struct *p, int cpu); @@ -25,6 +27,8 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost() #define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost() +#define schedtune_exit_task(task) do { } while (0) + #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) @@ -39,6 +43,8 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta, #define schedtune_cpu_boost(cpu) 0 #define schedtune_task_boost(tsk) 0 +#define schedtune_exit_task(task) do { } while (0) + #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) From 369bcbbae4407a5b50f472a0fa3d569b9a832081 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 29 Jul 2016 14:41:25 +0100 Subject: [PATCH 0216/3220] sched/fair: optimize idle cpu selection for boosted tasks find_best_target CPU selection is biased towards lower CPU IDs. Bias towards higher CPUs for boosted tasks. For boosted tasks unconditionally use the idle CPU returned by find_best_target. 
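[editor's note: the CPU-id bias introduced here reduces to reversing the scan order when the task is boosted, as the "int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;" line in the hunk below shows. A minimal standalone sketch of just that index mapping; candidate_cpu() is a hypothetical name and NR_CPUS is fixed to 4 for the demo.]

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

/* Map a loop index to a candidate CPU id; boosted tasks scan downwards */
static int candidate_cpu(int iter_cpu, bool boosted)
{
	return boosted ? NR_CPUS - iter_cpu - 1 : iter_cpu;
}

int main(void)
{
	int iter;

	printf("normal :");
	for (iter = 0; iter < NR_CPUS; iter++)
		printf(" %d", candidate_cpu(iter, false));	/* 0 1 2 3 */
	printf("\nboosted:");
	for (iter = 0; iter < NR_CPUS; iter++)
		printf(" %d", candidate_cpu(iter, true));	/* 3 2 1 0 */
	printf("\n");
	return 0;
}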
BUG: 29512132 Change-Id: I3d650051752163fcf3dc7909751d1fde3f9d17c0 Conflicts: kernel/sched/fair.c --- kernel/sched/fair.c | 70 +++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a26f40cb7fd0..6efdcad51603 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5569,32 +5569,30 @@ done: return target; } -static inline int find_best_target(struct task_struct *p) +static inline int find_best_target(struct task_struct *p, bool boosted) { - int i, boosted; + int iter_cpu; int target_cpu = -1; int target_capacity = 0; int backup_capacity = 0; - int idle_cpu = -1; + int best_idle_cpu = -1; int best_idle_cstate = INT_MAX; int backup_cpu = -1; unsigned long task_util_boosted, new_util; - /* - * Favor 1) busy cpu with most capacity at current OPP - * 2) idle_cpu with capacity at current OPP - * 3) busy cpu with capacity at higher OPP - */ -#ifdef CONFIG_CGROUP_SCHEDTUNE - boosted = schedtune_task_boost(p); -#else - boosted = 0; -#endif task_util_boosted = boosted_task_util(p); - for_each_cpu(i, tsk_cpus_allowed(p)) { - int cur_capacity = capacity_curr_of(i); - struct rq *rq = cpu_rq(i); - int idle_idx = idle_get_state_idx(rq); + for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) { + int cur_capacity; + struct rq *rq; + int idle_idx; + + /* + * favor higher cpus for boosted tasks + */ + int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu; + + if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) + continue; /* * p's blocked utilization is still accounted for on prev_cpu @@ -5615,46 +5613,43 @@ static inline int find_best_target(struct task_struct *p) * For boosted tasks we favor idle cpus unconditionally to * improve latency. */ - if (idle_idx >= 0 && boosted) { - if (idle_cpu < 0 || - (sysctl_sched_cstate_aware && - best_idle_cstate > idle_idx)) { - best_idle_cstate = idle_idx; - idle_cpu = i; - } + if (idle_cpu(i) && boosted) { + if (best_idle_cpu < 0) + best_idle_cpu = i; continue; } + cur_capacity = capacity_curr_of(i); + rq = cpu_rq(i); + idle_idx = idle_get_state_idx(rq); + if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { if (target_capacity == 0 || target_capacity > cur_capacity) { - /* busy CPU with most capacity at current OPP */ target_cpu = i; target_capacity = cur_capacity; } } else if (!boosted) { - if (idle_cpu < 0 || + if (best_idle_cpu < 0 || (sysctl_sched_cstate_aware && best_idle_cstate > idle_idx)) { best_idle_cstate = idle_idx; - idle_cpu = i; + best_idle_cpu = i; } } } else if (backup_capacity == 0 || backup_capacity > cur_capacity) { - /* first busy CPU with capacity at higher OPP */ backup_capacity = cur_capacity; backup_cpu = i; } } - if (!boosted && target_cpu < 0) { - target_cpu = idle_cpu >= 0 ? idle_cpu : backup_cpu; - } + if (boosted && best_idle_cpu >= 0) + target_cpu = best_idle_cpu; + else if (target_cpu < 0) + target_cpu = best_idle_cpu >= 0 ? 
best_idle_cpu : backup_cpu; - if (boosted && idle_cpu >= 0) - target_cpu = idle_cpu; return target_cpu; } @@ -5740,9 +5735,16 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) /* * Find a cpu with sufficient capacity */ - int tmp_target = find_best_target(p); +#ifdef CONFIG_CGROUP_SCHEDTUNE + bool boosted = schedtune_task_boost(p) > 0; +#else + bool boosted = 0; +#endif + int tmp_target = find_best_target(p, boosted); if (tmp_target >= 0) target_cpu = tmp_target; + if (boosted && idle_cpu(target_cpu)) + return target_cpu; } if (target_cpu != task_cpu(p)) { From 345be81dba1ba6c7ac1df2b9ba355fc290d5a73b Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 29 Jul 2016 15:32:26 +0100 Subject: [PATCH 0217/3220] sched/tune: fix PB and PC cuts indexes definition The current definition of the Performance Boost (PB) and Performance Constraint (PC) regions has two main issues: 1) in the computation of the boost index we overflow the threshold_gains table for boost=100 2) the two cuts did _NOT_ have the same ratio The last point means that when boost=0 we do _not_ get the "standard" EAS behaviour, i.e. accepting all candidates which decrease energy regardless of their impact on performance. Instead, we accept only schedule candidates which are in the Optimal region, i.e. decrease energy while increasing performance. This behaviour can also have a negative impact on CPU selection policies which try to spread tasks to reduce latencies. Indeed, for example, we could end up rejecting a schedule candidate which wants to move a task from a congested CPU to an idle one, specifically in the case where the target CPU would be running at a lower OPP. This patch fixes these two issues by properly clamping the boost value to the appropriate range when computing the threshold indexes, as well as by using the same threshold index for both cuts. Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan sched/tune: fix update of threshold index for boost groups When SchedTune is configured to work in CGroup mode, each time we update the boost value of a group we do not update the threshold indexes for the definition of the Performance Boost (PB) and Performance Constraint (PC) regions. This means that while the OPP boosting and CPU biasing selection work as expected, the __schedtune_accept_deltas function is always using the initial values for these cuts. This patch ensures that each time a new boost value is configured for a boost group, the cuts for the PB and PC regions are properly updated too. Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan sched/tune: update PC and PB cuts definition The current definition of Performance Boost (PB) and Performance Constraint (PC) cuts defines two "dead regions": - up to 20% boost: we are in energy-reduction only mode, i.e. accept all candidates which reduce energy - over 70% boost: we are in performance-increase only mode, i.e. accept only sched candidates which do not reduce performance This patch uses a finer-grained configuration where these two "dead regions" are reduced to: up to 10% and over 90%. This should provide some boosting benefit starting from 10% boost values, while not being too permissive starting from boost values of 80%.
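[editor's note: both fixes reduce to one clamped index computation shared by the PB and PC cuts. A standalone sketch, assuming the 10-entry threshold_gains[] table from the patch; clamp_int() stands in for the kernel's clamp() and threshold_index() is a hypothetical name.]

#include <stdio.h>

/* Minimal stand-in for the kernel's clamp() */
static int clamp_int(int val, int lo, int hi)
{
	return val < lo ? lo : (val > hi ? hi : val);
}

/*
 * One index for both the PB and PC cuts: clamping to 99 keeps
 * boost=100 inside a 10-entry table (issue 1), and sharing the
 * index gives both cuts the same ratio (issue 2).
 */
static unsigned int threshold_index(int boost_pct)
{
	return clamp_int(boost_pct, 0, 99) / 10;
}

int main(void)
{
	printf("%u %u %u\n",
	       threshold_index(100),	/* 9: no longer overflows */
	       threshold_index(0),	/* 0 */
	       threshold_index(-50));	/* 0: negative boosts share bucket 0 */
	return 0;
}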
Suggested-by: Leo Yan Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan bug: 28312446 Change-Id: Ia326c66521e38c98e7a7eddbbb7c437875efa1ba Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 58 ++++++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 4c77cc23e65b..d24f365b0c90 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -38,16 +38,16 @@ struct threshold_params { */ static struct threshold_params threshold_gains[] = { - { 0, 4 }, /* >= 0% */ - { 0, 4 }, /* >= 10% */ - { 1, 4 }, /* >= 20% */ - { 2, 4 }, /* >= 30% */ - { 3, 4 }, /* >= 40% */ - { 4, 3 }, /* >= 50% */ - { 4, 2 }, /* >= 60% */ - { 4, 1 }, /* >= 70% */ - { 4, 0 }, /* >= 80% */ - { 4, 0 } /* >= 90% */ + { 0, 5 }, /* < 10% */ + { 1, 5 }, /* < 20% */ + { 2, 5 }, /* < 30% */ + { 3, 5 }, /* < 40% */ + { 4, 5 }, /* < 50% */ + { 5, 4 }, /* < 60% */ + { 5, 3 }, /* < 70% */ + { 5, 2 }, /* < 80% */ + { 5, 1 }, /* < 90% */ + { 5, 0 } /* <= 100% */ }; static int @@ -549,13 +549,29 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, s64 boost) { struct schedtune *st = css_st(css); + unsigned threshold_idx; + int boost_pct; if (boost < -100 || boost > 100) return -EINVAL; + boost_pct = boost; + + /* + * Update threshold params for Performance Boost (B) + * and Performance Constraint (C) regions. + * The current implementation uses the same cuts for both + * B and C regions. + */ + threshold_idx = clamp(boost_pct, 0, 99) / 10; + st->perf_boost_idx = threshold_idx; + st->perf_constrain_idx = threshold_idx; st->boost = boost; - if (css == &root_schedtune.css) + if (css == &root_schedtune.css) { sysctl_sched_cfs_boost = boost; + perf_boost_idx = threshold_idx; + perf_constrain_idx = threshold_idx; + } /* Update CPU boost */ schedtune_boostgroup_update(st->idx, st->boost); @@ -710,17 +726,25 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + unsigned threshold_idx; + int boost_pct; if (ret || !write) return ret; - /* Performance Boost (B) region threshold params */ - perf_boost_idx = sysctl_sched_cfs_boost; - perf_boost_idx /= 10; + if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100) return -EINVAL; + boost_pct = sysctl_sched_cfs_boost; - /* Performance Constraint (C) region threshold params */ - perf_constrain_idx = 100 - sysctl_sched_cfs_boost; - perf_constrain_idx /= 10; + /* + * Update threshold params for Performance Boost (B) + * and Performance Constraint (C) regions. + * The current implementation uses the same cuts for both + * B and C regions. + */ + threshold_idx = clamp(boost_pct, 0, 99) / 10; + perf_boost_idx = threshold_idx; + perf_constrain_idx = threshold_idx; return 0; } From efb86bd08a2e9217d0b3c33753cf63d27e7c86da Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Tue, 31 May 2016 09:08:38 -0700 Subject: [PATCH 0218/3220] sched: Introduce Window Assisted Load Tracking (WALT) Use a window-based view of time in order to track task demand and CPU utilization in the scheduler.
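[editor's note: the window-based demand signal can be pictured with a short standalone sketch. Per the ravg comments added by this patch, a task keeps the frequency-scaled sums of its last few windows, and 'demand' is the maximum sum seen over them. The real walt.c update policy is tunable and ignores windows where the task slept entirely, which this sketch omits; names mirror the patch where possible, but the code is illustrative only.]

#include <stdio.h>

#define RAVG_HIST_SIZE_MAX 5

struct ravg_sketch {
	unsigned int sum_history[RAVG_HIST_SIZE_MAX];
	unsigned int demand;
};

/* Roll a completed window's frequency-scaled busy sum into the history */
static void update_history(struct ravg_sketch *r, unsigned int sum)
{
	unsigned int max = 0;
	int i;

	for (i = RAVG_HIST_SIZE_MAX - 1; i > 0; i--)
		r->sum_history[i] = r->sum_history[i - 1];
	r->sum_history[0] = sum;

	/* 'demand' tracks the maximum sum over the recent windows */
	for (i = 0; i < RAVG_HIST_SIZE_MAX; i++)
		if (r->sum_history[i] > max)
			max = r->sum_history[i];
	r->demand = max;
}

int main(void)
{
	struct ravg_sketch r = { {0}, 0 };
	unsigned int windows[] = { 900, 100, 200, 150, 120, 130, 110 };
	int i;

	for (i = 0; i < 7; i++)
		update_history(&r, windows[i]);

	/* The 900 spike has aged out of the 5-window history: demand = 200 */
	printf("demand = %u\n", r.demand);
	return 0;
}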
Window Assisted Load Tracking (WALT) implementation credits: Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park, Pavan Kumar Kondeti, Olav Haugan 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla and Todd Kjos Change-Id: I21408236836625d4e7d7de1843d20ed5ff36c708 Includes fixes for issues: eas/walt: Use walt_ktime_clock() instead of ktime_get_ns() to avoid a race resulting in watchdog resets BUG: 29353986 Change-Id: Ic1820e22a136f7c7ebd6f42e15f14d470f6bbbdb Handle walt accounting anomoly during resume During resume, there is a corner case where on wakeup, a task's prev_runnable_sum can go negative. This is a workaround that fixes the condition and warns (instead of crashing). BUG: 29464099 Change-Id: I173e7874324b31a3584435530281708145773508 Signed-off-by: Todd Kjos Signed-off-by: Srinath Sridharan Signed-off-by: Juri Lelli [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- include/linux/sched.h | 53 ++ include/linux/sched/sysctl.h | 5 + include/trace/events/sched.h | 149 +++++ init/Kconfig | 9 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 43 +- kernel/sched/fair.c | 20 + kernel/sched/rt.c | 4 + kernel/sched/sched.h | 34 ++ kernel/sched/stop_task.c | 3 + kernel/sched/walt.c | 1098 ++++++++++++++++++++++++++++++++++ kernel/sched/walt.h | 57 ++ kernel/sysctl.c | 23 + 13 files changed, 1498 insertions(+), 1 deletion(-) create mode 100644 kernel/sched/walt.c create mode 100644 kernel/sched/walt.h diff --git a/include/linux/sched.h b/include/linux/sched.h index ee59abcab30d..ce8db12f9e72 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -317,6 +317,15 @@ extern char ___assert_task_state[1 - 2*!!( /* Task command name length */ #define TASK_COMM_LEN 16 +enum task_event { + PUT_PREV_TASK = 0, + PICK_NEXT_TASK = 1, + TASK_WAKE = 2, + TASK_MIGRATE = 3, + TASK_UPDATE = 4, + IRQ_UPDATE = 5, +}; + #include /* @@ -1274,6 +1283,41 @@ struct sched_statistics { }; #endif +#ifdef CONFIG_SCHED_WALT +#define RAVG_HIST_SIZE_MAX 5 + +/* ravg represents frequency scaled cpu-demand of tasks */ +struct ravg { + /* + * 'mark_start' marks the beginning of an event (task waking up, task + * starting to execute, task being preempted) within a window + * + * 'sum' represents how runnable a task has been within current + * window. It incorporates both running time and wait time and is + * frequency scaled. + * + * 'sum_history' keeps track of history of 'sum' seen over previous + * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are + * ignored. + * + * 'demand' represents maximum sum seen over previous + * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency + * demand for tasks. 
+ * + * 'curr_window' represents task's contribution to cpu busy time + * statistics (rq->curr_runnable_sum) in current window + * + * 'prev_window' represents task's contribution to cpu busy time + * statistics (rq->prev_runnable_sum) in previous window + */ + u64 mark_start; + u32 sum, demand; + u32 sum_history[RAVG_HIST_SIZE_MAX]; + u32 curr_window, prev_window; + u16 active_windows; +}; +#endif + struct sched_entity { struct load_weight load; /* for load-balancing */ struct rb_node run_node; @@ -1431,6 +1475,15 @@ struct task_struct { const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; +#ifdef CONFIG_SCHED_WALT + struct ravg ravg; + /* + * 'init_load_pct' represents the initial task load assigned to children + * of this task + */ + u32 init_load_pct; +#endif + #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 2834841c507e..710f58a28d63 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -43,6 +43,11 @@ extern unsigned int sysctl_sched_is_big_little; extern unsigned int sysctl_sched_sync_hint_enable; extern unsigned int sysctl_sched_initial_task_util; extern unsigned int sysctl_sched_cstate_aware; +#ifdef CONFIG_SCHED_WALT +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int sysctl_sched_use_walt_task_util; +extern unsigned int sysctl_sched_walt_init_task_load_pct; +#endif enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 7dc39dd384de..33b1c719c9d7 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -888,6 +888,155 @@ TRACE_EVENT(sched_tune_filter, __entry->payoff, __entry->region) ); +#ifdef CONFIG_SCHED_WALT +struct rq; + +TRACE_EVENT(walt_update_task_ravg, + + TP_PROTO(struct task_struct *p, struct rq *rq, int evt, + u64 wallclock, u64 irqtime), + + TP_ARGS(p, rq, evt, wallclock, irqtime), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( pid_t, cur_pid ) + __field(unsigned int, cur_freq ) + __field( u64, wallclock ) + __field( u64, mark_start ) + __field( u64, delta_m ) + __field( u64, win_start ) + __field( u64, delta ) + __field( u64, irqtime ) + __field( int, evt ) + __field(unsigned int, demand ) + __field(unsigned int, sum ) + __field( int, cpu ) + __field( u64, cs ) + __field( u64, ps ) + __field( u32, curr_window ) + __field( u32, prev_window ) + __field( u64, nt_cs ) + __field( u64, nt_ps ) + __field( u32, active_windows ) + ), + + TP_fast_assign( + __entry->wallclock = wallclock; + __entry->win_start = rq->window_start; + __entry->delta = (wallclock - rq->window_start); + __entry->evt = evt; + __entry->cpu = rq->cpu; + __entry->cur_pid = rq->curr->pid; + __entry->cur_freq = rq->cur_freq; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->mark_start = p->ravg.mark_start; + __entry->delta_m = (wallclock - p->ravg.mark_start); + __entry->demand = p->ravg.demand; + __entry->sum = p->ravg.sum; + __entry->irqtime = irqtime; + __entry->cs = rq->curr_runnable_sum; + __entry->ps = rq->prev_runnable_sum; + __entry->curr_window = p->ravg.curr_window; + __entry->prev_window = p->ravg.prev_window; + __entry->nt_cs = rq->nt_curr_runnable_sum; + __entry->nt_ps = rq->nt_prev_runnable_sum; + __entry->active_windows = p->ravg.active_windows; + ), + + TP_printk("wc %llu ws %llu delta %llu event %d cpu %d cur_freq %u 
cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu" + " cs %llu ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u" + , __entry->wallclock, __entry->win_start, __entry->delta, + __entry->evt, __entry->cpu, + __entry->cur_freq, __entry->cur_pid, + __entry->pid, __entry->comm, __entry->mark_start, + __entry->delta_m, __entry->demand, + __entry->sum, __entry->irqtime, + __entry->cs, __entry->ps, + __entry->curr_window, __entry->prev_window, + __entry->nt_cs, __entry->nt_ps, + __entry->active_windows + ) +); + +TRACE_EVENT(walt_update_history, + + TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples, + int evt), + + TP_ARGS(rq, p, runtime, samples, evt), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field(unsigned int, runtime ) + __field( int, samples ) + __field( int, evt ) + __field( u64, demand ) + __field(unsigned int, walt_avg ) + __field(unsigned int, pelt_avg ) + __array( u32, hist, RAVG_HIST_SIZE_MAX) + __field( int, cpu ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->runtime = runtime; + __entry->samples = samples; + __entry->evt = evt; + __entry->demand = p->ravg.demand; + __entry->walt_avg = (__entry->demand << 10) / walt_ravg_window, + __entry->pelt_avg = p->se.avg.util_avg; + memcpy(__entry->hist, p->ravg.sum_history, + RAVG_HIST_SIZE_MAX * sizeof(u32)); + __entry->cpu = rq->cpu; + ), + + TP_printk("%d (%s): runtime %u samples %d event %d demand %llu" + " walt %u pelt %u (hist: %u %u %u %u %u) cpu %d", + __entry->pid, __entry->comm, + __entry->runtime, __entry->samples, __entry->evt, + __entry->demand, + __entry->walt_avg, + __entry->pelt_avg, + __entry->hist[0], __entry->hist[1], + __entry->hist[2], __entry->hist[3], + __entry->hist[4], __entry->cpu) +); + +TRACE_EVENT(walt_migration_update_sum, + + TP_PROTO(struct rq *rq, struct task_struct *p), + + TP_ARGS(rq, p), + + TP_STRUCT__entry( + __field(int, cpu ) + __field(int, pid ) + __field( u64, cs ) + __field( u64, ps ) + __field( s64, nt_cs ) + __field( s64, nt_ps ) + ), + + TP_fast_assign( + __entry->cpu = cpu_of(rq); + __entry->cs = rq->curr_runnable_sum; + __entry->ps = rq->prev_runnable_sum; + __entry->nt_cs = (s64)rq->nt_curr_runnable_sum; + __entry->nt_ps = (s64)rq->nt_prev_runnable_sum; + __entry->pid = p->pid; + ), + + TP_printk("cpu %d: cs %llu ps %llu nt_cs %lld nt_ps %lld pid %d", + __entry->cpu, __entry->cs, __entry->ps, + __entry->nt_cs, __entry->nt_ps, __entry->pid) +); +#endif /* CONFIG_SCHED_WALT */ + #endif /* CONFIG_SMP */ #endif /* _TRACE_SCHED_H */ diff --git a/init/Kconfig b/init/Kconfig index facb2b587d5b..33f4e3965e3c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -392,6 +392,15 @@ config IRQ_TIME_ACCOUNTING endchoice +config SCHED_WALT + bool "Support window based load tracking" + depends on SMP + help + This feature will allow the scheduler to maintain a tunable window + based set of metrics for tasks and runqueues. These metrics can be + used to guide task placement as well as task frequency requirements + for cpufreq governors. 
+ config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 174762d8695b..623ce4bde0d5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -15,6 +15,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o +obj-$(CONFIG_SCHED_WALT) += walt.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f56a528c8ae7..b6f41850ed00 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -89,6 +89,7 @@ #define CREATE_TRACE_POINTS #include +#include "walt.h" DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -1085,7 +1086,9 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new dequeue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + double_lock_balance(rq, cpu_rq(new_cpu)); set_task_cpu(p, new_cpu); + double_unlock_balance(rq, cpu_rq(new_cpu)); raw_spin_unlock(&rq->lock); rq = cpu_rq(new_cpu); @@ -1309,6 +1312,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->sched_class->migrate_task_rq(p); p->se.nr_migrations++; perf_event_task_migrate(p); + + walt_fixup_busy_time(p, new_cpu); } __set_task_cpu(p, new_cpu); @@ -1937,6 +1942,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { unsigned long flags; int cpu, success = 0; +#ifdef CONFIG_SMP + struct rq *rq; + u64 wallclock; +#endif /* * If we are going to wake up a thread waiting for CONDITION we @@ -1994,6 +2003,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_rmb(); + rq = cpu_rq(task_cpu(p)); + + raw_spin_lock(&rq->lock); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + raw_spin_unlock(&rq->lock); + p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -2001,10 +2018,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_class->task_waking(p); cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); + if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); } + #endif /* CONFIG_SMP */ ttwu_queue(p, cpu); @@ -2053,8 +2072,13 @@ static void try_to_wake_up_local(struct task_struct *p) trace_sched_waking(p); - if (!task_on_rq_queued(p)) + if (!task_on_rq_queued(p)) { + u64 wallclock = walt_ktime_clock(); + + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); ttwu_activate(rq, p, ENQUEUE_WAKEUP); + } ttwu_do_wakeup(rq, p, 0); ttwu_stat(p, smp_processor_id(), 0); @@ -2120,6 +2144,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.nr_migrations = 0; p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); + walt_init_new_task_load(p); #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); @@ -2387,6 +2412,9 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); + + walt_init_new_task_load(p); + /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP @@ -2399,6 +2427,7 @@ void wake_up_new_task(struct task_struct *p) 
#endif rq = __task_rq_lock(p); + walt_mark_task_starting(p); activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@ -2948,9 +2977,12 @@ void scheduler_tick(void) sched_clock_tick(); raw_spin_lock(&rq->lock); + walt_set_window_start(rq); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); calc_global_load_tick(rq); sched_freq_tick(cpu); raw_spin_unlock(&rq->lock); @@ -3188,6 +3220,7 @@ static void __sched notrace __schedule(bool preempt) unsigned long *switch_count; struct rq *rq; int cpu; + u64 wallclock; cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -3249,6 +3282,9 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); next = pick_next_task(rq, prev); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; @@ -5669,6 +5705,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: + raw_spin_lock_irqsave(&rq->lock, flags); + walt_set_window_start(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); rq->calc_load_update = calc_load_update; break; @@ -5688,6 +5727,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); + walt_migrate_sync_cpu(cpu); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); @@ -7532,6 +7572,7 @@ void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; + walt_init_cpu_efficiency(); alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6efdcad51603..79d54592b53a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -30,11 +30,13 @@ #include #include #include +#include #include #include "sched.h" #include "tune.h" +#include "walt.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -56,6 +58,10 @@ unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_initial_task_util = 0; unsigned int sysctl_sched_cstate_aware = 1; +#ifdef CONFIG_SCHED_WALT +unsigned int sysctl_sched_use_walt_cpu_util = 1; +unsigned int sysctl_sched_use_walt_task_util = 1; +#endif /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -4209,6 +4215,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); flags = ENQUEUE_WAKEUP; } @@ -4216,6 +4223,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; @@ -4230,6 +4238,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP if (!se) { + walt_inc_cumulative_runnable_avg(rq, p); if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; @@ -4279,6 +4288,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; 
cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -4299,6 +4309,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; @@ -4313,6 +4324,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP if (!se) { + walt_dec_cumulative_runnable_avg(rq, p); /* * We want to potentially trigger a freq switch @@ -5207,6 +5219,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) static inline unsigned long task_util(struct task_struct *p) { +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_task_util) { + unsigned long demand = p->ravg.demand; + return (demand << 10) / walt_ravg_window; + } +#endif return p->se.avg.util_avg; } @@ -6599,7 +6617,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env) deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + double_lock_balance(env->src_rq, env->dst_rq); set_task_cpu(p, env->dst_cpu); + double_unlock_balance(env->src_rq, env->dst_rq); } /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 9694204660b7..be700bfa1ae4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -8,6 +8,8 @@ #include #include +#include "walt.h" + int sched_rr_timeslice = RR_TIMESLICE; static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); @@ -1261,6 +1263,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) rt_se->timeout = 0; enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + walt_inc_cumulative_runnable_avg(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1272,6 +1275,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) update_curr_rt(rq); dequeue_rt_entity(rt_se); + walt_dec_cumulative_runnable_avg(rq, p); dequeue_pushable_task(rq, p); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 144d4782a455..bfd99f9bc6f9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -410,6 +410,10 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ +#ifdef CONFIG_SCHED_WALT + u64 cumulative_runnable_avg; +#endif + #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; u64 runtime_expires; @@ -663,6 +667,27 @@ struct rq { u64 max_idle_balance_cost; #endif +#ifdef CONFIG_SCHED_WALT + /* + * max_freq = user or thermal defined maximum + * max_possible_freq = maximum supported by hardware + */ + unsigned int cur_freq, max_freq, min_freq, max_possible_freq; + struct cpumask freq_domain_cpumask; + + u64 cumulative_runnable_avg; + int efficiency; /* Differentiate cpus with different IPC capability */ + int load_scale_factor; + int capacity; + int max_possible_capacity; + u64 window_start; + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; +#endif /* CONFIG_SCHED_WALT */ + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif @@ -1513,6 +1538,10 @@ static inline unsigned long capacity_orig_of(int cpu) return cpu_rq(cpu)->cpu_capacity_orig; } +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int walt_ravg_window; +extern unsigned int walt_disabled; + /* * cpu_util returns the amount of capacity 
of a CPU that is used by CFS * tasks. The unit of the return value must be the one of capacity so we can @@ -1544,6 +1573,11 @@ static inline unsigned long __cpu_util(int cpu, int delta) unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + util = (cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT) / + walt_ravg_window; +#endif delta += util; if (delta < 0) return 0; diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index cbc67da10954..61f852d46858 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -1,4 +1,5 @@ #include "sched.h" +#include "walt.h" /* * stop-task scheduling class. @@ -42,12 +43,14 @@ static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); + walt_inc_cumulative_runnable_avg(rq, p); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + walt_dec_cumulative_runnable_avg(rq, p); } static void yield_task_stop(struct rq *rq) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c new file mode 100644 index 000000000000..1dff3d2e2358 --- /dev/null +++ b/kernel/sched/walt.c @@ -0,0 +1,1098 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * Window Assisted Load Tracking (WALT) implementation credits: + * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park, + * Pavan Kumar Kondeti, Olav Haugan + * + * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla + * and Todd Kjos + */ + +#include +#include +#include +#include "sched.h" +#include "walt.h" + +#define WINDOW_STATS_RECENT 0 +#define WINDOW_STATS_MAX 1 +#define WINDOW_STATS_MAX_RECENT_AVG 2 +#define WINDOW_STATS_AVG 3 +#define WINDOW_STATS_INVALID_POLICY 4 + +#define EXITING_TASK_MARKER 0xdeaddead + +static __read_mostly unsigned int walt_ravg_hist_size = 5; +static __read_mostly unsigned int walt_window_stats_policy = + WINDOW_STATS_MAX_RECENT_AVG; +static __read_mostly unsigned int walt_account_wait_time = 1; +static __read_mostly unsigned int walt_freq_account_wait_time = 0; +static __read_mostly unsigned int walt_io_is_busy = 0; + +unsigned int sysctl_sched_walt_init_task_load_pct = 15; + +/* 1 -> use PELT based load stats, 0 -> use window-based load stats */ +unsigned int __read_mostly walt_disabled = 0; + +static unsigned int max_possible_efficiency = 1024; +static unsigned int min_possible_efficiency = 1024; + +/* + * Maximum possible frequency across all cpus. Task demand and cpu + * capacity (cpu_power) metrics are scaled in reference to it. + */ +static unsigned int max_possible_freq = 1; + +/* + * Minimum possible max_freq across all cpus. This will be same as + * max_possible_freq on homogeneous systems and could be different from + * max_possible_freq on heterogenous systems. min_max_freq is used to derive + * capacity (cpu_power) of cpus. 
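+ * (For example, on a hypothetical big.LITTLE system whose little cores
+ * top out at 1.4GHz and whose big cores reach 2.1GHz, max_possible_freq
+ * ends up 2.1GHz while min_max_freq is 1.4GHz.)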
+ */ +static unsigned int min_max_freq = 1; + +static unsigned int max_capacity = 1024; +static unsigned int min_capacity = 1024; +static unsigned int max_load_scale_factor = 1024; +static unsigned int max_possible_capacity = 1024; + +/* Mask of all CPUs that have max_possible_capacity */ +static cpumask_t mpc_mask = CPU_MASK_ALL; + +/* Window size (in ns) */ +__read_mostly unsigned int walt_ravg_window = 20000000; + +/* Min window size (in ns) = 10ms */ +#define MIN_SCHED_RAVG_WINDOW 10000000 + +/* Max window size (in ns) = 1s */ +#define MAX_SCHED_RAVG_WINDOW 1000000000 + +static unsigned int sync_cpu; +static ktime_t ktime_last; +static bool walt_ktime_suspended; + +static unsigned int task_load(struct task_struct *p) +{ + return p->ravg.demand; +} + +void +walt_inc_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) +{ + rq->cumulative_runnable_avg += p->ravg.demand; +} + +void +walt_dec_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) +{ + rq->cumulative_runnable_avg -= p->ravg.demand; + BUG_ON((s64)rq->cumulative_runnable_avg < 0); +} + +static void +fixup_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p, s64 task_load_delta) +{ + rq->cumulative_runnable_avg += task_load_delta; + if ((s64)rq->cumulative_runnable_avg < 0) + panic("cra less than zero: tld: %lld, task_load(p) = %u\n", + task_load_delta, task_load(p)); +} + +u64 walt_ktime_clock(void) +{ + if (unlikely(walt_ktime_suspended)) + return ktime_to_ns(ktime_last); + return ktime_get_ns(); +} + +static void walt_resume(void) +{ + walt_ktime_suspended = false; +} + +static int walt_suspend(void) +{ + ktime_last = ktime_get(); + walt_ktime_suspended = true; + return 0; +} + +static struct syscore_ops walt_syscore_ops = { + .resume = walt_resume, + .suspend = walt_suspend +}; + +static int __init walt_init_ops(void) +{ + register_syscore_ops(&walt_syscore_ops); + return 0; +} +late_initcall(walt_init_ops); + +void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + cfs_rq->cumulative_runnable_avg += p->ravg.demand; +} + +void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + cfs_rq->cumulative_runnable_avg -= p->ravg.demand; +} + +static int exiting_task(struct task_struct *p) +{ + if (p->flags & PF_EXITING) { + if (p->ravg.sum_history[0] != EXITING_TASK_MARKER) { + p->ravg.sum_history[0] = EXITING_TASK_MARKER; + } + return 1; + } + return 0; +} + +static int __init set_walt_ravg_window(char *str) +{ + get_option(&str, &walt_ravg_window); + + walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW || + walt_ravg_window > MAX_SCHED_RAVG_WINDOW); + return 0; +} + +early_param("walt_ravg_window", set_walt_ravg_window); + +static void +update_window_start(struct rq *rq, u64 wallclock) +{ + s64 delta; + int nr_windows; + + delta = wallclock - rq->window_start; + BUG_ON(delta < 0); + if (delta < walt_ravg_window) + return; + + nr_windows = div64_u64(delta, walt_ravg_window); + rq->window_start += (u64)nr_windows * (u64)walt_ravg_window; +} + +static u64 scale_exec_time(u64 delta, struct rq *rq) +{ + unsigned int cur_freq = rq->cur_freq; + int sf; + + if (unlikely(cur_freq > max_possible_freq)) + cur_freq = rq->max_possible_freq; + + /* round up div64 */ + delta = div64_u64(delta * cur_freq + max_possible_freq - 1, + max_possible_freq); + + sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency); + + delta *= sf; + delta >>= 10; + + return delta; +} + +static int cpu_is_waiting_on_io(struct rq *rq) +{ + if 
(!walt_io_is_busy) + return 0; + + return atomic_read(&rq->nr_iowait); +} + +static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, + u64 irqtime, int event) +{ + if (is_idle_task(p)) { + /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ + if (event == PICK_NEXT_TASK) + return 0; + + /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ + return irqtime || cpu_is_waiting_on_io(rq); + } + + if (event == TASK_WAKE) + return 0; + + if (event == PUT_PREV_TASK || event == IRQ_UPDATE || + event == TASK_UPDATE) + return 1; + + /* Only TASK_MIGRATE && PICK_NEXT_TASK left */ + return walt_freq_account_wait_time; +} + +/* + * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) + */ +static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + int new_window, nr_full_windows = 0; + int p_is_curr_task = (p == rq->curr); + u64 mark_start = p->ravg.mark_start; + u64 window_start = rq->window_start; + u32 window_size = walt_ravg_window; + u64 delta; + + new_window = mark_start < window_start; + if (new_window) { + nr_full_windows = div64_u64((window_start - mark_start), + window_size); + if (p->ravg.active_windows < USHRT_MAX) + p->ravg.active_windows++; + } + + /* Handle per-task window rollover. We don't care about the idle + * task or exiting tasks. */ + if (new_window && !is_idle_task(p) && !exiting_task(p)) { + u32 curr_window = 0; + + if (!nr_full_windows) + curr_window = p->ravg.curr_window; + + p->ravg.prev_window = curr_window; + p->ravg.curr_window = 0; + } + + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) { + /* account_busy_for_cpu_time() = 0, so no update to the + * task's current window needs to be made. This could be + * for example + * + * - a wakeup event on a task within the current + * window (!new_window below, no action required), + * - switching to a new task from idle (PICK_NEXT_TASK) + * in a new window where irqtime is 0 and we aren't + * waiting on IO */ + + if (!new_window) + return; + + /* A new window has started. The RQ demand must be rolled + * over if p is the current task. */ + if (p_is_curr_task) { + u64 prev_sum = 0; + + /* p is either idle task or an exiting task */ + if (!nr_full_windows) { + prev_sum = rq->curr_runnable_sum; + } + + rq->prev_runnable_sum = prev_sum; + rq->curr_runnable_sum = 0; + } + + return; + } + + if (!new_window) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. No rollover + * since we didn't start a new window. An example of this is + * when a task starts execution and then sleeps within the + * same window. */ + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) + delta = wallclock - mark_start; + else + delta = irqtime; + delta = scale_exec_time(delta, rq); + rq->curr_runnable_sum += delta; + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window += delta; + + return; + } + + if (!p_is_curr_task) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has also started, but p is not the current task, so the + * window is not rolled over - just split up and account + * as necessary into curr and prev. The window is only + * rolled over when a new window is processed for the current + * task. + * + * Irqtime can't be accounted by a task that isn't the + * currently running task. 
*/ + + if (!nr_full_windows) { + /* A full window hasn't elapsed, account partial + * contribution to previous completed window. */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!exiting_task(p)) + p->ravg.prev_window += delta; + } else { + /* Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). */ + delta = scale_exec_time(window_size, rq); + if (!exiting_task(p)) + p->ravg.prev_window = delta; + } + rq->prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + rq->curr_runnable_sum += delta; + if (!exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. If any of these three above conditions are true + * then this busy time can't be accounted as irqtime. + * + * Busy time for the idle task or exiting tasks need not + * be accounted. + * + * An example of this would be a task that starts execution + * and then sleeps once a new window has begun. */ + + if (!nr_full_windows) { + /* A full window hasn't elapsed, account partial + * contribution to previous completed window. */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window += delta; + + delta += rq->curr_runnable_sum; + } else { + /* Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). */ + delta = scale_exec_time(window_size, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window = delta; + + } + /* + * Rollover for normal runnable sum is done here by overwriting + * the values in prev_runnable_sum and curr_runnable_sum. + * Rollover for new task runnable sum has completed by previous + * if-else statement. + */ + rq->prev_runnable_sum = delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + rq->curr_runnable_sum = delta; + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (irqtime) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. The current task must be the idle task because + * irqtime is not accounted for any other task. + * + * Irqtime will be accounted each time we process IRQ activity + * after a period of idleness, so we know the IRQ busy time + * started at wallclock - irqtime. */ + + BUG_ON(!is_idle_task(p)); + mark_start = wallclock - irqtime; + + /* Roll window over. If IRQ busy time was just in the current + * window then that is all that need be accounted. */ + rq->prev_runnable_sum = rq->curr_runnable_sum; + if (mark_start > window_start) { + rq->curr_runnable_sum = scale_exec_time(irqtime, rq); + return; + } + + /* The IRQ busy time spanned multiple windows. Process the + * busy time preceding the current window start first. */ + delta = window_start - mark_start; + if (delta > window_size) + delta = window_size; + delta = scale_exec_time(delta, rq); + rq->prev_runnable_sum += delta; + + /* Process the remaining IRQ busy time in the current window. 
*/ + delta = wallclock - window_start; + rq->curr_runnable_sum = scale_exec_time(delta, rq); + + return; + } + + BUG(); +} + +static int account_busy_for_task_demand(struct task_struct *p, int event) +{ + /* No need to bother updating task demand for exiting tasks + * or the idle task. */ + if (exiting_task(p) || is_idle_task(p)) + return 0; + + /* When a task is waking up it is completing a segment of non-busy + * time. Likewise, if wait time is not treated as busy time, then + * when a task begins to run or is migrated, it is not running and + * is completing a segment of non-busy time. */ + if (event == TASK_WAKE || (!walt_account_wait_time && + (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) + return 0; + + return 1; +} + +/* + * Called when new window is starting for a task, to record cpu usage over + * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 + * when, say, a real-time task runs without preemption for several windows at a + * stretch. + */ +static void update_history(struct rq *rq, struct task_struct *p, + u32 runtime, int samples, int event) +{ + u32 *hist = &p->ravg.sum_history[0]; + int ridx, widx; + u32 max = 0, avg, demand; + u64 sum = 0; + + /* Ignore windows where task had no activity */ + if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) + goto done; + + /* Push new 'runtime' value onto stack */ + widx = walt_ravg_hist_size - 1; + ridx = widx - samples; + for (; ridx >= 0; --widx, --ridx) { + hist[widx] = hist[ridx]; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) { + hist[widx] = runtime; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + p->ravg.sum = 0; + + if (walt_window_stats_policy == WINDOW_STATS_RECENT) { + demand = runtime; + } else if (walt_window_stats_policy == WINDOW_STATS_MAX) { + demand = max; + } else { + avg = div64_u64(sum, walt_ravg_hist_size); + if (walt_window_stats_policy == WINDOW_STATS_AVG) + demand = avg; + else + demand = max(avg, runtime); + } + + /* + * A throttled deadline sched class task gets dequeued without + * changing p->on_rq. Since the dequeue decrements hmp stats + * avoid decrementing it here again. + */ + if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || + !p->dl.dl_throttled)) + fixup_cumulative_runnable_avg(rq, p, demand); + + p->ravg.demand = demand; + +done: + trace_walt_update_history(rq, p, runtime, samples, event); + return; +} + +static void add_to_task_demand(struct rq *rq, struct task_struct *p, + u64 delta) +{ + delta = scale_exec_time(delta, rq); + p->ravg.sum += delta; + if (unlikely(p->ravg.sum > walt_ravg_window)) + p->ravg.sum = walt_ravg_window; +} + +/* + * Account cpu demand of task and/or update task's cpu demand history + * + * ms = p->ravg.mark_start; + * wc = wallclock + * ws = rq->window_start + * + * Three possibilities: + * + * a) Task event is contained within one window. + * window_start < mark_start < wallclock + * + * ws ms wc + * | | | + * V V V + * |---------------| + * + * In this case, p->ravg.sum is updated *iff* event is appropriate + * (ex: event == PUT_PREV_TASK) + * + * b) Task event spans two windows. + * mark_start < window_start < wallclock + * + * ms ws wc + * | | | + * V V V + * -----|------------------- + * + * In this case, p->ravg.sum is updated with (ws - ms) *iff* event + * is appropriate, then a new window sample is recorded followed + * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. 
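+ * (Concretely, with a 20ms window: if ms falls 15ms into the old
+ * window and wc falls 5ms into the new one, p->ravg.sum first gains
+ * the scaled 5ms up to ws, that sum is recorded as the closed
+ * window's sample, and p->ravg.sum then restarts with the scaled
+ * 5ms between ws and wc.)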
+ * + * c) Task event spans more than two windows. + * + * ms ws_tmp ws wc + * | | | | + * V V V V + * ---|-------|-------|-------|-------|------ + * | | + * |<------ nr_full_windows ------>| + * + * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* + * event is appropriate, window sample of p->ravg.sum is recorded, + * 'nr_full_window' samples of window_size is also recorded *iff* + * event is appropriate and finally p->ravg.sum is set to (wc - ws) + * *iff* event is appropriate. + * + * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() + * depends on it! + */ +static void update_task_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + u64 mark_start = p->ravg.mark_start; + u64 delta, window_start = rq->window_start; + int new_window, nr_full_windows; + u32 window_size = walt_ravg_window; + + new_window = mark_start < window_start; + if (!account_busy_for_task_demand(p, event)) { + if (new_window) + /* If the time accounted isn't being accounted as + * busy time, and a new window started, only the + * previous window need be closed out with the + * pre-existing demand. Multiple windows may have + * elapsed, but since empty windows are dropped, + * it is not necessary to account those. */ + update_history(rq, p, p->ravg.sum, 1, event); + return; + } + + if (!new_window) { + /* The simple case - busy time contained within the existing + * window. */ + add_to_task_demand(rq, p, wallclock - mark_start); + return; + } + + /* Busy time spans at least two windows. Temporarily rewind + * window_start to first window boundary after mark_start. */ + delta = window_start - mark_start; + nr_full_windows = div64_u64(delta, window_size); + window_start -= (u64)nr_full_windows * (u64)window_size; + + /* Process (window_start - mark_start) first */ + add_to_task_demand(rq, p, window_start - mark_start); + + /* Push new sample(s) into task's demand history */ + update_history(rq, p, p->ravg.sum, 1, event); + if (nr_full_windows) + update_history(rq, p, scale_exec_time(window_size, rq), + nr_full_windows, event); + + /* Roll window_start back to current to process any remainder + * in current window. 
*/ + window_start += (u64)nr_full_windows * (u64)window_size; + + /* Process (wallclock - window_start) next */ + mark_start = window_start; + add_to_task_demand(rq, p, wallclock - mark_start); +} + +/* Reflect task activity on its demand and cpu's busy time statistics */ +void walt_update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + if (walt_disabled || !rq->window_start) + return; + + lockdep_assert_held(&rq->lock); + + update_window_start(rq, wallclock); + + if (!p->ravg.mark_start) + goto done; + + update_task_demand(p, rq, event, wallclock); + update_cpu_busy_time(p, rq, event, wallclock, irqtime); + +done: + trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime); + + p->ravg.mark_start = wallclock; +} + +unsigned long __weak arch_get_cpu_efficiency(int cpu) +{ + return SCHED_LOAD_SCALE; +} + +void walt_init_cpu_efficiency(void) +{ + int i, efficiency; + unsigned int max = 0, min = UINT_MAX; + + for_each_possible_cpu(i) { + efficiency = arch_get_cpu_efficiency(i); + cpu_rq(i)->efficiency = efficiency; + + if (efficiency > max) + max = efficiency; + if (efficiency < min) + min = efficiency; + } + + if (max) + max_possible_efficiency = max; + + if (min) + min_possible_efficiency = min; +} + +static void reset_task_stats(struct task_struct *p) +{ + u32 sum = 0; + + if (exiting_task(p)) + sum = EXITING_TASK_MARKER; + + memset(&p->ravg, 0, sizeof(struct ravg)); + /* Retain EXITING_TASK marker */ + p->ravg.sum_history[0] = sum; +} + +void walt_mark_task_starting(struct task_struct *p) +{ + u64 wallclock; + struct rq *rq = task_rq(p); + + if (!rq->window_start) { + reset_task_stats(p); + return; + } + + wallclock = walt_ktime_clock(); + p->ravg.mark_start = wallclock; +} + +void walt_set_window_start(struct rq *rq) +{ + int cpu = cpu_of(rq); + struct rq *sync_rq = cpu_rq(sync_cpu); + + if (rq->window_start) + return; + + if (cpu == sync_cpu) { + rq->window_start = walt_ktime_clock(); + } else { + raw_spin_unlock(&rq->lock); + double_rq_lock(rq, sync_rq); + rq->window_start = cpu_rq(sync_cpu)->window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + raw_spin_unlock(&sync_rq->lock); + } + + rq->curr->ravg.mark_start = rq->window_start; +} + +void walt_migrate_sync_cpu(int cpu) +{ + if (cpu == sync_cpu) + sync_cpu = smp_processor_id(); +} + +void walt_fixup_busy_time(struct task_struct *p, int new_cpu) +{ + struct rq *src_rq = task_rq(p); + struct rq *dest_rq = cpu_rq(new_cpu); + u64 wallclock; + + if (!p->on_rq && p->state != TASK_WAKING) + return; + + if (exiting_task(p)) { + return; + } + + if (p->state == TASK_WAKING) + double_rq_lock(src_rq, dest_rq); + + wallclock = walt_ktime_clock(); + + walt_update_task_ravg(task_rq(p)->curr, task_rq(p), + TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(dest_rq->curr, dest_rq, + TASK_UPDATE, wallclock, 0); + + walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0); + + if (p->ravg.curr_window) { + src_rq->curr_runnable_sum -= p->ravg.curr_window; + dest_rq->curr_runnable_sum += p->ravg.curr_window; + } + + if (p->ravg.prev_window) { + src_rq->prev_runnable_sum -= p->ravg.prev_window; + dest_rq->prev_runnable_sum += p->ravg.prev_window; + } + + if ((s64)src_rq->prev_runnable_sum < 0) { + src_rq->prev_runnable_sum = 0; + WARN_ON(1); + } + if ((s64)src_rq->curr_runnable_sum < 0) { + src_rq->curr_runnable_sum = 0; + WARN_ON(1); + } + + trace_walt_migration_update_sum(src_rq, p); + trace_walt_migration_update_sum(dest_rq, p); + + if (p->state == TASK_WAKING) + 
double_rq_unlock(src_rq, dest_rq); +} + +/* Keep track of max/min capacity possible across CPUs "currently" */ +static void __update_min_max_capacity(void) +{ + int i; + int max = 0, min = INT_MAX; + + for_each_online_cpu(i) { + if (cpu_rq(i)->capacity > max) + max = cpu_rq(i)->capacity; + if (cpu_rq(i)->capacity < min) + min = cpu_rq(i)->capacity; + } + + max_capacity = max; + min_capacity = min; +} + +static void update_min_max_capacity(void) +{ + unsigned long flags; + int i; + + local_irq_save(flags); + for_each_possible_cpu(i) + raw_spin_lock(&cpu_rq(i)->lock); + + __update_min_max_capacity(); + + for_each_possible_cpu(i) + raw_spin_unlock(&cpu_rq(i)->lock); + local_irq_restore(flags); +} + +/* + * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that + * least efficient cpu gets capacity of 1024 + */ +static unsigned long capacity_scale_cpu_efficiency(int cpu) +{ + return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency; +} + +/* + * Return 'capacity' of a cpu in reference to cpu with lowest max_freq + * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. + */ +static unsigned long capacity_scale_cpu_freq(int cpu) +{ + return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq; +} + +/* + * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so + * that "most" efficient cpu gets a load_scale_factor of 1 + */ +static unsigned long load_scale_cpu_efficiency(int cpu) +{ + return DIV_ROUND_UP(1024 * max_possible_efficiency, + cpu_rq(cpu)->efficiency); +} + +/* + * Return load_scale_factor of a cpu in reference to cpu with best max_freq + * (max_possible_freq), so that one with best max_freq gets a load_scale_factor + * of 1. + */ +static unsigned long load_scale_cpu_freq(int cpu) +{ + return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq); +} + +static int compute_capacity(int cpu) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cpu); + capacity >>= 10; + + capacity *= capacity_scale_cpu_freq(cpu); + capacity >>= 10; + + return capacity; +} + +static int compute_load_scale_factor(int cpu) +{ + int load_scale = 1024; + + /* + * load_scale_factor accounts for the fact that task load + * is in reference to "best" performing cpu. Task's load will need to be + * scaled (up) by a factor to determine suitability to be placed on a + * (little) cpu. + */ + load_scale *= load_scale_cpu_efficiency(cpu); + load_scale >>= 10; + + load_scale *= load_scale_cpu_freq(cpu); + load_scale >>= 10; + + return load_scale; +} + +static int cpufreq_notifier_policy(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = (struct cpufreq_policy *)data; + int i, update_max = 0; + u64 highest_mpc = 0, highest_mplsf = 0; + const struct cpumask *cpus = policy->related_cpus; + unsigned int orig_min_max_freq = min_max_freq; + unsigned int orig_max_possible_freq = max_possible_freq; + /* Initialized to policy->max in case policy->related_cpus is empty! 
*/ + unsigned int orig_max_freq = policy->max; + + if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && + val != CPUFREQ_CREATE_POLICY) + return 0; + + if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { + update_min_max_capacity(); + return 0; + } + + for_each_cpu(i, policy->related_cpus) { + cpumask_copy(&cpu_rq(i)->freq_domain_cpumask, + policy->related_cpus); + orig_max_freq = cpu_rq(i)->max_freq; + cpu_rq(i)->min_freq = policy->min; + cpu_rq(i)->max_freq = policy->max; + cpu_rq(i)->cur_freq = policy->cur; + cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq; + } + + max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); + if (min_max_freq == 1) + min_max_freq = UINT_MAX; + min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); + BUG_ON(!min_max_freq); + BUG_ON(!policy->max); + + /* Changes to policy other than max_freq don't require any updates */ + if (orig_max_freq == policy->max) + return 0; + + /* + * A changed min_max_freq or max_possible_freq (possible during bootup) + * needs to trigger re-computation of load_scale_factor and capacity for + * all possible cpus (even those offline). It also needs to trigger + * re-computation of nr_big_task count on all online cpus. + * + * A changed rq->max_freq otoh needs to trigger re-computation of + * load_scale_factor and capacity for just the cluster of cpus involved. + * Since small task definition depends on max_load_scale_factor, a + * changed load_scale_factor of one cluster could influence + * classification of tasks in another cluster. Hence a changed + * rq->max_freq will need to trigger re-computation of nr_big_task + * count on all online cpus. + * + * While it should be sufficient for nr_big_tasks to be + * re-computed for only online cpus, we have inadequate context + * information here (in policy notifier) with regard to hotplug-safety + * context in which notification is issued. As a result, we can't use + * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is + * fixed up to issue notification always in hotplug-safe context, + * re-compute nr_big_task for all possible cpus. + */ + + if (orig_min_max_freq != min_max_freq || + orig_max_possible_freq != max_possible_freq) { + cpus = cpu_possible_mask; + update_max = 1; + } + + /* + * Changed load_scale_factor can trigger reclassification of tasks as + * big or small. 
Make this change "atomic" so that tasks are accounted + * properly due to changed load_scale_factor + */ + for_each_cpu(i, cpus) { + struct rq *rq = cpu_rq(i); + + rq->capacity = compute_capacity(i); + rq->load_scale_factor = compute_load_scale_factor(i); + + if (update_max) { + u64 mpc, mplsf; + + mpc = div_u64(((u64) rq->capacity) * + rq->max_possible_freq, rq->max_freq); + rq->max_possible_capacity = (int) mpc; + + mplsf = div_u64(((u64) rq->load_scale_factor) * + rq->max_possible_freq, rq->max_freq); + + if (mpc > highest_mpc) { + highest_mpc = mpc; + cpumask_clear(&mpc_mask); + cpumask_set_cpu(i, &mpc_mask); + } else if (mpc == highest_mpc) { + cpumask_set_cpu(i, &mpc_mask); + } + + if (mplsf > highest_mplsf) + highest_mplsf = mplsf; + } + } + + if (update_max) { + max_possible_capacity = highest_mpc; + max_load_scale_factor = highest_mplsf; + } + + __update_min_max_capacity(); + + return 0; +} + +static int cpufreq_notifier_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; + unsigned int cpu = freq->cpu, new_freq = freq->new; + unsigned long flags; + int i; + + if (val != CPUFREQ_POSTCHANGE) + return 0; + + BUG_ON(!new_freq); + + if (cpu_rq(cpu)->cur_freq == new_freq) + return 0; + + for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) { + struct rq *rq = cpu_rq(i); + + raw_spin_lock_irqsave(&rq->lock, flags); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); + rq->cur_freq = new_freq; + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + return 0; +} + +static struct notifier_block notifier_policy_block = { + .notifier_call = cpufreq_notifier_policy +}; + +static struct notifier_block notifier_trans_block = { + .notifier_call = cpufreq_notifier_trans +}; + +static int register_sched_callback(void) +{ + int ret; + + ret = cpufreq_register_notifier(¬ifier_policy_block, + CPUFREQ_POLICY_NOTIFIER); + + if (!ret) + ret = cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + + return 0; +} + +/* + * cpufreq callbacks can be registered at core_initcall or later time. + * Any registration done prior to that is "forgotten" by cpufreq. See + * initialization of variable init_cpufreq_transition_notifier_list_called + * for further information. + */ +core_initcall(register_sched_callback); + +void walt_init_new_task_load(struct task_struct *p) +{ + int i; + u32 init_load_windows = + div64_u64((u64)sysctl_sched_walt_init_task_load_pct * + (u64)walt_ravg_window, 100); + u32 init_load_pct = current->init_load_pct; + + p->init_load_pct = 0; + memset(&p->ravg, 0, sizeof(struct ravg)); + + if (init_load_pct) { + init_load_windows = div64_u64((u64)init_load_pct * + (u64)walt_ravg_window, 100); + } + + p->ravg.demand = init_load_windows; + for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) + p->ravg.sum_history[i] = init_load_windows; +} diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h new file mode 100644 index 000000000000..cabc193a683d --- /dev/null +++ b/kernel/sched/walt.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + */ + +#ifndef __WALT_H +#define __WALT_H + +#ifdef CONFIG_SCHED_WALT + +void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime); +void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p); +void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p); +void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p); +void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p); +void walt_fixup_busy_time(struct task_struct *p, int new_cpu); +void walt_init_new_task_load(struct task_struct *p); +void walt_mark_task_starting(struct task_struct *p); +void walt_set_window_start(struct rq *rq); +void walt_migrate_sync_cpu(int cpu); +void walt_init_cpu_efficiency(void); +u64 walt_ktime_clock(void); + +#else /* CONFIG_SCHED_WALT */ + +static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) { } +static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { } +static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { } +static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p) { } +static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p) { } +static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { } +static inline void walt_init_new_task_load(struct task_struct *p) { } +static inline void walt_mark_task_starting(struct task_struct *p) { } +static inline void walt_set_window_start(struct rq *rq) { } +static inline void walt_migrate_sync_cpu(int cpu) { } +static inline void walt_init_cpu_efficiency(void) { } +static inline u64 walt_ktime_clock(void) { return 0; } + +#endif /* CONFIG_SCHED_WALT */ + +extern unsigned int walt_disabled; + +#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4457e107ec20..43c59da62f1e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -310,6 +310,29 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_WALT + { + .procname = "sched_use_walt_cpu_util", + .data = &sysctl_sched_use_walt_cpu_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_use_walt_task_util", + .data = &sysctl_sched_use_walt_task_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_walt_init_task_load_pct", + .data = &sysctl_sched_walt_init_task_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif { .procname = "sched_sync_hint_enable", .data = &sysctl_sched_sync_hint_enable, From 519c62750eb6ebbb5783315272398ced72d7a036 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Fri, 22 Jul 2016 13:21:15 +0100 Subject: [PATCH 0219/3220] sched/walt: Accounting for number of irqs pending on each core Schedules on a core whose irq count is less than a threshold. Improves I/O performance of EAS. 
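The decay rule used for the per-cpu irq load can be read in isolation from the walt_account_irqtime() hunk below; the following standalone sketch (types reduced, behaviour mirrored, not part of the patch) also makes explicit that multiplying by 3 * nr_windows and dividing by 4 * nr_windows reduces to a single 3/4 decay step however many windows elapsed:

    typedef unsigned long long u64;

    /* Decay the running irq-load average once at least one jiffy window
     * has passed; after ten or more idle windows, simply start over. */
    static u64 decay_irqload(u64 avg, u64 cur, u64 nr_windows)
    {
            if (!nr_windows)
                    return avg;     /* same window: keep accumulating cur */
            if (nr_windows < 10)
                    avg = (avg * 3 * nr_windows) / (4 * nr_windows);
            else
                    avg = 0;
            return avg + cur;       /* fold the last window's load in */
    }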
Change-Id: I08ff7dd0d22502a0106fc636b1af2e6fe9e758b5 --- include/linux/sched/sysctl.h | 1 + kernel/sched/core.c | 5 +++ kernel/sched/cputime.c | 16 +++++++++ kernel/sched/fair.c | 7 +++- kernel/sched/sched.h | 3 ++ kernel/sched/walt.c | 65 ++++++++++++++++++++++++++++++++++++ kernel/sched/walt.h | 5 +++ kernel/sysctl.c | 7 ++++ 8 files changed, 108 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 710f58a28d63..d68e88c9d4d7 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -47,6 +47,7 @@ extern unsigned int sysctl_sched_cstate_aware; extern unsigned int sysctl_sched_use_walt_cpu_util; extern unsigned int sysctl_sched_use_walt_task_util; extern unsigned int sysctl_sched_walt_init_task_load_pct; +extern unsigned int sysctl_sched_walt_cpu_high_irqload; #endif enum sched_tunable_scaling { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b6f41850ed00..deb45d1b4e49 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7750,6 +7750,11 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq->max_idle_balance_cost = sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_WALT + rq->cur_irqload = 0; + rq->avg_irqload = 0; + rq->irqload_ts = 0; +#endif INIT_LIST_HEAD(&rq->cfs_tasks); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 05de80b48586..442a9f7a2832 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -5,6 +5,7 @@ #include #include #include "sched.h" +#include "walt.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -49,6 +50,10 @@ void irqtime_account_irq(struct task_struct *curr) unsigned long flags; s64 delta; int cpu; +#ifdef CONFIG_SCHED_WALT + u64 wallclock; + bool account = true; +#endif if (!sched_clock_irqtime) return; @@ -56,6 +61,9 @@ void irqtime_account_irq(struct task_struct *curr) local_irq_save(flags); cpu = smp_processor_id(); +#ifdef CONFIG_SCHED_WALT + wallclock = sched_clock_cpu(cpu); +#endif delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); __this_cpu_add(irq_start_time, delta); @@ -70,8 +78,16 @@ void irqtime_account_irq(struct task_struct *curr) __this_cpu_add(cpu_hardirq_time, delta); else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) __this_cpu_add(cpu_softirq_time, delta); +#ifdef CONFIG_SCHED_WALT + else + account = false; +#endif irq_time_write_end(); +#ifdef CONFIG_SCHED_WALT + if (account) + walt_account_irqtime(cpu, curr, delta, wallclock); +#endif local_irq_restore(flags); } EXPORT_SYMBOL_GPL(irqtime_account_irq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 79d54592b53a..0ba5653682e8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -61,6 +61,8 @@ unsigned int sysctl_sched_cstate_aware = 1; #ifdef CONFIG_SCHED_WALT unsigned int sysctl_sched_use_walt_cpu_util = 1; unsigned int sysctl_sched_use_walt_task_util = 1; +__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = + (10 * NSEC_PER_MSEC); #endif /* * The initial- and re-scaling of tunables is configurable @@ -4258,7 +4260,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) schedtune_enqueue_task(p, cpu_of(rq)); #endif /* CONFIG_SMP */ - hrtick_update(rq); } @@ -5627,6 +5628,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted) if (new_util > capacity_orig_of(i)) continue; +#ifdef CONFIG_SCHED_WALT + if (walt_cpu_high_irqload(i)) + continue; +#endif /* * For boosted tasks we favor idle cpus unconditionally to * improve latency. 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bfd99f9bc6f9..911d3a94690e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -685,6 +685,9 @@ struct rq { u64 prev_runnable_sum; u64 nt_curr_runnable_sum; u64 nt_prev_runnable_sum; + u64 cur_irqload; + u64 avg_irqload; + u64 irqload_ts; #endif /* CONFIG_SCHED_WALT */ diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 1dff3d2e2358..b9ae8d5c4393 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -221,6 +221,71 @@ static int cpu_is_waiting_on_io(struct rq *rq) return atomic_read(&rq->nr_iowait); } +void walt_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags, nr_windows; + u64 cur_jiffies_ts; + + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * cputime (wallclock) uses sched_clock so use the same here for + * consistency. + */ + delta += sched_clock() - wallclock; + cur_jiffies_ts = get_jiffies_64(); + + if (is_idle_task(curr)) + walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(), + delta); + + nr_windows = cur_jiffies_ts - rq->irqload_ts; + + if (nr_windows) { + if (nr_windows < 10) { + /* Decay CPU's irqload by 3/4 for each window. */ + rq->avg_irqload *= (3 * nr_windows); + rq->avg_irqload = div64_u64(rq->avg_irqload, + 4 * nr_windows); + } else { + rq->avg_irqload = 0; + } + rq->avg_irqload += rq->cur_irqload; + rq->cur_irqload = 0; + } + + rq->cur_irqload += delta; + rq->irqload_ts = cur_jiffies_ts; + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + + +#define WALT_HIGH_IRQ_TIMEOUT 3 + +u64 walt_irqload(int cpu) { + struct rq *rq = cpu_rq(cpu); + s64 delta; + delta = get_jiffies_64() - rq->irqload_ts; + + /* + * Current context can be preempted by irq and rq->irqload_ts can be + * updated by irq context so that delta can be negative. + * But this is okay and we can safely return as this means there + * was recent irq occurrence. 
+ */ + + if (delta < WALT_HIGH_IRQ_TIMEOUT) + return rq->avg_irqload; + else + return 0; +} + +int walt_cpu_high_irqload(int cpu) { + return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload; +} + static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, u64 irqtime, int event) { diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h index cabc193a683d..e181c87a928d 100644 --- a/kernel/sched/walt.h +++ b/kernel/sched/walt.h @@ -31,6 +31,11 @@ void walt_set_window_start(struct rq *rq); void walt_migrate_sync_cpu(int cpu); void walt_init_cpu_efficiency(void); u64 walt_ktime_clock(void); +void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta, + u64 wallclock); + +u64 walt_irqload(int cpu); +int walt_cpu_high_irqload(int cpu); #else /* CONFIG_SCHED_WALT */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 43c59da62f1e..2b89e8ff0688 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -332,6 +332,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "sched_walt_cpu_high_irqload", + .data = &sysctl_sched_walt_cpu_high_irqload, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif { .procname = "sched_sync_hint_enable", From dfc1151b46bb5862154a8be2be867dbc1c8aaeb8 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 30 Jun 2016 15:00:41 +0100 Subject: [PATCH 0220/3220] FIXUP: sched: fix set_cfs_cpu_capacity when WALT is in use The CPU utilization reported when WALT is in use already tracks the contributions due to RT and DL workloads. However, SchedFreq exposes different capacity update functions, one for each class, and aggregates the classes' utilization internally at update_cpu_capacity_request() call time. This patch ensures that when WALT is in use, the cpu_sched_capacity_reqs::cfs value tracks just the load generated by SCHED_OTHER tasks. Change-Id: Ibd9c9a10874a1d91f62477034548f7664e57cd6a Signed-off-by: Patrick Bellasi --- kernel/sched/sched.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 911d3a94690e..3111d74a5a85 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1611,8 +1611,27 @@ void update_cpu_capacity_request(int cpu, bool request); static inline void set_cfs_cpu_capacity(int cpu, bool request, unsigned long capacity) { - if (per_cpu(cpu_sched_capacity_reqs, cpu).cfs != capacity) { - per_cpu(cpu_sched_capacity_reqs, cpu).cfs = capacity; + struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + int rtdl = scr->rt + scr->dl; + /* + * WALT tracks the utilization of a CPU considering the load + * generated by all the scheduling classes. + * Since the following call to: + * update_cpu_capacity + * is already adding the RT and DL utilizations let's remove + * these contributions from the WALT signal. + */ + if (capacity > rtdl) + capacity -= rtdl; + else + capacity = 0; + } +#endif + if (scr->cfs != capacity) { + scr->cfs = capacity; update_cpu_capacity_request(cpu, request); } } From 782c9d64a1d12aaf3d6decf7b54e212343bb3f7e Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 29 Jun 2016 11:30:07 -0700 Subject: [PATCH 0221/3220] sched: EAS: Avoid causing spikes to max-freq unnecessarily During scheduler tick handling, the frequency was being set to max-freq if the current frequency is less than the current utilization.
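A quick worked example of the new request (capacity_margin = 1280, the EAS default, is assumed here; it is defined elsewhere in fair.c, and the numbers are illustrative):

    #include <stdio.h>

    #define SCHED_CAPACITY_SCALE    1024
    static unsigned long capacity_margin = 1280;    /* assumed EAS default */

    /* Margined CFS+RT demand plus DL, as in sum_capacity_reqs() below. */
    static unsigned long sum_caps(unsigned long cfs, unsigned long rt,
                                  unsigned long dl)
    {
            return (cfs + rt) * capacity_margin / SCHED_CAPACITY_SCALE + dl;
    }

    int main(void)
    {
            /* cfs=300, rt=50, dl=100: request 537 rather than the max 1024. */
            printf("request = %lu\n", sum_caps(300, 50, 100));
            return 0;
    }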
Change to just request "right" frequency instead of max. BUG: 29871410 Change-Id: I6fe65b14413da44b1520ba116f72320083eb92f8 --- kernel/sched/core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index deb45d1b4e49..f07615a1674f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2940,7 +2940,7 @@ static unsigned long sum_capacity_reqs(unsigned long cfs_cap, static void sched_freq_tick(int cpu) { struct sched_capacity_reqs *scr; - unsigned long capacity_orig, capacity_curr; + unsigned long capacity_orig, capacity_curr, capacity_sum; if (!sched_freq()) return; @@ -2953,12 +2953,15 @@ static void sched_freq_tick(int cpu) /* * To make free room for a task that is building up its "real" * utilization and to harm its performance the least, request - * a jump to max OPP as soon as the margin of free capacity is - * impacted (specified by capacity_margin). + * a jump to a higher OPP as soon as the margin of free capacity + * is impacted (specified by capacity_margin). */ + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - if (capacity_curr < sum_capacity_reqs(cpu_util(cpu), scr)) - set_cfs_cpu_capacity(cpu, true, capacity_max); + capacity_sum = sum_capacity_reqs(cpu_util(cpu), scr); + if (capacity_curr < capacity_sum) { + set_cfs_cpu_capacity(cpu, true, capacity_sum); + } } #else static inline void sched_freq_tick(int cpu) { } From 5156b672040dfd29ba141e02689e1e92e34d525f Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 30 Jun 2016 15:09:24 +0100 Subject: [PATCH 0222/3220] FIXUP: sched: fix SchedFreq integration for both PELT and WALT The current kernel allows to use either PELT or WALT to track CPUs utilizations. One of the main differences between the two approaches is that PELT tracks only utilization of SCHED_OTHER classes while WALT tracks all tasks with a single signal. The current sched_freq_tick does not make this distinction and, when WALT is in use, we end up adding multiple time the contribution related to the RT and DL classes. This patch fixes this issue by: 1. providing two different code paths for PELT and WALT, thus granting that when we switch to PELT we get the original behaviour based on the assumption that class aggregations is done underneath by SchedFreq. 2. avoiding the double accounting of DL and RT workloads, when WALT is in use, by just adding a margin to the original WALT signal when we need to check if the CFS capacity has to be increased. 
Change-Id: I7326fd50e868e97fb5e12351917e9d2969bfdae7 Signed-off-by: Patrick Bellasi --- kernel/sched/core.c | 91 +++++++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 23 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f07615a1674f..02743ffe7ff3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2926,21 +2926,77 @@ unsigned long long task_sched_runtime(struct task_struct *p) } #ifdef CONFIG_CPU_FREQ_GOV_SCHED -static unsigned long sum_capacity_reqs(unsigned long cfs_cap, - struct sched_capacity_reqs *scr) -{ - unsigned long total = cfs_cap + scr->rt; - total = total * capacity_margin; - total /= SCHED_CAPACITY_SCALE; - total += scr->dl; - return total; +static inline +unsigned long add_capacity_margin(unsigned long cpu_capacity) +{ + cpu_capacity = cpu_capacity * capacity_margin; + cpu_capacity /= SCHED_CAPACITY_SCALE; + return cpu_capacity; } +static inline +unsigned long sum_capacity_reqs(unsigned long cfs_cap, + struct sched_capacity_reqs *scr) +{ + unsigned long total = add_capacity_margin(cfs_cap + scr->rt); + return total += scr->dl; +} + +static void sched_freq_tick_pelt(int cpu) +{ + unsigned long cpu_utilization = capacity_max; + unsigned long capacity_curr = capacity_curr_of(cpu); + struct sched_capacity_reqs *scr; + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr) + return; + + /* + * To make free room for a task that is building up its "real" + * utilization and to harm its performance the least, request + * a jump to a higher OPP as soon as the margin of free capacity + * is impacted (specified by capacity_margin). + */ + set_cfs_cpu_capacity(cpu, true, cpu_utilization); +} + +#ifdef CONFIG_SCHED_WALT +static void sched_freq_tick_walt(int cpu) +{ + unsigned long cpu_utilization = cpu_util(cpu); + unsigned long capacity_curr = capacity_curr_of(cpu); + + if (walt_disabled || !sysctl_sched_use_walt_cpu_util) + return sched_freq_tick_pelt(cpu); + + /* + * Add a margin to the WALT utilization. + * NOTE: WALT tracks a single CPU signal for all the scheduling + * classes, thus this margin is going to be added to the DL class as + * well, which is something we do not do in sched_freq_tick_pelt case. + */ + cpu_utilization = add_capacity_margin(cpu_utilization); + if (cpu_utilization <= capacity_curr) + return; + + /* + * It is likely that the load is growing so we + * keep the added margin in our request as an + * extra boost. + */ + set_cfs_cpu_capacity(cpu, true, cpu_utilization); + +} +#define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu) +#else +#define _sched_freq_tick(cpu) sched_freq_tick_pelt(cpu) +#endif /* CONFIG_SCHED_WALT */ + static void sched_freq_tick(int cpu) { - struct sched_capacity_reqs *scr; - unsigned long capacity_orig, capacity_curr, capacity_sum; + unsigned long capacity_orig, capacity_curr; if (!sched_freq()) return; @@ -2950,22 +3006,11 @@ static void sched_freq_tick(int cpu) if (capacity_curr == capacity_orig) return; - /* - * To make free room for a task that is building up its "real" - * utilization and to harm its performance the least, request - * a jump to a higher OPP as soon as the margin of free capacity - * is impacted (specified by capacity_margin). 
- */ - - scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - capacity_sum = sum_capacity_reqs(cpu_util(cpu), scr); - if (capacity_curr < capacity_sum) { - set_cfs_cpu_capacity(cpu, true, capacity_sum); - } + _sched_freq_tick(cpu); } #else static inline void sched_freq_tick(int cpu) { } -#endif +#endif /* CONFIG_CPU_FREQ_GOV_SCHED */ /* * This function gets called by the timer code, with HZ frequency. From 740d312ce8aae83956ed1b34b8a4e72b60b04a8f Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 16 Jun 2016 16:33:54 -0700 Subject: [PATCH 0223/3220] FIXUP: sched/fair: Fix hang during suspend in sched_group_energy BUG: 29353986 Change-Id: I0d0d8d5c107a2e0bd219819e036091106bb40e11 --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0ba5653682e8..0b065ff2f4f3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4980,6 +4980,7 @@ static int sched_group_energy(struct energy_env *eenv) } while (sg = sg->next, sg != sd->groups); } next_cpu: + cpumask_clear_cpu(cpu, &visit_cpus); continue; } From 8935b6b4d230ae7969ccdfaf7baf09d818f88c8b Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 4 Jul 2016 15:04:45 +0100 Subject: [PATCH 0224/3220] FIXUP: sched: Fix double-release of spinlock in move_queued_task BUG: 29519455 Change-Id: I4d1c27a1b4bcbba03d4b175d170cfe1701a90ffd --- kernel/sched/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3111d74a5a85..4e2e44e3cc7b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1832,7 +1832,8 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { - raw_spin_unlock(&busiest->lock); + if (this_rq != busiest) + raw_spin_unlock(&busiest->lock); lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); } From 23ed57dbcc14871af54db667d460aba64eaf6efe Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 25 Jul 2016 15:13:58 +0100 Subject: [PATCH 0225/3220] arch_timer: add error handling when the MPM global timer is cleared Bug: 29000863 Signed-off-by: albert.zl_huang Change-Id: I2b5a28b0a9edb31bdaa1ca2310397dd2f36f6c23 Updated to use arch_timer_read_counter() as arch_counter_get_cntvct doesn't exist in this kernel. Signed-off-by: Chris Redpath --- kernel/sched/walt.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index b9ae8d5c4393..d9d09914ce30 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -185,7 +185,14 @@ update_window_start(struct rq *rq, u64 wallclock) int nr_windows; delta = wallclock - rq->window_start; - BUG_ON(delta < 0); + /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */ + if (delta < 0) { + if (arch_timer_read_counter() == 0) + delta = 0; + else + BUG_ON(1); + } + if (delta < walt_ravg_window) return; From d4cda03828f5c8eae35efcb08f520f8f1a35950e Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 13 Jul 2016 16:13:47 -0700 Subject: [PATCH 0226/3220] sched: use util instead of capacity to select busy cpu If cpus are busy, the cpu selection algorithm was favoring cpus with lower capacity. This can result in uneven packing since there will be a bias toward the same cpu until there is a capacity change. 
Instead, use the utilization so there is immediate feedback as tasks
are assigned.

BUG: 30115868
Change-Id: I0ac7ae3ab5d8f2f5a5838c29bb6da2c3e8ef44e8
---
 kernel/sched/fair.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0b065ff2f4f3..7b6e95aa7360 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5593,7 +5593,7 @@ static inline int find_best_target(struct task_struct *p, bool boosted)
 {
 	int iter_cpu;
 	int target_cpu = -1;
-	int target_capacity = 0;
+	int target_util = 0;
 	int backup_capacity = 0;
 	int best_idle_cpu = -1;
 	int best_idle_cstate = INT_MAX;
@@ -5649,10 +5649,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted)
 
 		if (new_util < cur_capacity) {
 			if (cpu_rq(i)->nr_running) {
-				if (target_capacity == 0 ||
-					target_capacity > cur_capacity) {
+				if (target_util == 0 ||
+					target_util > new_util) {
 					target_cpu = i;
-					target_capacity = cur_capacity;
+					target_util = new_util;
 				}
 			} else if (!boosted) {
 				if (best_idle_cpu < 0 ||

From c5a00c2dad8d161da3c2086cccd6375d8ad5b04f Mon Sep 17 00:00:00 2001
From: Srinath Sridharan
Date: Thu, 14 Jul 2016 13:09:03 -0700
Subject: [PATCH 0227/3220] sched/tune: Introducing a new schedtune attribute prefer_idle

Hint to enable biasing of tasks towards idle cpus, even when a given
task is negatively boosted. The mechanism allows up to 20% reduction in
camera power without hurting performance.

bug: 28312446
Change-Id: I97ea5671aa1e6bcb165408b41e17bc82e41c2c9e
---
 kernel/sched/fair.c | 23 +++++++++++++----------
 kernel/sched/tune.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/tune.h |  2 ++
 3 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7b6e95aa7360..e099ce747345 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5589,7 +5589,7 @@ done:
 	return target;
 }
 
-static inline int find_best_target(struct task_struct *p, bool boosted)
+static inline int find_best_target(struct task_struct *p, bool prefer_idle)
 {
 	int iter_cpu;
 	int target_cpu = -1;
@@ -5607,9 +5607,9 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle)
 		int idle_idx;
 
 		/*
-		 * favor higher cpus for boosted tasks
+		 * favor higher cpus for tasks that prefer idle cores
 		 */
-		int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
+		int i = prefer_idle ? NR_CPUS-iter_cpu-1 : iter_cpu;
 
 		if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
 			continue;
@@ -5634,10 +5634,10 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle)
 			continue;
 #endif
 		/*
-		 * For boosted tasks we favor idle cpus unconditionally to
+		 * Unconditionally favoring tasks that prefer idle cpus to
 		 * improve latency.
 		 */
-		if (idle_cpu(i) && boosted) {
+		if (idle_cpu(i) && prefer_idle) {
 			if (best_idle_cpu < 0)
 				best_idle_cpu = i;
 			continue;
@@ -5654,7 +5654,7 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle)
 					target_cpu = i;
 					target_util = new_util;
 				}
-			} else if (!boosted) {
+			} else if (!prefer_idle) {
 				if (best_idle_cpu < 0 ||
 					(sysctl_sched_cstate_aware &&
 						best_idle_cstate > idle_idx)) {
@@ -5669,7 +5669,7 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle)
 		}
 	}
 
-	if (boosted && best_idle_cpu >= 0)
+	if (prefer_idle && best_idle_cpu >= 0)
 		target_cpu = best_idle_cpu;
 	else if (target_cpu < 0)
 		target_cpu = best_idle_cpu >= 0 ?
best_idle_cpu : backup_cpu; @@ -5761,14 +5761,17 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) */ #ifdef CONFIG_CGROUP_SCHEDTUNE bool boosted = schedtune_task_boost(p) > 0; + bool prefer_idle = schedtune_prefer_idle(p) > 0; #else bool boosted = 0; + bool prefer_idle = 0; #endif - int tmp_target = find_best_target(p, boosted); - if (tmp_target >= 0) + int tmp_target = find_best_target(p, boosted || prefer_idle); + if (tmp_target >= 0) { target_cpu = tmp_target; - if (boosted && idle_cpu(target_cpu)) + if ((boosted || prefer_idle) && idle_cpu(target_cpu)) return target_cpu; + } } if (target_cpu != task_cpu(p)) { diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index d24f365b0c90..644f8e9ee96f 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -125,6 +125,10 @@ struct schedtune { /* Performance Constraint (C) region threshold params */ int perf_constrain_idx; + + /* Hint to bias scheduling of tasks on that SchedTune CGroup + * towards idle CPUs */ + int prefer_idle; }; static inline struct schedtune *css_st(struct cgroup_subsys_state *css) @@ -156,6 +160,7 @@ root_schedtune = { .boost = 0, .perf_boost_idx = 0, .perf_constrain_idx = 0, + .prefer_idle = 0, }; int @@ -536,6 +541,38 @@ int schedtune_task_boost(struct task_struct *p) return task_boost; } +int schedtune_prefer_idle(struct task_struct *p) +{ + struct schedtune *st; + int prefer_idle; + + /* Get prefer_idle value */ + rcu_read_lock(); + st = task_schedtune(p); + prefer_idle = st->prefer_idle; + rcu_read_unlock(); + + return prefer_idle; +} + +static u64 +prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->prefer_idle; +} + +static int +prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 prefer_idle) +{ + struct schedtune *st = css_st(css); + st->prefer_idle = prefer_idle; + + return 0; +} + static s64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -587,6 +624,11 @@ static struct cftype files[] = { .read_s64 = boost_read, .write_s64 = boost_write, }, + { + .name = "prefer_idle", + .read_u64 = prefer_idle_read, + .write_u64 = prefer_idle_write, + }, { } /* terminate */ }; diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index be1785eb1c5b..4f6441771e4c 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -17,6 +17,8 @@ struct target_nrg { int schedtune_cpu_boost(int cpu); int schedtune_task_boost(struct task_struct *tsk); +int schedtune_prefer_idle(struct task_struct *tsk); + void schedtune_exit_task(struct task_struct *tsk); void schedtune_enqueue_task(struct task_struct *p, int cpu); From 93db70f21c7bf04dfe5bb3345400da8f97c3ad29 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 10 Feb 2016 09:24:36 +0000 Subject: [PATCH 0228/3220] DEBUG: sched: add tracepoint for RD overutilized Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 20 ++++++++++++++++++++ kernel/sched/fair.c | 17 +++++++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 33b1c719c9d7..8006e3a46723 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -888,6 +888,26 @@ TRACE_EVENT(sched_tune_filter, __entry->payoff, __entry->region) ); +/* + * Tracepoint for system overutilized flag + */ +TRACE_EVENT(sched_overutilized, + + TP_PROTO(bool overutilized), + + TP_ARGS(overutilized), + + TP_STRUCT__entry( + __field( bool, overutilized ) + ), + + 
TP_fast_assign(
+		__entry->overutilized	= overutilized;
+	),
+
+	TP_printk("overutilized=%d",
+		__entry->overutilized ? 1 : 0)
+);
 #ifdef CONFIG_SCHED_WALT
 struct rq;

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e099ce747345..b84277a8530d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4242,8 +4242,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se) {
 		walt_inc_cumulative_runnable_avg(rq, p);
 		if (!task_new && !rq->rd->overutilized &&
-		    cpu_overutilized(rq->cpu))
+		    cpu_overutilized(rq->cpu)) {
 			rq->rd->overutilized = true;
+			trace_sched_overutilized(true);
+		}
 
 		/*
 		 * We want to potentially trigger a freq switch
@@ -7503,12 +7505,17 @@ next_group:
 		env->dst_rq->rd->overload = overload;
 
 		/* Update over-utilization (tipping point, U >= 0) indicator */
-		if (env->dst_rq->rd->overutilized != overutilized)
+		if (env->dst_rq->rd->overutilized != overutilized) {
 			env->dst_rq->rd->overutilized = overutilized;
+			trace_sched_overutilized(overutilized);
+		}
 	} else {
-		if (!env->dst_rq->rd->overutilized && overutilized)
+		if (!env->dst_rq->rd->overutilized && overutilized) {
 			env->dst_rq->rd->overutilized = true;
+			trace_sched_overutilized(true);
+		}
 	}
+
 }
 
 /**
@@ -8948,8 +8955,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		task_tick_numa(rq, curr);
 
 #ifdef CONFIG_SMP
-	if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
+	if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
 		rq->rd->overutilized = true;
+		trace_sched_overutilized(true);
+	}
 
 	rq->misfit_task = !task_fits_max(curr, rq->cpu);
 #endif

From b9534b8f0128efdb00865af03106a45ce9d489cb Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Fri, 29 Jul 2016 16:09:03 +0100
Subject: [PATCH 0229/3220] FIXUP: sched/tune: do initialization as a postcore_initcall

SchedTune needs to walk the scheduling domains to compute the energy
normalization constants used for PE space filtering. To build such
constants we need the energy model data for each CPU in the system.

However, if we walk the SDs at the late initcall stage, userspace has
already been initialized and it can happen that some CPUs are
hotplugged out. For example, this can happen if a user-space thermal
manager daemon detects that CPUs are too hot during the boot process.

To avoid such a race condition we can move the SchedTune initialization
code earlier, to a postcore_initcall. This allows us to keep the
SchedTune initialization code as simple as an initcall while still
safely relying on the data the SDs provide. Such calls are executed
before user-space is initialized and thus, apart from the case of
unlucky early-init kernel-space-generated hotplugs, this solution
should be safe enough to get all the data we need.
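The ordering guarantee this relies on can be sketched with two stub initcalls. This is an illustrative fragment for a built-in kernel object, not part of the patch; the function names are invented. postcore_initcall() functions run at one of the earliest initcall levels, well before late_initcall() and before init (and therefore before any hotplug-triggering daemon) can run:

#include <linux/init.h>
#include <linux/printk.h>

static int __init sketch_postcore_setup(void)
{
	/* Runs early: userspace does not exist yet, so no hotplug races. */
	pr_info("postcore initcall\n");
	return 0;
}
postcore_initcall(sketch_postcore_setup);

static int __init sketch_late_setup(void)
{
	/* Runs last among initcalls, potentially racing early userspace. */
	pr_info("late initcall\n");
	return 0;
}
late_initcall(sketch_late_setup);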
Signed-off-by: Patrick Bellasi
[jstultz: fwdported to 4.4]
Signed-off-by: John Stultz
---
 kernel/sched/tune.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index 644f8e9ee96f..bd7f319ce53e 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -946,4 +946,4 @@ nodata:
 	rcu_read_unlock();
 	return -EINVAL;
 }
-late_initcall(schedtune_init);
+postcore_initcall(schedtune_init);

From c80a9af21aece64d3262900c78bdf39f2fdd6c2e Mon Sep 17 00:00:00 2001
From: Srinath Sridharan
Date: Tue, 2 Aug 2016 14:05:46 -0700
Subject: [PATCH 0230/3220] sched/fair: Picking cpus with low OPPs for tasks that prefer idle CPUs

When idle cpus cannot be found for Top-app/FG tasks, the cpu selection
algorithm picks a cpu with the lowest OPP amongst the busy cpus as a
second choice. This mitigates the "runnable" time for ui and render
threads.

bug: 30481949
bug: 30342017
bug: 30508678

Change-Id: I5a97e31d33284895c0fa6f6942102713ee576d77
---
 kernel/sched/fair.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b84277a8530d..b7b1f2759278 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5651,10 +5651,22 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle)
 
 		if (new_util < cur_capacity) {
 			if (cpu_rq(i)->nr_running) {
-				if (target_util == 0 ||
-					target_util > new_util) {
-					target_cpu = i;
-					target_util = new_util;
+				if (prefer_idle) {
+					// Find a target cpu with highest
+					// utilization.
+					if (target_util == 0 ||
+						target_util < new_util) {
+						target_cpu = i;
+						target_util = new_util;
+					}
+				} else {
+					// Find a target cpu with lowest
+					// utilization.
+					if (target_util == 0 ||
+						target_util > new_util) {
+						target_cpu = i;
+						target_util = new_util;
+					}
 				}
 			} else if (!prefer_idle) {
 				if (best_idle_cpu < 0 ||
@@ -5666,6 +5678,7 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle)
 				}
 			} else if (backup_capacity == 0 ||
 					backup_capacity > cur_capacity) {
+				// Find a backup cpu with least capacity.
 				backup_capacity = cur_capacity;
 				backup_cpu = i;
 			}

From bf93a3672c5e4c1b803958dc6c038f0b163df5cf Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Thu, 4 Aug 2016 12:20:04 +0100
Subject: [PATCH 0231/3220] sched/cpufreq_sched: fix thermal capping events

cpufreq_sched_limits (called when a CPUFREQ_GOV_LIMITS event happens)
bails out if policy->rwsem is already locked. However, that rwsem is
always guaranteed to be locked when we get here after a thermal
throttling event happens:

 th_throttling ->
   cpufreq_update_policy()
     ...
     down_write(&policy->rwsem);
     ...
     cpufreq_set_policy() ->
       ...
       __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); ->
         cpufreq_sched_limits()
         ...
         if (!down_write_trylock(&policy->rwsem))
                 return; <-- BAIL OUT!

So, we don't currently react immediately to thermal capping events
(even if the reaction is still quick in practice, ~1ms, as lots of
events are likely to trigger a frequency selection on a highly loaded
system).

Fix this bug by removing the bail-out condition. While we are at it we
also slightly change the handling of the new limits by clamping the
last requested_freq between the policy's max and min. Doing so gives us
the opportunity to correctly restore the last requested frequency as
soon as a thermal unthrottling event happens.
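The clamp-and-restore behaviour described above is easy to model in user space. The sketch below is illustrative only: it uses invented frequency values and a local helper instead of the kernel's clamp() macro:

#include <stdio.h>

static unsigned int clamp_freq(unsigned int req, unsigned int min,
			       unsigned int max)
{
	return req < min ? min : (req > max ? max : req);
}

int main(void)
{
	unsigned int requested = 1800000;		/* last request, kHz */
	unsigned int pmin = 300000, pmax = 1200000;	/* thermal cap */

	/* While throttled, run at the cap... */
	printf("capped:   %u kHz\n", clamp_freq(requested, pmin, pmax));

	/* ...and once the cap is lifted, the old request comes back. */
	pmax = 1900000;
	printf("uncapped: %u kHz\n", clamp_freq(requested, pmin, pmax));
	return 0;
}

Because requested_freq is kept as the unclamped value, no extra bookkeeping is needed to restore it after unthrottling.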
bug: 30481949
Change-Id: I3c13e818f238c1ffa66b34e419e8b87314b57427
Suggested-by: Javi Merino
Signed-off-by: Juri Lelli
Signed-off-by: Srinath Sridharan
[jstultz: fwdported to 4.4]
Signed-off-by: John Stultz
---
 kernel/sched/cpufreq_sched.c | 26 ++++++--------------------
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
index 3f8c67a3ea0f..4fea269a6598 100644
--- a/kernel/sched/cpufreq_sched.c
+++ b/kernel/sched/cpufreq_sched.c
@@ -58,7 +58,6 @@ struct gov_data {
 	struct task_struct *task;
 	struct irq_work irq_work;
 	unsigned int requested_freq;
-	int max;
 };
 
 static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy,
@@ -193,7 +192,7 @@ static void update_fdomain_capacity_request(int cpu)
 	}
 
 	/* Convert the new maximum capacity request into a cpu frequency */
-	freq_new = capacity * gd->max >> SCHED_CAPACITY_SHIFT;
+	freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
 	if (cpufreq_frequency_table_target(policy, policy->freq_table,
 					   freq_new, CPUFREQ_RELATION_L,
 					   &index_new))
@@ -288,8 +287,6 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
 	pr_debug("%s: throttle threshold = %u [ns]\n",
 		 __func__, gd->up_throttle_nsec);
 
-	gd->max = policy->max;
-
 	rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr());
 	if (rc) {
 		pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc);
@@ -352,28 +349,17 @@ static int cpufreq_sched_start(struct cpufreq_policy *policy)
 
 static void cpufreq_sched_limits(struct cpufreq_policy *policy)
 {
-	struct gov_data *gd;
+	unsigned int clamp_freq;
+	struct gov_data *gd = policy->governor_data;
 
 	pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n",
 		 policy->cpu, policy->min, policy->max, policy->cur);
 
-	if (!down_write_trylock(&policy->rwsem))
-		return;
-	/*
-	 * Need to keep track of highest max frequency for
-	 * capacity calculations
-	 */
-	gd = policy->governor_data;
-	if (gd->max < policy->max)
-		gd->max = policy->max;
+	clamp_freq = clamp(gd->requested_freq, policy->min, policy->max);
 
-	if (policy->max < policy->cur)
-		__cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
-	else if (policy->min > policy->cur)
-		__cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);
-
-	up_write(&policy->rwsem);
+	if (policy->cur != clamp_freq)
+		__cpufreq_driver_target(policy, clamp_freq, CPUFREQ_RELATION_L);
 }
 
 static int cpufreq_sched_stop(struct cpufreq_policy *policy)

From 74b4fa8e5cfe7655adb40bc88d6d88f916cee250 Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Fri, 13 May 2016 11:54:04 +0100
Subject: [PATCH 0232/3220] sched/fair: call OPP update when going idle after migration

When a task leaves a rq because it is migrated away, it carries its
utilization with it. In this case an OPP update on the src rq might be
needed. The corresponding update at the dst rq will happen at enqueue
time.

Change-Id: I22754a43760fc8d22a488fe15044af93787ea7a8

sched/fair: Fix uninitialised variable in idle_balance

Compiler warned, looks legit.
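Reduced to its decision logic, the change this patch makes to idle_balance() looks roughly like the condensed sketch below; it is not the full function, and the helpers and fields are the ones used by the diff that follows:

/* Condensed sketch of the changed portion of idle_balance(). */
static int idle_balance_sketch(struct rq *this_rq, int this_cpu)
{
	int pulled_task = 0;
	long removed_util;

	/* Snapshot before update_blocked_averages() consumes it. */
	removed_util = atomic_long_read(&this_rq->cfs.removed_util_avg);
	update_blocked_averages(this_cpu);

	/* ... try to pull a task from a busier CPU ... */

	if (pulled_task) {
		/* The pulled task's enqueue triggers the OPP update. */
	} else if (removed_util) {
		/* Nothing pulled but utilization left: request an update. */
		update_capacity_of(this_cpu);
	}
	return pulled_task;
}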
Signed-off-by: Chris Redpath
---
 kernel/sched/fair.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b7b1f2759278..d9af0384dbb0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8322,6 +8322,7 @@ static int idle_balance(struct rq *this_rq)
 	struct sched_domain *sd;
 	int pulled_task = 0;
 	u64 curr_cost = 0;
+	long removed_util=0;
 
 	idle_enter_fair(this_rq);
 
@@ -8345,6 +8346,17 @@ static int idle_balance(struct rq *this_rq)
 
 	raw_spin_unlock(&this_rq->lock);
 
+	/*
+	 * If removed_util_avg is !0 we most probably migrated some task away
+	 * from this_cpu. In this case we might be willing to trigger an OPP
+	 * update, but we want to do so if we don't find anybody else to pull
+	 * here (we will trigger an OPP update with the pulled task's enqueue
+	 * anyway).
+	 *
+	 * Record removed_util before calling update_blocked_averages, and use
+	 * it below (before returning) to see if an OPP update is required.
+	 */
+	removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg);
 	update_blocked_averages(this_cpu);
 	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
@@ -8409,6 +8421,12 @@ out:
 	if (pulled_task) {
 		idle_exit_fair(this_rq);
 		this_rq->idle_stamp = 0;
+	} else if (removed_util) {
+		/*
+		 * No task pulled and someone has been migrated away.
+		 * Good case to trigger an OPP update.
+		 */
+		update_capacity_of(this_cpu);
 	}
 
 	return pulled_task;

From 142b2acc79777f3cac93144d7ebf9caa53e97f9b Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Thu, 14 Jan 2016 15:21:40 -0800
Subject: [PATCH 0233/3220] vmstat: make vmstat_updater deferrable again and shut down on idle

Currently the vmstat updater is not deferrable as a result of commit
ba4877b9ca51 ("vmstat: do not use deferrable delayed work for
vmstat_update"). This in turn can cause multiple interruptions of the
applications because the vmstat updater may run at any time.

Make vmstat_update deferrable again and provide a function that folds
the differentials when the processor is going to idle mode, thus
addressing the issue of the above commit in a clean way.

Note that the shepherd thread will continue scanning the differentials
from another processor and will reenable the vmstat workers if it
detects any changes.
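The fold-until-quiescent loop at the heart of the new quiet_vmstat() can be modelled in plain C. The per-cpu differentials below are invented for illustration, and fold_diffs() stands in for refresh_cpu_vm_stats(false):

#include <stdio.h>
#include <stdbool.h>

static int deltas[4] = { 3, 0, 7, 1 };	/* invented per-cpu differentials */

/* Model of refresh_cpu_vm_stats(false): fold deltas, report if any. */
static bool fold_diffs(void)
{
	bool changed = false;

	for (int i = 0; i < 4; i++) {
		if (deltas[i]) {
			deltas[i] = 0;	/* folded into the global counter */
			changed = true;
		}
	}
	return changed;
}

int main(void)
{
	int passes = 0;

	do {
		passes++;
	} while (fold_diffs());	/* loop until a pass finds nothing left */

	printf("quiesced after %d passes\n", passes);
	return 0;
}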
Change-Id: Idf256cfacb40b4dc8dbb6795cf06b34e8fec7a06 Fixes: ba4877b9ca51 ("vmstat: do not use deferrable delayed work for vmstat_update") Signed-off-by: Christoph Lameter Cc: Michal Hocko Cc: Johannes Weiner Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Git-repo: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git Git-commit: 0eb77e9880321915322d42913c3b53241739c8aa [shashim@codeaurora.org: resolve minor merge conflicts] Signed-off-by: Shiraz Hashim [jstultz: fwdport to 4.4] Signed-off-by: John Stultz --- include/linux/vmstat.h | 2 ++ kernel/sched/idle.c | 1 + mm/vmstat.c | 69 +++++++++++++++++++++++++++--------------- 3 files changed, 47 insertions(+), 25 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 3e5d9075960f..73fae8c4a5fb 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -189,6 +189,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); +void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); @@ -249,6 +250,7 @@ static inline void __dec_zone_page_state(struct page *page, static inline void refresh_zone_stat_thresholds(void) { } static inline void cpu_vm_stats_fold(int cpu) { } +static inline void quiet_vmstat(void) { } static inline void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) { } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cbc130efbc5b..917c94abf5bb 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -220,6 +220,7 @@ static void cpu_idle_loop(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { diff --git a/mm/vmstat.c b/mm/vmstat.c index c54fd2924f25..83a003bc3cae 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -460,7 +460,7 @@ static int fold_diff(int *diff) * * The function returns the number of global counters updated. */ -static int refresh_cpu_vm_stats(void) +static int refresh_cpu_vm_stats(bool do_pagesets) { struct zone *zone; int i; @@ -484,33 +484,35 @@ static int refresh_cpu_vm_stats(void) #endif } } - cond_resched(); #ifdef CONFIG_NUMA - /* - * Deal with draining the remote pageset of this - * processor - * - * Check if there are pages remaining in this pageset - * if not then there is nothing to expire. - */ - if (!__this_cpu_read(p->expire) || + if (do_pagesets) { + cond_resched(); + /* + * Deal with draining the remote pageset of this + * processor + * + * Check if there are pages remaining in this pageset + * if not then there is nothing to expire. + */ + if (!__this_cpu_read(p->expire) || !__this_cpu_read(p->pcp.count)) - continue; + continue; - /* - * We never drain zones local to this processor. - */ - if (zone_to_nid(zone) == numa_node_id()) { - __this_cpu_write(p->expire, 0); - continue; - } + /* + * We never drain zones local to this processor. 
+ */ + if (zone_to_nid(zone) == numa_node_id()) { + __this_cpu_write(p->expire, 0); + continue; + } - if (__this_cpu_dec_return(p->expire)) - continue; + if (__this_cpu_dec_return(p->expire)) + continue; - if (__this_cpu_read(p->pcp.count)) { - drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); - changes++; + if (__this_cpu_read(p->pcp.count)) { + drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); + changes++; + } } #endif } @@ -1386,7 +1388,7 @@ static cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats()) { + if (refresh_cpu_vm_stats(true)) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the @@ -1417,6 +1419,23 @@ static void vmstat_update(struct work_struct *w) } } +/* + * Switch off vmstat processing and then fold all the remaining differentials + * until the diffs stay at zero. The function is used by NOHZ and can only be + * invoked when tick processing is not active. + */ +void quiet_vmstat(void) +{ + if (system_state != SYSTEM_RUNNING) + return; + + do { + if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) + cancel_delayed_work(this_cpu_ptr(&vmstat_work)); + + } while (refresh_cpu_vm_stats(false)); +} + /* * Check if the diffs for a certain cpu indicate that * an update is needed. @@ -1449,7 +1468,7 @@ static bool need_update(int cpu) */ static void vmstat_shepherd(struct work_struct *w); -static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); +static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd); static void vmstat_shepherd(struct work_struct *w) { From 07ec7db165f1990ca563e18888e82991eac1c5e5 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Fri, 29 Jul 2016 17:50:11 +0100 Subject: [PATCH 0234/3220] sched/fair: Favor higher cpus only for boosted tasks This CL separates the notion of boost and prefer_idle schedtune attributes in cpu selection. Today only top-app tasks are boosted. The CPU selection is slightly tweaked such that higher order cpus are preferred only for boosted tasks (top-app) and the rest would be skewed towards lower order cpus. This avoids starvation issues for fg tasks when interacting with high priority top-app tasks (a problem often seen in the case of system_server). bug: 30245369 bug: 30292998 Change-Id: I0377e00893b9f6586eec55632a265518fd2fa8a1 Conflicts: kernel/sched/fair.c --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d9af0384dbb0..da9323b9ecab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5591,7 +5591,7 @@ done: return target; } -static inline int find_best_target(struct task_struct *p, bool prefer_idle) +static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) { int iter_cpu; int target_cpu = -1; @@ -5609,9 +5609,9 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle) int idle_idx; /* - * favor higher cpus for tasks that prefer idle cores + * Iterate from higher cpus for boosted tasks. */ - int i = prefer_idle ? NR_CPUS-iter_cpu-1 : iter_cpu; + int i = boosted ? 
NR_CPUS-iter_cpu-1 : iter_cpu; if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) continue; @@ -5781,7 +5781,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) bool boosted = 0; bool prefer_idle = 0; #endif - int tmp_target = find_best_target(p, boosted || prefer_idle); + int tmp_target = find_best_target(p, boosted, prefer_idle); if (tmp_target >= 0) { target_cpu = tmp_target; if ((boosted || prefer_idle) && idle_cpu(target_cpu)) From 3276c3d7bca25f229f396cc182d1af29366ab109 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2015 17:43:34 +0200 Subject: [PATCH 0235/3220] UPSTREAM: sched: Fix a race between __kthread_bind() and sched_setaffinity() Because sched_setscheduler() checks p->flags & PF_NO_SETAFFINITY without locks, a caller might observe an old value and race with the set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo it: __kthread_bind() do_set_cpus_allowed() sched_setaffinity() if (p->flags & PF_NO_SETAFFINITIY) set_cpus_allowed_ptr() p->flags |= PF_NO_SETAFFINITY Fix the bug by putting everything under the regular scheduler locks. This also closes a hole in the serialization of task_struct::{nr_,}cpus_allowed. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dedekind1@gmail.com Cc: juri.lelli@arm.com Cc: mgorman@suse.de Cc: riel@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20150515154833.545640346@infradead.org Signed-off-by: Ingo Molnar (cherry picked from commit 25834c73f93af7f0712c98ca4593691592e6b360) Signed-off-by: Punit Agrawal BUG=chrome-os-partner:44828 TEST=Boot kernel on Oak. TEST=smaug-release and strago-release trybots. Change-Id: Id3c898c5ee1a22ed704e83f2ecf5f78199280d38 Reviewed-on: https://chromium-review.googlesource.com/321264 Commit-Ready: Ricky Liang Tested-by: Ricky Liang Reviewed-by: Ricky Liang Conflicts: kernel/sched/core.c --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 02743ffe7ff3..40c8e8d1e07f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5157,6 +5157,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_lock(&rq->lock); __sched_fork(0, idle); + idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); From 989f33f789a85cc53f4c3f75e31ddfce65c6d873 Mon Sep 17 00:00:00 2001 From: Matt Wagantall Date: Tue, 17 Jun 2014 21:43:35 -0700 Subject: [PATCH 0236/3220] sched/rt: print RT tasks when RT throttling is activated Existing debug prints do not provide any clues about which tasks may have triggered RT throttling. Print the names and PIDs of all tasks on the throttled rt_rq to help narrow down the source of the problem. 
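The dump function in the patch below builds its report with the common bounded-snprintf pattern: keep a write cursor, pass the remaining space on each call, and stop appending once the buffer is full. A standalone model, with invented task names and PIDs:

#include <stdio.h>

int main(void)
{
	char buf[64];
	char *pos = buf;
	char *end = buf + sizeof(buf);
	const char *tasks[] = { "app_main", "render", "audio" };

	pos += snprintf(pos, end - pos, "potential CPU hogs:\n");
	for (int i = 0; i < 3; i++) {
		/* guard keeps end - pos from going negative on overflow */
		if (pos < end)
			pos += snprintf(pos, end - pos, "\t%s (%d)\n",
					tasks[i], 1000 + i);
	}
	printf("%s", buf);
	return 0;
}

Because snprintf() returns the length it would have written, pos can legitimately run past end after truncation, which is exactly why the pos < end check precedes every append.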
Change-Id: I180534c8a647254ed38e89d0c981a8f8bccd741c Signed-off-by: Matt Wagantall [rameezmustafa@codeaurora.org]: Port to msm-3.18] Signed-off-by: Syed Rameez Mustafa --- kernel/sched/rt.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index be700bfa1ae4..2b9121ea91bf 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -891,6 +891,42 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) return rt_task_of(rt_se)->prio; } +static void dump_throttled_rt_tasks(struct rt_rq *rt_rq) +{ + struct rt_prio_array *array = &rt_rq->active; + struct sched_rt_entity *rt_se; + char buf[500]; + char *pos = buf; + char *end = buf + sizeof(buf); + int idx; + + pos += snprintf(pos, sizeof(buf), + "sched: RT throttling activated for rt_rq %p (cpu %d)\n", + rt_rq, cpu_of(rq_of_rt_rq(rt_rq))); + + if (bitmap_empty(array->bitmap, MAX_RT_PRIO)) + goto out; + + pos += snprintf(pos, end - pos, "potential CPU hogs:\n"); + idx = sched_find_first_bit(array->bitmap); + while (idx < MAX_RT_PRIO) { + list_for_each_entry(rt_se, array->queue + idx, run_list) { + struct task_struct *p; + + if (!rt_entity_is_task(rt_se)) + continue; + + p = rt_task_of(rt_se); + if (pos < end) + pos += snprintf(pos, end - pos, "\t%s (%d)\n", + p->comm, p->pid); + } + idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1); + } +out: + printk_deferred("%s", buf); +} + static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); @@ -914,8 +950,14 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) * but accrue some time due to boosting. */ if (likely(rt_b->rt_runtime)) { + static bool once = false; + rt_rq->rt_throttled = 1; - printk_deferred_once("sched: RT throttling activated\n"); + + if (!once) { + once = true; + dump_throttled_rt_tasks(rt_rq); + } } else { /* * In case we did anyway, make it go away, From 2a4445395f1be46db1200e1329fae06d034675e5 Mon Sep 17 00:00:00 2001 From: Matt Wagantall Date: Thu, 19 Jun 2014 14:23:33 -0700 Subject: [PATCH 0237/3220] sched/rt: Add Kconfig option to enable panicking for RT throttling This may be useful for detecting and debugging RT throttling issues. Change-Id: I5807a897d11997d76421c1fcaa2918aad988c6c9 Signed-off-by: Matt Wagantall [rameezmustafa@codeaurora.org]: Port to msm-3.18] Signed-off-by: Syed Rameez Mustafa [jstultz: forwardported to 4.4] Signed-off-by: John Stultz --- kernel/sched/rt.c | 9 +++++++++ lib/Kconfig.debug | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2b9121ea91bf..8a16cba968c4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -924,7 +924,16 @@ static void dump_throttled_rt_tasks(struct rt_rq *rt_rq) idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1); } out: +#ifdef CONFIG_PANIC_ON_RT_THROTTLING + /* + * Use pr_err() in the BUG() case since printk_sched() will + * not get flushed and deadlock is not a concern. + */ + pr_err("%s", buf); + BUG(); +#else printk_deferred("%s", buf); +#endif } static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8c15b29d5adc..07ccfa681577 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -855,6 +855,15 @@ config SCHED_INFO bool default n +config PANIC_ON_RT_THROTTLING + bool "Panic on RT throttling" + help + Say Y here to enable the kernel to panic when a realtime + runqueue is throttled. 
This may be useful for detecting
+	  and debugging RT throttling issues.
+
+	  Say N if unsure.
+
 config SCHEDSTATS
 	bool "Collect scheduler statistics"
 	depends on DEBUG_KERNEL && PROC_FS

From fb8741a0ba7445503ebe648fd96cccc34d7d2088 Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Mon, 1 Aug 2016 16:49:07 -0700
Subject: [PATCH 0238/3220] FROMLIST: proc: Fix timerslack_ns CAP_SYS_NICE check when adjusting self

In changing from checking ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)
to capable(CAP_SYS_NICE), I missed that ptrace_may_access succeeds when
p == current, but the CAP_SYS_NICE check doesn't.

Thus while the previous commit was intended to loosen the needed
privileges to modify a process's timerslack, it needlessly restricted a
task modifying its own timerslack via the proc//timerslack_ns
(which is permitted also via the PR_SET_TIMERSLACK method).

This patch corrects this by checking if p == current before checking
the CAP_SYS_NICE value.

Cc: Kees Cook
Cc: "Serge E. Hallyn"
Cc: Andrew Morton
Cc: Thomas Gleixner
CC: Arjan van de Ven
Cc: Oren Laadan
Cc: Ruchi Kandoi
Cc: Rom Lemarchand
Cc: Todd Kjos
Cc: Colin Cross
Cc: Nick Kralevich
Cc: Dmitry Shmidt
Cc: Elliott Hughes
Cc: Android Kernel Team
Mailing-list-url: http://www.spinics.net/lists/kernel/msg2317488.html
Change-Id: Ia3e8aff07c2d41f55b6617502d33c39b7d781aac
Signed-off-by: John Stultz
---
 fs/proc/base.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index ac5d2aea63b5..49348349e156 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2260,15 +2260,17 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
 	if (!p)
 		return -ESRCH;
 
-	if (!capable(CAP_SYS_NICE)) {
-		count = -EPERM;
-		goto out;
-	}
+	if (p != current) {
+		if (!capable(CAP_SYS_NICE)) {
+			count = -EPERM;
+			goto out;
+		}
 
-	err = security_task_setscheduler(p);
-	if (err) {
-		count = err;
-		goto out;
+		err = security_task_setscheduler(p);
+		if (err) {
+			count = err;
+			goto out;
+		}
 	}
 
 	task_lock(p);
@@ -2294,14 +2296,16 @@ static int timerslack_ns_show(struct seq_file *m, void *v)
 	if (!p)
 		return -ESRCH;
 
-	if (!capable(CAP_SYS_NICE)) {
-		err = -EPERM;
-		goto out;
-	}
+	if (p != current) {
 
-	err = security_task_getscheduler(p);
-	if (err)
-		goto out;
+		if (!capable(CAP_SYS_NICE)) {
+			err = -EPERM;
+			goto out;
+		}
+		err = security_task_getscheduler(p);
+		if (err)
+			goto out;
+	}
 
 	task_lock(p);
 	seq_printf(m, "%llu\n", p->timer_slack_ns);

From ef58c0a2695707ab8b49368aa77ab7b4828aa1a7 Mon Sep 17 00:00:00 2001
From: Amit Pundir
Date: Fri, 12 Aug 2016 11:24:50 +0530
Subject: [PATCH 0239/3220] ANDROID: net: fib: remove duplicate assignment

Remove duplicate FRA_GOTO assignment.
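Why the duplicate entry was harmless but still wrong: in a C designated initializer, a repeated index silently overrides the earlier one (GCC only complains with -Woverride-init). A toy model with an invented enum:

#include <stdio.h>

enum { ATTR_UNSPEC, ATTR_GOTO, ATTR_MAX };

static const int policy[ATTR_MAX] = {
	[ATTR_GOTO] = 1,
	[ATTR_GOTO] = 2,	/* the later entry silently wins */
};

int main(void)
{
	printf("ATTR_GOTO -> %d\n", policy[ATTR_GOTO]);	/* prints 2 */
	return 0;
}

In the fib_rules.h macro both FRA_GOTO entries were identical, so behaviour does not change; the patch simply drops the dead duplicate.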
Fixes: fd2cf795f3ab ("net: core: Support UID-based routing.")
Change-Id: I462c24b16fdef42ae2332571a0b95de3ef9d2e25
Signed-off-by: Amit Pundir
---
 include/net/fib_rules.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 55b5419cb6a7..bdd985f41022 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -89,7 +89,6 @@ struct fib_rules_ops {
 	[FRA_FWMARK]	= { .type = NLA_U32 }, \
 	[FRA_FWMASK]	= { .type = NLA_U32 }, \
 	[FRA_TABLE]	= { .type = NLA_U32 }, \
-	[FRA_GOTO]	= { .type = NLA_U32 }, \
 	[FRA_UID_START]	= { .type = NLA_U32 }, \
 	[FRA_UID_END]	= { .type = NLA_U32 }, \
 	[FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \

From 0b848391c27294982778254abb2d16723977e839 Mon Sep 17 00:00:00 2001
From: Amit Pundir
Date: Thu, 11 Aug 2016 19:13:22 +0530
Subject: [PATCH 0240/3220] ANDROID: net: core: fix UID-based routing

Fix the RTA_UID enum value to match the Android userspace code, which
assumes RTA_UID=18. With this patch all the Android kernel networking
unit tests mentioned here
https://source.android.com/devices/tech/config/kernel_network_tests.html
pass. Without this patch the multinetwork_test.py unit test fails.

Change-Id: I3ff36670f7d4e5bf5f01dce584ae9d53deabb3ed
Signed-off-by: Amit Pundir
---
 include/uapi/linux/rtnetlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 355eea225dd9..3eb02a1d6d8c 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -306,12 +306,12 @@ enum rtattr_type_t {
 	RTA_TABLE,
 	RTA_MARK,
 	RTA_MFC_STATS,
+	RTA_UID,
 	RTA_VIA,
 	RTA_NEWDST,
 	RTA_PREF,
 	RTA_ENCAP_TYPE,
 	RTA_ENCAP,
-	RTA_UID,
 	__RTA_MAX
 };

From 1da2a42de4b9371d2988ddf6969ce5efd1560270 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney
Date: Tue, 5 Jul 2016 17:32:29 -0400
Subject: [PATCH 0241/3220] UPSTREAM: Revert "ecryptfs: forbid opening files without mmap handler"

(cherry picked from commit 78c4e172412de5d0456dc00d2b34050aa0b683b5)

This reverts upstream commit 2f36db71009304b3f0b95afacd8eba1f9f046b87.

It fixed a local root exploit but also introduced a dependency on the
lower file system implementing an mmap operation just to open a file,
which is a bit of a heavy hammer. The right fix is to have mmap depend
on the existence of the mmap handler instead.

Signed-off-by: Jeff Mahoney
Cc: stable@vger.kernel.org
Signed-off-by: Tyler Hicks
Fixes: Change-Id I0be77c7f8bd3046bc34cd87ef577529792d479bc ("UPSTREAM:
 ecryptfs: forbid opening files without mmap handler")
Change-Id: Ib9bc87099f7f89e4e12dbc1a79e884dcadb1befb
Signed-off-by: Amit Pundir
---
 fs/ecryptfs/kthread.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index e818f5ac7a26..866bb18efefe 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -25,7 +25,6 @@
 #include
 #include
 #include
-#include
 #include "ecryptfs_kernel.h"
 
 struct ecryptfs_open_req {
@@ -148,7 +147,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
 	flags |= IS_RDONLY(d_inode(lower_dentry)) ?
O_RDONLY : O_RDWR; (*lower_file) = dentry_open(&req.path, flags, cred); if (!IS_ERR(*lower_file)) - goto have_file; + goto out; if ((flags & O_ACCMODE) == O_RDONLY) { rc = PTR_ERR((*lower_file)); goto out; @@ -166,16 +165,8 @@ int ecryptfs_privileged_open(struct file **lower_file, mutex_unlock(&ecryptfs_kthread_ctl.mux); wake_up(&ecryptfs_kthread_ctl.wait); wait_for_completion(&req.done); - if (IS_ERR(*lower_file)) { + if (IS_ERR(*lower_file)) rc = PTR_ERR(*lower_file); - goto out; - } -have_file: - if ((*lower_file)->f_op->mmap == NULL) { - fput(*lower_file); - *lower_file = NULL; - rc = -EMEDIUMTYPE; - } out: return rc; } From 7997255b0d0ffed60ad402c5022eacf0161dcdc0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 5 Jul 2016 17:32:30 -0400 Subject: [PATCH 0242/3220] UPSTREAM: ecryptfs: don't allow mmap when the lower fs doesn't support it (cherry picked from commit f0fe970df3838c202ef6c07a4c2b36838ef0a88b) There are legitimate reasons to disallow mmap on certain files, notably in sysfs or procfs. We shouldn't emulate mmap support on file systems that don't offer support natively. CVE-2016-1583 Signed-off-by: Jeff Mahoney Cc: stable@vger.kernel.org [tyhicks: clean up f_op check by using ecryptfs_file_to_lower()] Signed-off-by: Tyler Hicks Change-Id: I66e3670771630a25b0608f10019d1584e9ce73a6 Signed-off-by: Amit Pundir --- fs/ecryptfs/file.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index feef8a9c4de7..11309683d65f 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -170,6 +170,19 @@ out: return rc; } +static int ecryptfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file *lower_file = ecryptfs_file_to_lower(file); + /* + * Don't allow mmap on top of file systems that don't support it + * natively. If FILESYSTEM_MAX_STACK_DEPTH > 2 or ecryptfs + * allows recursive mounting, this will need to be extended. + */ + if (!lower_file->f_op->mmap) + return -ENODEV; + return generic_file_mmap(file, vma); +} + /** * ecryptfs_open * @inode: inode speciying file to open @@ -364,7 +377,7 @@ const struct file_operations ecryptfs_main_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ecryptfs_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, From 4804692cb18d09c49ecfce3eef2969a69859e413 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 4 May 2016 14:04:13 -0400 Subject: [PATCH 0243/3220] UPSTREAM: ecryptfs: fix handling of directory opening (cherry picked from commit 6a480a7842545ec520a91730209ec0bae41694c1) First of all, trying to open them r/w is idiocy; it's guaranteed to fail. Moreover, assigning ->f_pos and assuming that everything will work is blatantly broken - try that with e.g. tmpfs as underlying layer and watch the fireworks. There may be a non-trivial amount of state associated with current IO position, well beyond the numeric offset. Using the single struct file associated with underlying inode is really not a good idea; we ought to open one for each ecryptfs directory struct file. Additionally, file_operations both for directories and non-directories are full of pointless methods; non-directories should *not* have ->iterate(), directories should not have ->flush(), ->fasync() and ->splice_read(). 
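A user-space analogy for the position problem called out above: when two logical readers share one open directory handle, the read position lives in the shared handle and each reader steals entries from the other. This is an illustrative POSIX sketch, not kernel code:

#include <stdio.h>
#include <dirent.h>

int main(void)
{
	DIR *shared = opendir(".");	/* one handle, two logical readers */
	struct dirent *e;

	if (!shared)
		return 1;

	e = readdir(shared);		/* "reader A" consumes entry 0 */
	if (e)
		printf("A sees: %s\n", e->d_name);

	e = readdir(shared);		/* "reader B" gets entry 1, not 0 */
	if (e)
		printf("B sees: %s\n", e->d_name);

	closedir(shared);
	return 0;
}

Opening a separate lower file per ecryptfs directory file gives each reader its own position, plus whatever other iteration state the lower filesystem keeps.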
Signed-off-by: Al Viro Change-Id: I4813ce803f270fdd364758ce1dc108b76eab226e Signed-off-by: Amit Pundir --- fs/ecryptfs/file.c | 71 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 11309683d65f..27794b137b24 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -112,7 +112,6 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) .sb = inode->i_sb, }; lower_file = ecryptfs_file_to_lower(file); - lower_file->f_pos = ctx->pos; rc = iterate_dir(lower_file, &buf.ctx); ctx->pos = buf.ctx.pos; if (rc < 0) @@ -236,14 +235,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); - if (d_is_dir(ecryptfs_dentry)) { - ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); - mutex_lock(&crypt_stat->cs_mutex); - crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - mutex_unlock(&crypt_stat->cs_mutex); - rc = 0; - goto out; - } rc = read_or_initialize_metadata(ecryptfs_dentry); if (rc) goto out_put; @@ -260,6 +251,45 @@ out: return rc; } +/** + * ecryptfs_dir_open + * @inode: inode speciying file to open + * @file: Structure to return filled in + * + * Opens the file specified by inode. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_dir_open(struct inode *inode, struct file *file) +{ + struct dentry *ecryptfs_dentry = file->f_path.dentry; + /* Private value of ecryptfs_dentry allocated in + * ecryptfs_lookup() */ + struct ecryptfs_file_info *file_info; + struct file *lower_file; + + /* Released in ecryptfs_release or end of function if failure */ + file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); + ecryptfs_set_file_private(file, file_info); + if (unlikely(!file_info)) { + ecryptfs_printk(KERN_ERR, + "Error attempting to allocate memory\n"); + return -ENOMEM; + } + lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry), + file->f_flags, current_cred()); + if (IS_ERR(lower_file)) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%pd]; rc = [%ld]\n", __func__, + ecryptfs_dentry, PTR_ERR(lower_file)); + kmem_cache_free(ecryptfs_file_info_cache, file_info); + return PTR_ERR(lower_file); + } + ecryptfs_set_file_lower(file, lower_file); + return 0; +} + static int ecryptfs_flush(struct file *file, fl_owner_t td) { struct file *lower_file = ecryptfs_file_to_lower(file); @@ -280,6 +310,19 @@ static int ecryptfs_release(struct inode *inode, struct file *file) return 0; } +static int ecryptfs_dir_release(struct inode *inode, struct file *file) +{ + fput(ecryptfs_file_to_lower(file)); + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); + return 0; +} + +static loff_t ecryptfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return vfs_llseek(ecryptfs_file_to_lower(file), offset, whence); +} + static int ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { @@ -359,20 +402,16 @@ const struct file_operations ecryptfs_dir_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .open = ecryptfs_open, - .flush = ecryptfs_flush, - .release = ecryptfs_release, + .open = ecryptfs_dir_open, + .release = ecryptfs_dir_release, .fsync = ecryptfs_fsync, - .fasync = ecryptfs_fasync, - .splice_read = generic_file_splice_read, - .llseek = default_llseek, + .llseek = ecryptfs_dir_llseek, }; const struct 
file_operations ecryptfs_main_fops = { .llseek = generic_file_llseek, .read_iter = ecryptfs_read_update_atime, .write_iter = generic_file_write_iter, - .iterate = ecryptfs_readdir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, From aa3cda16a57e73e9fc269e1de1eb3c35f3f0ed20 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 9 Aug 2016 12:47:37 -0700 Subject: [PATCH 0244/3220] ANDROID: dm-verity: adopt changes made to dm callbacks v4.4 introduced changes to the callbacks used for dm-linear and dm-verity-target targets. Move to those headers in dm-android-verity. Verified on hikey while having BOARD_USES_RECOVERY_AS_BOOT := true BOARD_BUILD_SYSTEM_ROOT_IMAGE := true BUG: 27339727 Signed-off-by: Badhri Jagan Sridharan Change-Id: Ic64950c3b55f0a6eaa570bcedc2ace83bbf3005e --- drivers/md/dm-android-verity.c | 12 +++++------- drivers/md/dm-android-verity.h | 6 ++---- drivers/md/dm-linear.c | 2 +- drivers/md/dm-verity-target.c | 2 +- drivers/md/dm-verity.h | 6 ++---- 5 files changed, 11 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 1f4eb099209d..15ce2a81c1f4 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -59,8 +59,7 @@ static struct target_type android_verity_target = { .dtr = verity_dtr, .map = verity_map, .status = verity_status, - .ioctl = verity_ioctl, - .merge = verity_merge, + .prepare_ioctl = verity_prepare_ioctl, .iterate_devices = verity_iterate_devices, .io_hints = verity_io_hints, }; @@ -637,8 +636,7 @@ static int add_as_linear_device(struct dm_target *ti, char *dev) android_verity_target.dtr = dm_linear_dtr, android_verity_target.map = dm_linear_map, android_verity_target.status = dm_linear_status, - android_verity_target.ioctl = dm_linear_ioctl, - android_verity_target.merge = dm_linear_merge, + android_verity_target.prepare_ioctl = dm_linear_prepare_ioctl, android_verity_target.iterate_devices = dm_linear_iterate_devices, android_verity_target.io_hints = NULL; @@ -676,7 +674,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; - u64 device_size; + u64 uninitialized_var(device_size); if (argc == 1) { /* Use the default keyid */ @@ -896,7 +894,7 @@ static int __init dm_android_verity_init(void) } file = debugfs_create_bool("target_added", S_IRUGO, debug_dir, - (u32 *)&target_added); + &target_added); if (IS_ERR_OR_NULL(file)) { DMERR("Cannot create android_verity debugfs directory: %ld", @@ -906,7 +904,7 @@ static int __init dm_android_verity_init(void) } file = debugfs_create_bool("verity_enabled", S_IRUGO, debug_dir, - (u32 *)&verity_enabled); + &verity_enabled); if (IS_ERR_OR_NULL(file)) { DMERR("Cannot create android_verity debugfs directory: %ld", diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index f43b02fbb475..0c7ff6afec69 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -113,10 +113,8 @@ extern void dm_linear_dtr(struct dm_target *ti); extern int dm_linear_map(struct dm_target *ti, struct bio *bio); extern void dm_linear_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen); -extern int dm_linear_ioctl(struct dm_target *ti, unsigned int cmd, - unsigned long arg); -extern int dm_linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct 
bio_vec *biovec, int max_size); +extern int dm_linear_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev, fmode_t *mode); extern int dm_linear_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data); extern int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 74caca2888a6..8505a771de42 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -116,7 +116,7 @@ void dm_linear_status(struct dm_target *ti, status_type_t type, } } -static int dm_linear_prepare_ioctl(struct dm_target *ti, +int dm_linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) { struct linear_c *lc = (struct linear_c *) ti->private; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 65835f15a116..5214ed2c7507 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -656,7 +656,7 @@ void verity_status(struct dm_target *ti, status_type_t type, } } -static int verity_prepare_ioctl(struct dm_target *ti, +int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) { struct dm_verity *v = ti->private; diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index d9cf5e4939eb..75effca400a3 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -128,10 +128,8 @@ extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, extern void verity_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen); -extern int verity_ioctl(struct dm_target *ti, unsigned cmd, - unsigned long arg); -extern int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size); +extern int verity_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev, fmode_t *mode); extern int verity_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data); extern void verity_io_hints(struct dm_target *ti, struct queue_limits *limits); From 1a0cf8ace145843e5d2af961078e0ee4eccf3588 Mon Sep 17 00:00:00 2001 From: Winter Wang Date: Wed, 27 Jul 2016 10:03:19 +0800 Subject: [PATCH 0245/3220] UPSTREAM: usb: gadget: configfs: add mutex lock before unregister gadget There may be a race condition if f_fs calls unregister_gadget_item in ffs_closed() when unregister_gadget is called by UDC store at the same time. this leads to a kernel NULL pointer dereference: [ 310.644928] Unable to handle kernel NULL pointer dereference at virtual address 00000004 [ 310.645053] init: Service 'adbd' is being killed... 
[ 310.658938] pgd = c9528000 [ 310.662515] [00000004] *pgd=19451831, *pte=00000000, *ppte=00000000 [ 310.669702] Internal error: Oops: 817 [#1] PREEMPT SMP ARM [ 310.675211] Modules linked in: [ 310.678294] CPU: 0 PID: 1537 Comm: ->transport Not tainted 4.1.15-03725-g793404c #2 [ 310.685958] Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) [ 310.692493] task: c8e24200 ti: c945e000 task.ti: c945e000 [ 310.697911] PC is at usb_gadget_unregister_driver+0xb4/0xd0 [ 310.703502] LR is at __mutex_lock_slowpath+0x10c/0x16c [ 310.708648] pc : [] lr : [] psr: 600f0113 [ 311.565585] [] (usb_gadget_unregister_driver) from [] (unregister_gadget_item+0x1c/0x34) [ 311.575426] [] (unregister_gadget_item) from [] (ffs_closed+0x8c/0x9c) [ 311.583702] [] (ffs_closed) from [] (ffs_data_reset+0xc/0xa0) [ 311.591194] [] (ffs_data_reset) from [] (ffs_data_closed+0x90/0xd0) [ 311.599208] [] (ffs_data_closed) from [] (ffs_ep0_release+0xc/0x14) [ 311.607224] [] (ffs_ep0_release) from [] (__fput+0x80/0x1d0) [ 311.614635] [] (__fput) from [] (task_work_run+0xb0/0xe8) [ 311.621788] [] (task_work_run) from [] (do_work_pending+0x7c/0xa4) [ 311.629718] [] (do_work_pending) from [] (work_pending+0xc/0x20) for functions using functionFS, i.e. android adbd will close /dev/usb-ffs/adb/ep0 when usb IO thread fails, but switch adb from on to off also triggers write "none" > UDC. These 2 operations both call unregister_gadget, which will lead to the panic above. add a mutex before calling unregister_gadget for api used in f_fs. Signed-off-by: Winter Wang Signed-off-by: Felipe Balbi --- drivers/usb/gadget/configfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index b27ce0747c18..8df96cb3bb58 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -1737,7 +1737,9 @@ void unregister_gadget_item(struct config_item *item) { struct gadget_info *gi = to_gadget_info(item); + mutex_lock(&gi->lock); unregister_gadget(gi); + mutex_unlock(&gi->lock); } EXPORT_SYMBOL_GPL(unregister_gadget_item); From 34828bd3e72d5f7c1d921a3d090ab6b57f6e1ca7 Mon Sep 17 00:00:00 2001 From: Ricky Liang Date: Tue, 2 Feb 2016 01:12:06 +0800 Subject: [PATCH 0246/3220] FIXUP: sched: scheduler-driven cpu frequency selection Two fixups that have been reported on LKML. The next version of scheduler-driver cpu frequency selection patch set should include these fixes and we can drop this patch then. 
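The first of the two fixups applies the standard kthread sleep idiom: re-check kthread_should_stop() after setting TASK_INTERRUPTIBLE, so a stop request arriving between the check and schedule() cannot be missed. As a generic sketch (an invented worker, not the governor thread itself):

#include <linux/kthread.h>
#include <linux/sched.h>

static int sketch_worker(void *data)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (kthread_should_stop())
			break;	/* stop raced in; don't sleep forever */
		schedule();
		__set_current_state(TASK_RUNNING);
		/* ... do one unit of work ... */
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

This works because kthread_stop() wakes the thread: once the task state is set before the second check, a concurrent stop either flips the flag (caught by the check) or delivers a wakeup (caught by schedule() returning).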
Signed-off-by: Ricky Liang Change-Id: Ia2f8b5c0dd5dac06580256eeb4b259929688af68 --- kernel/sched/cpufreq_sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index 4fea269a6598..f6f9b9b3a4a8 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -131,6 +131,8 @@ static int cpufreq_sched_thread(void *data) new_request = gd->requested_freq; if (new_request == last_request) { set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; schedule(); } else { /* @@ -293,6 +295,7 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) goto err; } + policy->governor_data = gd; if (cpufreq_driver_is_slow()) { cpufreq_driver_slow = true; gd->task = kthread_create(cpufreq_sched_thread, policy, @@ -309,12 +312,12 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) init_irq_work(&gd->irq_work, cpufreq_sched_irq_work); } - policy->governor_data = gd; set_sched_freq(); return 0; err: + policy->governor_data = NULL; kfree(gd); return -ENOMEM; } From 76554612b462f53e50f4a40ba80efef90ac7920a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 25 Nov 2015 14:09:38 -0500 Subject: [PATCH 0247/3220] sched/fair: Avoid redundant idle_cpu() call in update_sg_lb_stats() Part of the responsibility of the update_sg_lb_stats() function is to update the idle_cpus statistical counter in struct sg_lb_stats. This check is done by calling idle_cpu(). The idle_cpu() function, in turn, checks a number of fields within the run queue structure such as rq->curr and rq->nr_running. With the current layout of the run queue structure, rq->curr and rq->nr_running are in separate cachelines. The rq->curr variable is checked first followed by nr_running. As nr_running is also accessed by update_sg_lb_stats() earlier, it makes no sense to load another cacheline when nr_running is not 0 as idle_cpu() will always return false in this case. This patch eliminates this redundant cacheline load by checking the cached nr_running before calling idle_cpu(). 
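The optimization leans on C's short-circuit &&: when the locally cached nr_running is non-zero, idle_cpu() is never called, so its cachelines are never touched. A counting model, with an invented busy/idle pattern:

#include <stdio.h>

static int calls;

/* Stands in for idle_cpu(); counting calls shows the short-circuit. */
static int modeled_idle_cpu(int cpu)
{
	calls++;
	return 1;
}

int main(void)
{
	int idle_cpus = 0;

	for (int cpu = 0; cpu < 8; cpu++) {
		int nr_running = cpu % 2;	/* invented: odd cpus busy */

		if (!nr_running && modeled_idle_cpu(cpu))
			idle_cpus++;
	}
	printf("idle check ran %d times, %d idle\n", calls, idle_cpus);
	return 0;
}

Here the check runs only four times for eight cpus; in the kernel the saving is the avoided load of rq->curr's cacheline on every busy cpu.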
Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Douglas Hatch Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448478580-26467-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar (cherry picked from commit a426f99c91d1036767a7819aaaba6bd3191b7f06) Signed-off-by: Javi Merino --- kernel/sched/fair.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index da9323b9ecab..4a7d9d78ce6d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7287,7 +7287,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, bool *overload, bool *overutilized) { unsigned long load; - int i; + int i, nr_running; memset(sgs, 0, sizeof(*sgs)); @@ -7304,7 +7304,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; - if (rq->nr_running > 1) + nr_running = rq->nr_running; + if (nr_running > 1) *overload = true; #ifdef CONFIG_NUMA_BALANCING @@ -7312,7 +7313,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_preferred_running += rq->nr_preferred_running; #endif sgs->sum_weighted_load += weighted_cpuload(i); - if (idle_cpu(i)) + /* + * No need to call idle_cpu() if nr_running is not 0 + */ + if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; if (cpu_overutilized(i)) { From ca9b7b070b5833b9a7f4282df8a8484e5498630f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 10 Jul 2016 10:04:02 +0200 Subject: [PATCH 0248/3220] UPSTREAM: tcp: make challenge acks less predictable (cherry picked from commit 75ff39ccc1bd5d3c455b6822ab09e533c551f758) Yue Cao claims that current host rate limiting of challenge ACKS (RFC 5961) could leak enough information to allow a patient attacker to hijack TCP sessions. He will soon provide details in an academic paper. This patch increases the default limit from 100 to 1000, and adds some randomization so that the attacker can no longer hijack sessions without spending a considerable amount of probes. Based on initial analysis and patch from Linus. Note that we also have per socket rate limiting, so it is tempting to remove the host limit in the future. v2: randomize the count of challenge acks per second, not the period. Fixes: 282f23c6ee34 ("tcp: implement RFC 5961 3.2") Reported-by: Yue Cao Signed-off-by: Eric Dumazet Suggested-by: Linus Torvalds Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Signed-off-by: David S. Miller Change-Id: Ib46ba66f5e4a5a7c81bfccd7b0aa83c3d9e1b3bb Bug: 30809774 --- net/ipv4/tcp_input.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index da34f830f4bc..2014a701663a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -89,7 +89,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); /* rfc5961 challenge ack rate limiting */ -int sysctl_tcp_challenge_ack_limit = 100; +int sysctl_tcp_challenge_ack_limit = 1000; int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; @@ -3428,7 +3428,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static u32 challenge_timestamp; static unsigned int challenge_count; struct tcp_sock *tp = tcp_sk(sk); - u32 now; + u32 count, now; /* First check our per-socket dupack rate limit. 
*/ if (tcp_oow_rate_limited(sock_net(sk), skb, @@ -3436,13 +3436,18 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) &tp->last_oow_ack_time)) return; - /* Then check the check host-wide RFC 5961 rate limit. */ + /* Then check host-wide RFC 5961 rate limit. */ now = jiffies / HZ; if (now != challenge_timestamp) { + u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1; + challenge_timestamp = now; - challenge_count = 0; + WRITE_ONCE(challenge_count, half + + prandom_u32_max(sysctl_tcp_challenge_ack_limit)); } - if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { + count = READ_ONCE(challenge_count); + if (count > 0) { + WRITE_ONCE(challenge_count, count - 1); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } From a20900c4aa3596f6318568a662c71e0943b2bbd9 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 13 Jan 2016 17:48:01 +0100 Subject: [PATCH 0249/3220] UPSTREAM: ALSA: timer: Fix race among timer ioctls (cherry picked from commit af368027a49a751d6ff4ee9e3f9961f35bb4fede) ALSA timer ioctls have an open race and this may lead to a use-after-free of the timer instance object. A simplistic fix is to make each ioctl exclusive. We already have tread_sem for controlling the tread; extend it into a global mutex that is taken around each ioctl. The downside is, of course, the worse concurrency. But these ioctls aren't meant to be accessed in parallel anyway, so it should be fine to serialize them. Reported-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Cc: Signed-off-by: Takashi Iwai Change-Id: I1ac52f1cba5e7408fd88c8fc1c30ca2e83967ebb Bug: 28694392 --- sound/core/timer.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/sound/core/timer.c b/sound/core/timer.c index 1701703b8d93..f19fc67413e6 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -73,7 +73,7 @@ struct snd_timer_user { struct timespec tstamp; /* trigger tstamp */ wait_queue_head_t qchange_sleep; struct fasync_struct *fasync; - struct mutex tread_sem; + struct mutex ioctl_lock; }; /* list of timers */ @@ -1265,7 +1265,7 @@ static int snd_timer_user_open(struct inode *inode, struct file *file) return -ENOMEM; spin_lock_init(&tu->qlock); init_waitqueue_head(&tu->qchange_sleep); - mutex_init(&tu->tread_sem); + mutex_init(&tu->ioctl_lock); tu->ticks = 1; tu->queue_size = 128; tu->queue = kmalloc(tu->queue_size * sizeof(struct snd_timer_read), @@ -1285,8 +1285,10 @@ static int snd_timer_user_release(struct inode *inode, struct file *file) if (file->private_data) { tu = file->private_data; file->private_data = NULL; + mutex_lock(&tu->ioctl_lock); if (tu->timeri) snd_timer_close(tu->timeri); + mutex_unlock(&tu->ioctl_lock); kfree(tu->queue); kfree(tu->tqueue); kfree(tu); @@ -1524,7 +1526,6 @@ static int snd_timer_user_tselect(struct file *file, int err = 0; tu = file->private_data; - mutex_lock(&tu->tread_sem); if (tu->timeri) { snd_timer_close(tu->timeri); tu->timeri = NULL; @@ -1568,7 +1569,6 @@ static int snd_timer_user_tselect(struct file *file, } __err: - mutex_unlock(&tu->tread_sem); return err; } @@ -1782,7 +1782,7 @@ enum { SNDRV_TIMER_IOCTL_PAUSE_OLD = _IO('T', 0x23), }; -static long snd_timer_user_ioctl(struct file *file, unsigned int cmd, +static long __snd_timer_user_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct snd_timer_user *tu; @@ -1799,17 +1799,11 @@ static long snd_timer_user_ioctl(struct file *file, unsigned int cmd, { int xarg; - mutex_lock(&tu->tread_sem); - if (tu->timeri) {
/* too late */ - mutex_unlock(&tu->tread_sem); + if (tu->timeri) /* too late */ return -EBUSY; - } - if (get_user(xarg, p)) { - mutex_unlock(&tu->tread_sem); + if (get_user(xarg, p)) return -EFAULT; - } tu->tread = xarg ? 1 : 0; - mutex_unlock(&tu->tread_sem); return 0; } case SNDRV_TIMER_IOCTL_GINFO: @@ -1842,6 +1836,18 @@ static long snd_timer_user_ioctl(struct file *file, unsigned int cmd, return -ENOTTY; } +static long snd_timer_user_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct snd_timer_user *tu = file->private_data; + long ret; + + mutex_lock(&tu->ioctl_lock); + ret = __snd_timer_user_ioctl(file, cmd, arg); + mutex_unlock(&tu->ioctl_lock); + return ret; +} + static int snd_timer_user_fasync(int fd, struct file * file, int on) { struct snd_timer_user *tu; From 50acf9e6e34a14148d6a8c715c79b1cb013f4f3d Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Thu, 11 Feb 2016 19:37:27 +0000 Subject: [PATCH 0250/3220] UPSTREAM: af_unix: Guard against other == sk in unix_dgram_sendmsg (cherry picked from commit a5527dda344fff0514b7989ef7a755729769daa1) The unix_dgram_sendmsg routine uses the following test if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { to determine if sk and other are in an n:1 association (either established via connect or by using sendto to send messages to an unrelated socket identified by address). This isn't correct: the specified address could have been bound to the sending socket itself, or this socket could have been connected to itself by the time of the unix_peer_get but disconnected before the unix_state_lock(other). In both cases, the if-block would be entered despite other == sk, which might either block the sender unintentionally or lead to trying to unlock the same spin lock twice for a non-blocking send. Add an other != sk check to guard against this. Fixes: 7d267278a9ec ("unix: avoid use-after-free in ep_remove_wait_queue") Reported-By: Philipp Hahn Signed-off-by: Rainer Weikusat Tested-by: Philipp Hahn Signed-off-by: David S. Miller Change-Id: I4ebef6a390df3487903b166b837e34c653e01cb2 Signed-off-by: Amit Pundir --- net/unix/af_unix.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ef05cd9403d4..8badff627fef 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1765,7 +1765,12 @@ restart_locked: goto out_unlock; } - if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + /* other == sk && unix_peer(other) != sk if + * - unix_peer(sk) == NULL, destination address bound to sk + * - unix_peer(sk) == sk by time of get but disconnected before lock + */ + if (other != sk && + unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); From 8d525c512280bb7d8218fd59c04de985b1886eca Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Tue, 16 Aug 2016 15:51:34 -0700 Subject: [PATCH 0251/3220] Android: MMC/UFS IO Latency Histograms. This patch adds a new sysfs node (latency_hist) and reports IO (svc time) latency histograms. Disabled by default; collection can be enabled by writing 1 into latency_hist, the accumulated stats can be cleared by writing 2, and writing 0 disables collection again.
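As a usage sketch (the sysfs path below assumes the MMC class device and is hypothetical; actual paths vary by host):

	echo 1 > /sys/class/mmc_host/mmc0/latency_hist   # enable collection
	cat /sys/class/mmc_host/mmc0/latency_hist        # dump read/write histograms
	echo 2 > /sys/class/mmc_host/mmc0/latency_hist   # zero the accumulated stats
	echo 0 > /sys/class/mmc_host/mmc0/latency_hist   # disable again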
Bug: 30677035 Change-Id: I625938135ea33e6e87cf6af1fc7edc136d8b4b32 Signed-off-by: Mohan Srinivasan --- block/blk-core.c | 80 ++++++++++++++++++++++++++++++++++++++ drivers/mmc/core/core.c | 66 ++++++++++++++++++++++++++++++- drivers/mmc/core/host.c | 6 ++- drivers/mmc/core/host.h | 5 +++ drivers/scsi/ufs/ufshcd.c | 81 +++++++++++++++++++++++++++++++++++++++ drivers/scsi/ufs/ufshcd.h | 3 ++ include/linux/blkdev.h | 76 ++++++++++++++++++++++++++++++++++++ include/linux/mmc/core.h | 2 + include/linux/mmc/host.h | 4 ++ 9 files changed, 320 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 33e2f62d5062..f95413fb7f2b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3539,3 +3539,83 @@ int __init blk_dev_init(void) return 0; } + +/* + * Blk IO latency support. We want this to be as cheap as possible, so doing + * this lockless (and avoiding atomics), a few off by a few errors in this + * code is not harmful, and we don't want to do anything that is + * perf-impactful. + * TODO : If necessary, we can make the histograms per-cpu and aggregate + * them when printing them out. + */ +void +blk_zero_latency_hist(struct io_latency_state *s) +{ + memset(s->latency_y_axis_read, 0, + sizeof(s->latency_y_axis_read)); + memset(s->latency_y_axis_write, 0, + sizeof(s->latency_y_axis_write)); + s->latency_reads_elems = 0; + s->latency_writes_elems = 0; +} + +ssize_t +blk_latency_hist_show(struct io_latency_state *s, char *buf) +{ + int i; + int bytes_written = 0; + int pct, num_elem; + u_int64_t elem; + + num_elem = s->latency_reads_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Read Latency Histogram (n = %d):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_read[i]; + pct = (elem * 100) / num_elem; + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_read[i]; + pct = (elem * 100) / num_elem; + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + num_elem = s->latency_writes_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Write Latency Histogram (n = %d):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_write[i]; + pct = (elem * 100) / num_elem; + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_write[i]; + pct = (elem * 100) / num_elem; + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + return bytes_written; +} diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 9fab52559a8c..b503249950df 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -183,6 +183,17 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) pr_debug("%s: %d bytes transferred: %d\n", mmc_hostname(host), mrq->data->bytes_xfered, mrq->data->error); + if (mrq->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = ktime_us_delta(completion, + 
mrq->io_start); + blk_update_latency_hist(&host->io_lat_s, + (mrq->data->flags & MMC_DATA_READ), + delta_us); + } trace_mmc_blk_rw_end(cmd->opcode, cmd->arg, mrq->data); } @@ -627,6 +638,11 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host, } if (!err && areq) { + if (host->latency_hist_enabled) { + areq->mrq->io_start = ktime_get(); + areq->mrq->lat_hist_enabled = 1; + } else + areq->mrq->lat_hist_enabled = 0; trace_mmc_blk_rw_start(areq->mrq->cmd->opcode, areq->mrq->cmd->arg, areq->mrq->data); @@ -1964,7 +1980,7 @@ void mmc_init_erase(struct mmc_card *card) } static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card, - unsigned int arg, unsigned int qty) + unsigned int arg, unsigned int qty) { unsigned int erase_timeout; @@ -2907,6 +2923,54 @@ static void __exit mmc_exit(void) destroy_workqueue(workqueue); } +static ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + + return blk_latency_hist_show(&host->io_lat_s, buf); +} + +/* + * Values permitted 0, 1, 2. + * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&host->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + host->latency_hist_enabled = value; + return count; +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +void +mmc_latency_hist_sysfs_init(struct mmc_host *host) +{ + if (device_create_file(&host->class_dev, &dev_attr_latency_hist)) + dev_err(&host->class_dev, + "Failed to create latency_hist sysfs entry\n"); +} + +void +mmc_latency_hist_sysfs_exit(struct mmc_host *host) +{ + device_remove_file(&host->class_dev, &dev_attr_latency_hist); +} + subsys_initcall(mmc_init); module_exit(mmc_exit); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index fcf7829c759e..17068839c74b 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -32,8 +32,6 @@ #include "slot-gpio.h" #include "pwrseq.h" -#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) - static DEFINE_IDR(mmc_host_idr); static DEFINE_SPINLOCK(mmc_host_lock); @@ -394,6 +392,8 @@ int mmc_add_host(struct mmc_host *host) mmc_add_host_debugfs(host); #endif + mmc_latency_hist_sysfs_init(host); + mmc_start_host(host); if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY)) register_pm_notifier(&host->pm_notify); @@ -422,6 +422,8 @@ void mmc_remove_host(struct mmc_host *host) mmc_remove_host_debugfs(host); #endif + mmc_latency_hist_sysfs_exit(host); + device_del(&host->class_dev); led_trigger_unregister_simple(host->led); diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h index 992bf5397633..bf38533406fd 100644 --- a/drivers/mmc/core/host.h +++ b/drivers/mmc/core/host.h @@ -12,6 +12,8 @@ #define _MMC_CORE_HOST_H #include +#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) + int mmc_register_host_class(void); void mmc_unregister_host_class(void); @@ -21,5 +23,8 @@ void mmc_retune_hold(struct mmc_host *host); void mmc_retune_release(struct mmc_host *host); int mmc_retune(struct mmc_host *host); +void 
mmc_latency_hist_sysfs_init(struct mmc_host *host); +void mmc_latency_hist_sysfs_exit(struct mmc_host *host); + #endif diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 85cd2564c157..4167bdbf0ecf 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -39,6 +39,7 @@ #include #include +#include #include "ufshcd.h" #include "unipro.h" @@ -1332,6 +1333,17 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) clear_bit_unlock(tag, &hba->lrb_in_use); goto out; } + + /* IO svc time latency histogram */ + if (hba != NULL && cmd->request != NULL) { + if (hba->latency_hist_enabled && + (cmd->request->cmd_type == REQ_TYPE_FS)) { + cmd->request->lat_hist_io_start = ktime_get(); + cmd->request->lat_hist_enabled = 1; + } else + cmd->request->lat_hist_enabled = 0; + } + WARN_ON(hba->clk_gating.state != CLKS_ON); lrbp = &hba->lrb[tag]; @@ -3160,6 +3172,7 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) u32 tr_doorbell; int result; int index; + struct request *req; /* Resetting interrupt aggregation counters first and reading the * DOOR_BELL afterward allows us to handle all the completed requests. @@ -3184,6 +3197,22 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) /* Mark completed command as NULL in LRB */ lrbp->cmd = NULL; clear_bit_unlock(index, &hba->lrb_in_use); + req = cmd->request; + if (req) { + /* Update IO svc time latency histogram */ + if (req->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = ktime_us_delta(completion, + req->lat_hist_io_start); + /* rq_data_dir() => true if WRITE */ + blk_update_latency_hist(&hba->io_lat_s, + (rq_data_dir(req) == READ), + delta_us); + } + } /* Do not touch lrbp after scsi done */ cmd->scsi_done(cmd); __ufshcd_release(hba); @@ -5327,6 +5356,54 @@ out: } EXPORT_SYMBOL(ufshcd_shutdown); +/* + * Values permitted 0, 1, 2. 
+ * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&hba->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + hba->latency_hist_enabled = value; + return count; +} + +ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + + return blk_latency_hist_show(&hba->io_lat_s, buf); +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +static void +ufshcd_init_latency_hist(struct ufs_hba *hba) +{ + if (device_create_file(hba->dev, &dev_attr_latency_hist)) + dev_err(hba->dev, "Failed to create latency_hist sysfs entry\n"); +} + +static void +ufshcd_exit_latency_hist(struct ufs_hba *hba) +{ + device_remove_file(hba->dev, &dev_attr_latency_hist); +} + /** * ufshcd_remove - de-allocate SCSI host and host memory space * data structure memory @@ -5342,6 +5419,7 @@ void ufshcd_remove(struct ufs_hba *hba) scsi_host_put(hba->host); ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); if (ufshcd_is_clkscaling_enabled(hba)) devfreq_remove_device(hba->devfreq); ufshcd_hba_exit(hba); @@ -5639,6 +5717,8 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) /* Hold auto suspend until async scan completes */ pm_runtime_get_sync(dev); + ufshcd_init_latency_hist(hba); + /* * The device-initialize-sequence hasn't been invoked yet. * Set the device to power-off state @@ -5653,6 +5733,7 @@ out_remove_scsi_host: scsi_remove_host(hba->host); exit_gating: ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); out_disable: hba->is_irq_enabled = false; scsi_host_put(host); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 2570d9477b37..f3780cf7d895 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -532,6 +532,9 @@ struct ufs_hba { struct devfreq *devfreq; struct ufs_clk_scaling clk_scaling; bool is_sys_suspended; + + int latency_hist_enabled; + struct io_latency_state io_lat_s; }; /* Returns true if clocks can be gated. Otherwise false */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c70e3588a48c..faf2c0093527 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -197,6 +197,9 @@ struct request { /* for bidi */ struct request *next_rq; + + ktime_t lat_hist_io_start; + int lat_hist_enabled; }; static inline unsigned short req_get_ioprio(struct request *req) @@ -1645,6 +1648,79 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); extern long bdev_direct_access(struct block_device *, sector_t, void __pmem **addr, unsigned long *pfn, long size); + +/* + * X-axis for IO latency histogram support.
+ */ +static const u_int64_t latency_x_axis_us[] = { + 100, + 200, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1000, + 1200, + 1400, + 1600, + 1800, + 2000, + 2500, + 3000, + 4000, + 5000, + 6000, + 7000, + 9000, + 10000 +}; + +#define BLK_IO_LAT_HIST_DISABLE 0 +#define BLK_IO_LAT_HIST_ENABLE 1 +#define BLK_IO_LAT_HIST_ZERO 2 + +struct io_latency_state { + u_int64_t latency_y_axis_read[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_reads_elems; + u_int64_t latency_y_axis_write[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_writes_elems; +}; + +static inline void +blk_update_latency_hist(struct io_latency_state *s, + int read, + u_int64_t delta_us) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++) { + if (delta_us < (u_int64_t)latency_x_axis_us[i]) { + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + break; + } + } + if (i == ARRAY_SIZE(latency_x_axis_us)) { + /* Overflowed the histogram */ + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + } + if (read) + s->latency_reads_elems++; + else + s->latency_writes_elems++; +} + +void blk_zero_latency_hist(struct io_latency_state *s); +ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf); + #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 37967b6da03c..3349f0676acb 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -136,6 +136,8 @@ struct mmc_request { struct completion completion; void (*done)(struct mmc_request *);/* completion function */ struct mmc_host *host; + ktime_t io_start; + int lat_hist_enabled; }; struct mmc_card; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 40025b28c1fb..e4862f7cdede 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -379,6 +380,9 @@ struct mmc_host { } embedded_sdio_data; #endif + int latency_hist_enabled; + struct io_latency_state io_lat_s; + unsigned long private[0] ____cacheline_aligned; }; From 1c6df5fdcb19a03fb6b652090b4b968a6377fb14 Mon Sep 17 00:00:00 2001 From: Mohamad Ayyash Date: Thu, 25 Aug 2016 00:59:21 +0000 Subject: [PATCH 0252/3220] Revert "Android: MMC/UFS IO Latency Histograms." This reverts commit 8d525c512280bb7d8218fd59c04de985b1886eca. Change-Id: I69350b98d9de9b1c9f591e03a90f133e328ba72a --- block/blk-core.c | 80 -------------------------------------- drivers/mmc/core/core.c | 66 +------------------------------ drivers/mmc/core/host.c | 6 +-- drivers/mmc/core/host.h | 5 --- drivers/scsi/ufs/ufshcd.c | 81 --------------------------------------- drivers/scsi/ufs/ufshcd.h | 3 -- include/linux/blkdev.h | 76 ------------------------------------ include/linux/mmc/core.h | 2 - include/linux/mmc/host.h | 4 -- 9 files changed, 3 insertions(+), 320 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index f95413fb7f2b..33e2f62d5062 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3539,83 +3539,3 @@ int __init blk_dev_init(void) return 0; } - -/* - * Blk IO latency support. We want this to be as cheap as possible, so doing - * this lockless (and avoiding atomics), a few off by a few errors in this - * code is not harmful, and we don't want to do anything that is - * perf-impactful. - * TODO : If necessary, we can make the histograms per-cpu and aggregate - * them when printing them out. 
- */ -void -blk_zero_latency_hist(struct io_latency_state *s) -{ - memset(s->latency_y_axis_read, 0, - sizeof(s->latency_y_axis_read)); - memset(s->latency_y_axis_write, 0, - sizeof(s->latency_y_axis_write)); - s->latency_reads_elems = 0; - s->latency_writes_elems = 0; -} - -ssize_t -blk_latency_hist_show(struct io_latency_state *s, char *buf) -{ - int i; - int bytes_written = 0; - int pct, num_elem; - u_int64_t elem; - - num_elem = s->latency_reads_elems; - if (num_elem > 0) { - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "IO svc_time Read Latency Histogram (n = %d):\n", - num_elem); - for (i = 0; - i < ARRAY_SIZE(latency_x_axis_us); - i++) { - elem = s->latency_y_axis_read[i]; - pct = (elem * 100) / num_elem; - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t< %5lluus%15llu%15d%%\n", - latency_x_axis_us[i], - elem, pct); - } - /* Last element in y-axis table is overflow */ - elem = s->latency_y_axis_read[i]; - pct = (elem * 100) / num_elem; - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t> %5dms%15llu%15d%%\n", 10, - elem, pct); - } - num_elem = s->latency_writes_elems; - if (num_elem > 0) { - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "IO svc_time Write Latency Histogram (n = %d):\n", - num_elem); - for (i = 0; - i < ARRAY_SIZE(latency_x_axis_us); - i++) { - elem = s->latency_y_axis_write[i]; - pct = (elem * 100) / num_elem; - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t< %5lluus%15llu%15d%%\n", - latency_x_axis_us[i], - elem, pct); - } - /* Last element in y-axis table is overflow */ - elem = s->latency_y_axis_write[i]; - pct = (elem * 100) / num_elem; - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t> %5dms%15llu%15d%%\n", 10, - elem, pct); - } - return bytes_written; -} diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index b503249950df..9fab52559a8c 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -183,17 +183,6 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) pr_debug("%s: %d bytes transferred: %d\n", mmc_hostname(host), mrq->data->bytes_xfered, mrq->data->error); - if (mrq->lat_hist_enabled) { - ktime_t completion; - u_int64_t delta_us; - - completion = ktime_get(); - delta_us = ktime_us_delta(completion, - mrq->io_start); - blk_update_latency_hist(&host->io_lat_s, - (mrq->data->flags & MMC_DATA_READ), - delta_us); - } trace_mmc_blk_rw_end(cmd->opcode, cmd->arg, mrq->data); } @@ -638,11 +627,6 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host, } if (!err && areq) { - if (host->latency_hist_enabled) { - areq->mrq->io_start = ktime_get(); - areq->mrq->lat_hist_enabled = 1; - } else - areq->mrq->lat_hist_enabled = 0; trace_mmc_blk_rw_start(areq->mrq->cmd->opcode, areq->mrq->cmd->arg, areq->mrq->data); @@ -1980,7 +1964,7 @@ void mmc_init_erase(struct mmc_card *card) } static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card, - unsigned int arg, unsigned int qty) + unsigned int arg, unsigned int qty) { unsigned int erase_timeout; @@ -2923,54 +2907,6 @@ static void __exit mmc_exit(void) destroy_workqueue(workqueue); } -static ssize_t -latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct mmc_host *host = cls_dev_to_mmc_host(dev); - - return blk_latency_hist_show(&host->io_lat_s, buf); -} - -/* - * Values permitted 0, 1, 2. 
- * 0 -> Disable IO latency histograms (default) - * 1 -> Enable IO latency histograms - * 2 -> Zero out IO latency histograms - */ -static ssize_t -latency_hist_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct mmc_host *host = cls_dev_to_mmc_host(dev); - long value; - - if (kstrtol(buf, 0, &value)) - return -EINVAL; - if (value == BLK_IO_LAT_HIST_ZERO) - blk_zero_latency_hist(&host->io_lat_s); - else if (value == BLK_IO_LAT_HIST_ENABLE || - value == BLK_IO_LAT_HIST_DISABLE) - host->latency_hist_enabled = value; - return count; -} - -static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, - latency_hist_show, latency_hist_store); - -void -mmc_latency_hist_sysfs_init(struct mmc_host *host) -{ - if (device_create_file(&host->class_dev, &dev_attr_latency_hist)) - dev_err(&host->class_dev, - "Failed to create latency_hist sysfs entry\n"); -} - -void -mmc_latency_hist_sysfs_exit(struct mmc_host *host) -{ - device_remove_file(&host->class_dev, &dev_attr_latency_hist); -} - subsys_initcall(mmc_init); module_exit(mmc_exit); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 17068839c74b..fcf7829c759e 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -32,6 +32,8 @@ #include "slot-gpio.h" #include "pwrseq.h" +#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) + static DEFINE_IDR(mmc_host_idr); static DEFINE_SPINLOCK(mmc_host_lock); @@ -392,8 +394,6 @@ int mmc_add_host(struct mmc_host *host) mmc_add_host_debugfs(host); #endif - mmc_latency_hist_sysfs_init(host); - mmc_start_host(host); if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY)) register_pm_notifier(&host->pm_notify); @@ -422,8 +422,6 @@ void mmc_remove_host(struct mmc_host *host) mmc_remove_host_debugfs(host); #endif - mmc_latency_hist_sysfs_exit(host); - device_del(&host->class_dev); led_trigger_unregister_simple(host->led); diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h index bf38533406fd..992bf5397633 100644 --- a/drivers/mmc/core/host.h +++ b/drivers/mmc/core/host.h @@ -12,8 +12,6 @@ #define _MMC_CORE_HOST_H #include -#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) - int mmc_register_host_class(void); void mmc_unregister_host_class(void); @@ -23,8 +21,5 @@ void mmc_retune_hold(struct mmc_host *host); void mmc_retune_release(struct mmc_host *host); int mmc_retune(struct mmc_host *host); -void mmc_latency_hist_sysfs_init(struct mmc_host *host); -void mmc_latency_hist_sysfs_exit(struct mmc_host *host); - #endif diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 4167bdbf0ecf..85cd2564c157 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -39,7 +39,6 @@ #include #include -#include #include "ufshcd.h" #include "unipro.h" @@ -1333,17 +1332,6 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) clear_bit_unlock(tag, &hba->lrb_in_use); goto out; } - - /* IO svc time latency histogram */ - if (hba != NULL && cmd->request != NULL) { - if (hba->latency_hist_enabled && - (cmd->request->cmd_type == REQ_TYPE_FS)) { - cmd->request->lat_hist_io_start = ktime_get(); - cmd->request->lat_hist_enabled = 1; - } else - cmd->request->lat_hist_enabled = 0; - } - WARN_ON(hba->clk_gating.state != CLKS_ON); lrbp = &hba->lrb[tag]; @@ -3172,7 +3160,6 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) u32 tr_doorbell; int result; int index; - struct request *req; /* Resetting interrupt aggregation counters first and reading the * 
DOOR_BELL afterward allows us to handle all the completed requests. @@ -3197,22 +3184,6 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) /* Mark completed command as NULL in LRB */ lrbp->cmd = NULL; clear_bit_unlock(index, &hba->lrb_in_use); - req = cmd->request; - if (req) { - /* Update IO svc time latency histogram */ - if (req->lat_hist_enabled) { - ktime_t completion; - u_int64_t delta_us; - - completion = ktime_get(); - delta_us = ktime_us_delta(completion, - req->lat_hist_io_start); - /* rq_data_dir() => true if WRITE */ - blk_update_latency_hist(&hba->io_lat_s, - (rq_data_dir(req) == READ), - delta_us); - } - } /* Do not touch lrbp after scsi done */ cmd->scsi_done(cmd); __ufshcd_release(hba); @@ -5356,54 +5327,6 @@ out: } EXPORT_SYMBOL(ufshcd_shutdown); -/* - * Values permitted 0, 1, 2. - * 0 -> Disable IO latency histograms (default) - * 1 -> Enable IO latency histograms - * 2 -> Zero out IO latency histograms - */ -static ssize_t -latency_hist_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct ufs_hba *hba = dev_get_drvdata(dev); - long value; - - if (kstrtol(buf, 0, &value)) - return -EINVAL; - if (value == BLK_IO_LAT_HIST_ZERO) - blk_zero_latency_hist(&hba->io_lat_s); - else if (value == BLK_IO_LAT_HIST_ENABLE || - value == BLK_IO_LAT_HIST_DISABLE) - hba->latency_hist_enabled = value; - return count; -} - -ssize_t -latency_hist_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct ufs_hba *hba = dev_get_drvdata(dev); - - return blk_latency_hist_show(&hba->io_lat_s, buf); -} - -static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, - latency_hist_show, latency_hist_store); - -static void -ufshcd_init_latency_hist(struct ufs_hba *hba) -{ - if (device_create_file(hba->dev, &dev_attr_latency_hist)) - dev_err(hba->dev, "Failed to create latency_hist sysfs entry\n"); -} - -static void -ufshcd_exit_latency_hist(struct ufs_hba *hba) -{ - device_remove_file(hba->dev, &dev_attr_latency_hist); -} - /** * ufshcd_remove - de-allocate SCSI host and host memory space * data structure memory @@ -5419,7 +5342,6 @@ void ufshcd_remove(struct ufs_hba *hba) scsi_host_put(hba->host); ufshcd_exit_clk_gating(hba); - ufshcd_exit_latency_hist(hba); if (ufshcd_is_clkscaling_enabled(hba)) devfreq_remove_device(hba->devfreq); ufshcd_hba_exit(hba); @@ -5717,8 +5639,6 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) /* Hold auto suspend until async scan completes */ pm_runtime_get_sync(dev); - ufshcd_init_latency_hist(hba); - /* * The device-initialize-sequence hasn't been invoked yet. * Set the device to power-off state @@ -5733,7 +5653,6 @@ out_remove_scsi_host: scsi_remove_host(hba->host); exit_gating: ufshcd_exit_clk_gating(hba); - ufshcd_exit_latency_hist(hba); out_disable: hba->is_irq_enabled = false; scsi_host_put(host); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index f3780cf7d895..2570d9477b37 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -532,9 +532,6 @@ struct ufs_hba { struct devfreq *devfreq; struct ufs_clk_scaling clk_scaling; bool is_sys_suspended; - - int latency_hist_enabled; - struct io_latency_state io_lat_s; }; /* Returns true if clocks can be gated.
Otherwise false */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index faf2c0093527..c70e3588a48c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -197,9 +197,6 @@ struct request { /* for bidi */ struct request *next_rq; - - ktime_t lat_hist_io_start; - int lat_hist_enabled; }; static inline unsigned short req_get_ioprio(struct request *req) @@ -1648,79 +1645,6 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); extern long bdev_direct_access(struct block_device *, sector_t, void __pmem **addr, unsigned long *pfn, long size); - -/* - * X-axis for IO latency histogram support. - */ -static const u_int64_t latency_x_axis_us[] = { - 100, - 200, - 300, - 400, - 500, - 600, - 700, - 800, - 900, - 1000, - 1200, - 1400, - 1600, - 1800, - 2000, - 2500, - 3000, - 4000, - 5000, - 6000, - 7000, - 9000, - 10000 -}; - -#define BLK_IO_LAT_HIST_DISABLE 0 -#define BLK_IO_LAT_HIST_ENABLE 1 -#define BLK_IO_LAT_HIST_ZERO 2 - -struct io_latency_state { - u_int64_t latency_y_axis_read[ARRAY_SIZE(latency_x_axis_us) + 1]; - u_int64_t latency_reads_elems; - u_int64_t latency_y_axis_write[ARRAY_SIZE(latency_x_axis_us) + 1]; - u_int64_t latency_writes_elems; -}; - -static inline void -blk_update_latency_hist(struct io_latency_state *s, - int read, - u_int64_t delta_us) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++) { - if (delta_us < (u_int64_t)latency_x_axis_us[i]) { - if (read) - s->latency_y_axis_read[i]++; - else - s->latency_y_axis_write[i]++; - break; - } - } - if (i == ARRAY_SIZE(latency_x_axis_us)) { - /* Overflowed the histogram */ - if (read) - s->latency_y_axis_read[i]++; - else - s->latency_y_axis_write[i]++; - } - if (read) - s->latency_reads_elems++; - else - s->latency_writes_elems++; -} - -void blk_zero_latency_hist(struct io_latency_state *s); -ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf); - #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 3349f0676acb..37967b6da03c 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -136,8 +136,6 @@ struct mmc_request { struct completion completion; void (*done)(struct mmc_request *);/* completion function */ struct mmc_host *host; - ktime_t io_start; - int lat_hist_enabled; }; struct mmc_card; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index e4862f7cdede..40025b28c1fb 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -380,9 +379,6 @@ struct mmc_host { } embedded_sdio_data; #endif - int latency_hist_enabled; - struct io_latency_state io_lat_s; - unsigned long private[0] ____cacheline_aligned; }; From 8a58605e4661d18b4e09f1385fe22b1e83cd2cb6 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 23 Aug 2016 11:32:37 -0700 Subject: [PATCH 0253/3220] ANDROID: dm: android-verity: Allow android-verity to be compiled as an independent module Exports the device mapper callbacks of linear and dm-verity-target methods. 
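To illustrate what the exports enable, a separately built module can now reference these callbacks directly from its own target_type instead of duplicating them. A hedged sketch of such a consumer (the "example-linear" target is hypothetical, standing in for what dm-android-verity does with the dm-linear and dm-verity callbacks):

	#include <linux/module.h>
	#include <linux/device-mapper.h>

	/* Provided by dm-linear.c via the EXPORT_SYMBOL_GPL()s in this patch. */
	extern int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv);
	extern void dm_linear_dtr(struct dm_target *ti);
	extern int dm_linear_map(struct dm_target *ti, struct bio *bio);

	static struct target_type example_target = {
		.name    = "example-linear",
		.version = {1, 0, 0},
		.module  = THIS_MODULE,
		.ctr     = dm_linear_ctr,	/* reuse the exported callbacks */
		.dtr     = dm_linear_dtr,
		.map     = dm_linear_map,
	};

	static int __init example_init(void)
	{
		return dm_register_target(&example_target);
	}

	static void __exit example_exit(void)
	{
		dm_unregister_target(&example_target);
	}

	module_init(example_init);
	module_exit(example_exit);
	MODULE_LICENSE("GPL");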
Signed-off-by: Badhri Jagan Sridharan Change-Id: I0358be0615c431dce3cc78575aaac4ccfe3aacd7 --- drivers/md/Kconfig | 3 ++- drivers/md/Makefile | 5 +---- drivers/md/dm-linear.c | 6 ++++++ drivers/md/dm-verity-target.c | 7 +++++++ 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 96b419b544ed..6035794bc1f2 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -501,7 +501,7 @@ config DM_LOG_WRITES If unsure, say N. config DM_ANDROID_VERITY - bool "Android verity target support" + tristate "Android verity target support" depends on DM_VERITY depends on X509_CERTIFICATE_PARSER depends on SYSTEM_TRUSTED_KEYRING @@ -509,6 +509,7 @@ config DM_ANDROID_VERITY depends on KEYS depends on ASYMMETRIC_KEY_TYPE depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE + depends on MD_LINEAR ---help--- This device-mapper target is virtually a VERITY target. This target is setup by reading the metadata contents piggybacked diff --git a/drivers/md/Makefile b/drivers/md/Makefile index c8fb00d8cc36..32b5d0a90d60 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -60,6 +60,7 @@ obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o obj-$(CONFIG_DM_ERA) += dm-era.o obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o +obj-$(CONFIG_DM_ANDROID_VERITY) += dm-android-verity.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o @@ -68,7 +69,3 @@ endif ifeq ($(CONFIG_DM_VERITY_FEC),y) dm-verity-objs += dm-verity-fec.o endif - -ifeq ($(CONFIG_DM_ANDROID_VERITY),y) -dm-verity-objs += dm-android-verity.o -endif diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 8505a771de42..2ff5f32a4b99 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -66,6 +66,7 @@ int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) kfree(lc); return ret; } +EXPORT_SYMBOL_GPL(dm_linear_ctr); void dm_linear_dtr(struct dm_target *ti) { @@ -74,6 +75,7 @@ void dm_linear_dtr(struct dm_target *ti) dm_put_device(ti, lc->dev); kfree(lc); } +EXPORT_SYMBOL_GPL(dm_linear_dtr); static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector) { @@ -98,6 +100,7 @@ int dm_linear_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } +EXPORT_SYMBOL_GPL(dm_linear_map); void dm_linear_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) @@ -115,6 +118,7 @@ void dm_linear_status(struct dm_target *ti, status_type_t type, break; } } +EXPORT_SYMBOL_GPL(dm_linear_status); int dm_linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) @@ -132,6 +136,7 @@ int dm_linear_prepare_ioctl(struct dm_target *ti, return 1; return 0; } +EXPORT_SYMBOL_GPL(dm_linear_prepare_ioctl); int dm_linear_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) @@ -140,6 +145,7 @@ int dm_linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } +EXPORT_SYMBOL_GPL(dm_linear_iterate_devices); static struct target_type linear_target = { .name = "linear", diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 5214ed2c7507..9d3d4b297201 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -592,6 +592,7 @@ int verity_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } +EXPORT_SYMBOL_GPL(verity_map); /* * Status: V (valid) or C (corruption found) @@ -655,6 +656,7 @@ void verity_status(struct dm_target *ti, 
status_type_t type, break; } } +EXPORT_SYMBOL_GPL(verity_status); int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) @@ -668,6 +670,7 @@ int verity_prepare_ioctl(struct dm_target *ti, return 1; return 0; } +EXPORT_SYMBOL_GPL(verity_prepare_ioctl); int verity_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) @@ -676,6 +679,7 @@ int verity_iterate_devices(struct dm_target *ti, return fn(ti, v->data_dev, v->data_start, ti->len, data); } +EXPORT_SYMBOL_GPL(verity_iterate_devices); void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) { @@ -689,6 +693,7 @@ void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, limits->logical_block_size); } +EXPORT_SYMBOL_GPL(verity_io_hints); void verity_dtr(struct dm_target *ti) { @@ -719,6 +724,7 @@ void verity_dtr(struct dm_target *ti) kfree(v); } +EXPORT_SYMBOL_GPL(verity_dtr); static int verity_alloc_zero_digest(struct dm_verity *v) { @@ -1053,6 +1059,7 @@ bad: return r; } +EXPORT_SYMBOL_GPL(verity_ctr); static struct target_type verity_target = { .name = "verity", From 8f5bf76871c346bfc4fed9fcf85eff79948aade2 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sat, 11 Jun 2016 20:32:06 +0200 Subject: [PATCH 0254/3220] ipv6: fix endianness error in icmpv6_err IPv6 ping socket error handler doesn't correctly convert the new 32 bit mtu to host endianness before using. [Cherry-pick of net dcb94b88c09ce82a80e188d49bcffdc83ba215a6] Bug: 29370996 Change-Id: Iea0ca79f16c2a1366d82b3b0a3097093d18da8b7 Cc: Lorenzo Colitti Fixes: 6d0bfe22611602f ("net: ipv6: Add IPv6 support to the ping socket.") Signed-off-by: Hannes Frederic Sowa Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index a3ec7a77a1ee..41e5c9520c7d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -98,7 +98,7 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!(type & ICMPV6_INFOMSG_MASK)) if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) - ping_err(skb, offset, info); + ping_err(skb, offset, ntohl(info)); } static int icmpv6_rcv(struct sk_buff *skb); From 75c591ea5787a59e69c8d2b803a7f4f0c29ec34e Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Sat, 13 Aug 2016 01:13:38 +0900 Subject: [PATCH 0255/3220] net: ipv6: Fix ping to link-local addresses. ping_v6_sendmsg does not set flowi6_oif in response to sin6_scope_id or sk_bound_dev_if, so it is not possible to use these APIs to ping an IPv6 address on a different interface. Instead, it sets flowi6_iif, which is incorrect but harmless. Stop setting flowi6_iif, and support various ways of setting oif in the same priority order used by udpv6_sendmsg. [Backport of net 5e457896986e16c440c97bb94b9ccd95dd157292] Bug: 29370996 Change-Id: Ibe1b9434c00ed96f1e30acb110734c6570b087b8 Tested: https://android-review.googlesource.com/#/c/254470/ Signed-off-by: Lorenzo Colitti Signed-off-by: David S. 
Miller --- net/ipv6/ping.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 2cfaedf03252..9411c8d770a5 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -84,7 +84,7 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct icmp6hdr user_icmph; int addr_type; struct in6_addr *daddr; - int iif = 0; + int oif = 0; struct flowi6 fl6; int err; int hlimit; @@ -106,25 +106,30 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (u->sin6_family != AF_INET6) { return -EAFNOSUPPORT; } - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != u->sin6_scope_id) { - return -EINVAL; - } daddr = &(u->sin6_addr); - iif = u->sin6_scope_id; + if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr))) + oif = u->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = &sk->sk_v6_daddr; } - if (!iif) - iif = sk->sk_bound_dev_if; + if (!oif) + oif = sk->sk_bound_dev_if; + + if (!oif) + oif = np->sticky_pktinfo.ipi6_ifindex; + + if (!oif && ipv6_addr_is_multicast(daddr)) + oif = np->mcast_oif; + else if (!oif) + oif = np->ucast_oif; addr_type = ipv6_addr_type(daddr); - if (__ipv6_addr_needs_scope_id(addr_type) && !iif) - return -EINVAL; - if (addr_type & IPV6_ADDR_MAPPED) + if ((__ipv6_addr_needs_scope_id(addr_type) && !oif) || + (addr_type & IPV6_ADDR_MAPPED) || + (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if)) return -EINVAL; /* TODO: use ip6_datagram_send_ctl to get options from cmsg */ @@ -134,17 +139,13 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.saddr = np->saddr; fl6.daddr = *daddr; + fl6.flowi6_oif = oif; fl6.flowi6_mark = sk->sk_mark; fl6.flowi6_uid = sock_i_uid(sk); fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; - else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; - dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr); if (IS_ERR(dst)) return PTR_ERR(dst); @@ -154,11 +155,6 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!np) return -EBADF; - if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; - else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; - pfh.icmph.type = user_icmph.icmp6_type; pfh.icmph.code = user_icmph.icmp6_code; pfh.icmph.checksum = 0; From 3228c5eb7af2b4cb981706b88ed3c3e81ab8e80a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Aug 2016 08:44:12 -0700 Subject: [PATCH 0256/3220] RFC: FROMLIST: locking/percpu-rwsem: Optimize readers and reduce global impact Currently the percpu-rwsem switches to (global) atomic ops while a writer is waiting, which could be quite a while, and this slows down releasing the readers. This patch cures this problem by ordering the reader-state vs reader-count (see the comments in __percpu_down_read() and percpu_down_write()). This changes a global atomic op into a full memory barrier, which doesn't have the global cacheline contention. This also enables using the percpu-rwsem with rcu_sync disabled in order to bias the implementation differently, reducing the writer latency by adding some cost to readers.
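The calling convention is untouched by this change; what the patch optimizes is the cost balance between the two sides of the API. A minimal usage sketch (names hypothetical), with the reader side reduced to a per-CPU increment on the fast path and the writer absorbing the expensive synchronization:

	static struct percpu_rw_semaphore state_sem;

	static int __init state_setup(void)
	{
		return percpu_init_rwsem(&state_sem);
	}

	static void state_read(void)
	{
		percpu_down_read(&state_sem);	/* fast path: per-CPU increment */
		/* ... read shared state ... */
		percpu_up_read(&state_sem);
	}

	static void state_change(void)
	{
		percpu_down_write(&state_sem);	/* blocks until readers drain */
		/* ... modify shared state ... */
		percpu_up_write(&state_sem);
	}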
Mailing-list-URL: https://lkml.org/lkml/2016/8/9/181 Cc: Paul McKenney Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) [jstultz: Backported to 4.4] Change-Id: I8ea04b4dca2ec36f1c2469eccafde1423490572f Signed-off-by: John Stultz --- include/linux/percpu-rwsem.h | 84 +++++++++-- kernel/locking/percpu-rwsem.c | 265 +++++++++++++++++++--------------- 2 files changed, 225 insertions(+), 124 deletions(-) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index c2fa3ecb0dce..146efefde2a1 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -10,30 +10,96 @@ struct percpu_rw_semaphore { struct rcu_sync rss; - unsigned int __percpu *fast_read_ctr; + unsigned int __percpu *read_count; struct rw_semaphore rw_sem; - atomic_t slow_read_ctr; - wait_queue_head_t write_waitq; + wait_queue_head_t writer; + int readers_block; }; -extern void percpu_down_read(struct percpu_rw_semaphore *); -extern int percpu_down_read_trylock(struct percpu_rw_semaphore *); -extern void percpu_up_read(struct percpu_rw_semaphore *); +extern int __percpu_down_read(struct percpu_rw_semaphore *, int); +extern void __percpu_up_read(struct percpu_rw_semaphore *); + +static inline void percpu_down_read(struct percpu_rw_semaphore *sem) +{ + might_sleep(); + + rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_); + + preempt_disable(); + /* + * We are in an RCU-sched read-side critical section, so the writer + * cannot both change sem->state from readers_fast and start checking + * counters while we are here. So if we see !sem->state, we know that + * the writer won't be checking until we're past the preempt_enable() + * and that one the synchronize_sched() is done, the writer will see + * anything we did within this RCU-sched read-size critical section. + */ + __this_cpu_inc(*sem->read_count); + if (unlikely(!rcu_sync_is_idle(&sem->rss))) + __percpu_down_read(sem, false); /* Unconditional memory barrier */ + preempt_enable(); + /* + * The barrier() from preempt_enable() prevents the compiler from + * bleeding the critical section out. + */ +} + +static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem) +{ + int ret = 1; + + preempt_disable(); + /* + * Same as in percpu_down_read(). + */ + __this_cpu_inc(*sem->read_count); + if (unlikely(!rcu_sync_is_idle(&sem->rss))) + ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */ + preempt_enable(); + /* + * The barrier() from preempt_enable() prevents the compiler from + * bleeding the critical section out. + */ + + if (ret) + rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_); + + return ret; +} + +static inline void percpu_up_read(struct percpu_rw_semaphore *sem) +{ + /* + * The barrier() in preempt_disable() prevents the compiler from + * bleeding the critical section out. + */ + preempt_disable(); + /* + * Same as in percpu_down_read(). 
+ */ + if (likely(rcu_sync_is_idle(&sem->rss))) + __this_cpu_dec(*sem->read_count); + else + __percpu_up_read(sem); /* Unconditional memory barrier */ + preempt_enable(); + + rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_); +} extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *); + extern void percpu_free_rwsem(struct percpu_rw_semaphore *); -#define percpu_init_rwsem(brw) \ +#define percpu_init_rwsem(sem) \ ({ \ static struct lock_class_key rwsem_key; \ - __percpu_init_rwsem(brw, #brw, &rwsem_key); \ + __percpu_init_rwsem(sem, #sem, &rwsem_key); \ }) - #define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem) static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f231e0bb311c..ce182599cf2e 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -8,151 +8,186 @@ #include #include -int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, +int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *rwsem_key) { - brw->fast_read_ctr = alloc_percpu(int); - if (unlikely(!brw->fast_read_ctr)) + sem->read_count = alloc_percpu(int); + if (unlikely(!sem->read_count)) return -ENOMEM; /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - __init_rwsem(&brw->rw_sem, name, rwsem_key); - rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); - atomic_set(&brw->slow_read_ctr, 0); - init_waitqueue_head(&brw->write_waitq); + rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); + __init_rwsem(&sem->rw_sem, name, rwsem_key); + init_waitqueue_head(&sem->writer); + sem->readers_block = 0; return 0; } EXPORT_SYMBOL_GPL(__percpu_init_rwsem); -void percpu_free_rwsem(struct percpu_rw_semaphore *brw) +void percpu_free_rwsem(struct percpu_rw_semaphore *sem) { /* * XXX: temporary kludge. The error path in alloc_super() * assumes that percpu_free_rwsem() is safe after kzalloc(). */ - if (!brw->fast_read_ctr) + if (!sem->read_count) return; - rcu_sync_dtor(&brw->rss); - free_percpu(brw->fast_read_ctr); - brw->fast_read_ctr = NULL; /* catch use after free bugs */ + rcu_sync_dtor(&sem->rss); + free_percpu(sem->read_count); + sem->read_count = NULL; /* catch use after free bugs */ } +EXPORT_SYMBOL_GPL(percpu_free_rwsem); -/* - * This is the fast-path for down_read/up_read. If it succeeds we rely - * on the barriers provided by rcu_sync_enter/exit; see the comments in - * percpu_down_write() and percpu_up_write(). - * - * If this helper fails the callers rely on the normal rw_semaphore and - * atomic_dec_and_test(), so in this case we have the necessary barriers. - */ -static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) -{ - bool success; - - preempt_disable(); - success = rcu_sync_is_idle(&brw->rss); - if (likely(success)) - __this_cpu_add(*brw->fast_read_ctr, val); - preempt_enable(); - - return success; -} - -/* - * Like the normal down_read() this is not recursive, the writer can - * come after the first percpu_down_read() and create the deadlock. - * - * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, - * percpu_up_read() does rwsem_release(). This pairs with the usage - * of ->rw_sem in percpu_down/up_write(). 
- */ -void percpu_down_read(struct percpu_rw_semaphore *brw) -{ - might_sleep(); - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); - - if (likely(update_fast_ctr(brw, +1))) - return; - - /* Avoid rwsem_acquire_read() and rwsem_release() */ - __down_read(&brw->rw_sem); - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); -} -EXPORT_SYMBOL_GPL(percpu_down_read); - -int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) -{ - if (unlikely(!update_fast_ctr(brw, +1))) { - if (!__down_read_trylock(&brw->rw_sem)) - return 0; - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); - } - - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); - return 1; -} - -void percpu_up_read(struct percpu_rw_semaphore *brw) -{ - rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); - - if (likely(update_fast_ctr(brw, -1))) - return; - - /* false-positive is possible but harmless */ - if (atomic_dec_and_test(&brw->slow_read_ctr)) - wake_up_all(&brw->write_waitq); -} -EXPORT_SYMBOL_GPL(percpu_up_read); - -static int clear_fast_ctr(struct percpu_rw_semaphore *brw) -{ - unsigned int sum = 0; - int cpu; - - for_each_possible_cpu(cpu) { - sum += per_cpu(*brw->fast_read_ctr, cpu); - per_cpu(*brw->fast_read_ctr, cpu) = 0; - } - - return sum; -} - -void percpu_down_write(struct percpu_rw_semaphore *brw) +int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) { /* - * Make rcu_sync_is_idle() == F and thus disable the fast-path in - * percpu_down_read() and percpu_up_read(), and wait for gp pass. + * Due to having preemption disabled the decrement happens on + * the same CPU as the increment, avoiding the + * increment-on-one-CPU-and-decrement-on-another problem. * - * The latter synchronises us with the preceding readers which used - * the fast-past, so we can not miss the result of __this_cpu_add() - * or anything else inside their criticial sections. + * If the reader misses the writer's assignment of readers_block, then + * the writer is guaranteed to see the reader's increment. + * + * Conversely, any readers that increment their sem->read_count after + * the writer looks are guaranteed to see the readers_block value, + * which in turn means that they are guaranteed to immediately + * decrement their sem->read_count, so that it doesn't matter that the + * writer missed them. */ - rcu_sync_enter(&brw->rss); - /* exclude other writers, and block the new readers completely */ - down_write(&brw->rw_sem); + smp_mb(); /* A matches D */ - /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ - atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); + /* + * If !readers_block the critical section starts here, matched by the + * release in percpu_up_write(). + */ + if (likely(!smp_load_acquire(&sem->readers_block))) + return 1; - /* wait for all readers to complete their percpu_up_read() */ - wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); + /* + * Per the above comment; we still have preemption disabled and + * will thus decrement on the same CPU as we incremented. + */ + __percpu_up_read(sem); + + if (try) + return 0; + + /* + * We either call schedule() in the wait, or we'll fall through + * and reschedule on the preempt_enable() in percpu_down_read(). + */ + preempt_enable_no_resched(); + + /* + * Avoid lockdep for the down/up_read() we already have them. 
+ */ + __down_read(&sem->rw_sem); + this_cpu_inc(*sem->read_count); + __up_read(&sem->rw_sem); + + preempt_disable(); + return 1; +} +EXPORT_SYMBOL_GPL(__percpu_down_read); + +void __percpu_up_read(struct percpu_rw_semaphore *sem) +{ + smp_mb(); /* B matches C */ + /* + * In other words, if they see our decrement (presumably to aggregate + * zero, as that is the only time it matters) they will also see our + * critical section. + */ + __this_cpu_dec(*sem->read_count); + + /* Prod writer to recheck readers_active */ + wake_up(&sem->writer); +} +EXPORT_SYMBOL_GPL(__percpu_up_read); + +#define per_cpu_sum(var) \ +({ \ + typeof(var) __sum = 0; \ + int cpu; \ + compiletime_assert_atomic_type(__sum); \ + for_each_possible_cpu(cpu) \ + __sum += per_cpu(var, cpu); \ + __sum; \ +}) + +/* + * Return true if the modular sum of the sem->read_count per-CPU variable is + * zero. If this sum is zero, then it is stable due to the fact that if any + * newly arriving readers increment a given counter, they will immediately + * decrement that same counter. + */ +static bool readers_active_check(struct percpu_rw_semaphore *sem) +{ + if (per_cpu_sum(*sem->read_count) != 0) + return false; + + /* + * If we observed the decrement; ensure we see the entire critical + * section. + */ + + smp_mb(); /* C matches B */ + + return true; +} + +void percpu_down_write(struct percpu_rw_semaphore *sem) +{ + /* Notify readers to take the slow path. */ + rcu_sync_enter(&sem->rss); + + down_write(&sem->rw_sem); + + /* + * Notify new readers to block; up until now, and thus throughout the + * longish rcu_sync_enter() above, new readers could still come in. + */ + WRITE_ONCE(sem->readers_block, 1); + + smp_mb(); /* D matches A */ + + /* + * If they don't see our writer of readers_block, then we are + * guaranteed to see their sem->read_count increment, and therefore + * will wait for them. + */ + + /* Wait for all now active readers to complete. */ + wait_event(sem->writer, readers_active_check(sem)); } EXPORT_SYMBOL_GPL(percpu_down_write); -void percpu_up_write(struct percpu_rw_semaphore *brw) +void percpu_up_write(struct percpu_rw_semaphore *sem) { - /* release the lock, but the readers can't use the fast-path */ - up_write(&brw->rw_sem); /* - * Enable the fast-path in percpu_down_read() and percpu_up_read() - * but only after another gp pass; this adds the necessary barrier - * to ensure the reader can't miss the changes done by us. + * Signal the writer is done, no fast path yet. + * + * One reason that we cannot just immediately flip to readers_fast is + * that new readers might fail to see the results of this writer's + * critical section. + * + * Therefore we force it through the slow path which guarantees an + * acquire and thereby guarantees the critical section's consistency. */ - rcu_sync_exit(&brw->rss); + smp_store_release(&sem->readers_block, 0); + + /* + * Release the write lock, this will allow readers back in the game. + */ + up_write(&sem->rw_sem); + + /* + * Once this completes (at least one RCU-sched grace period hence) the + * reader fast path will be available again. Safe to use outside the + * exclusive write lock because its counting. 
+ */ + rcu_sync_exit(&sem->rss); } EXPORT_SYMBOL_GPL(percpu_up_write); From 0c3240a1ef2e840aaa17f593326e3642bc857aa7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Aug 2016 18:54:13 +0200 Subject: [PATCH 0257/3220] RFC: FROMLIST: cgroup: avoid synchronize_sched() in __cgroup_procs_write() The current percpu-rwsem read side is entirely free of serializing insns at the cost of having a synchronize_sched() in the write path. The latency of the synchronize_sched() is too high for cgroups. The commit 1ed1328792ff talks about the write path being a fairly cold path but this is not the case for Android which moves task to the foreground cgroup and back around binder IPC calls from foreground processes to background processes, so it is significantly hotter than human initiated operations. Switch cgroup_threadgroup_rwsem into the slow mode for now to avoid the problem, hopefully it should not be that slow after another commit 80127a39681b ("locking/percpu-rwsem: Optimize readers and reduce global impact"). We could just add rcu_sync_enter() into cgroup_init() but we do not want another synchronize_sched() at boot time, so this patch adds the new helper which doesn't block but currently can only be called before the first use. Cc: Tejun Heo Cc: Paul McKenney Reported-by: John Stultz Reported-by: Dmitry Shmidt Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Oleg Nesterov [jstultz: backported to 4.4] Change-Id: I34aa9c394d3052779b56976693e96d861bd255f2 Mailing-list-URL: https://lkml.org/lkml/2016/8/11/557 Signed-off-by: John Stultz --- include/linux/rcu_sync.h | 1 + kernel/cgroup.c | 6 ++++++ kernel/rcu/sync.c | 12 ++++++++++++ 3 files changed, 19 insertions(+) diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index a63a33e6196e..ece7ed9a4a70 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -59,6 +59,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) } extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type); +extern void rcu_sync_enter_start(struct rcu_sync *); extern void rcu_sync_enter(struct rcu_sync *); extern void rcu_sync_exit(struct rcu_sync *); extern void rcu_sync_dtor(struct rcu_sync *); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a57b7c86871c..bd541053efc2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5346,6 +5346,12 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + /* + * The latency of the synchronize_sched() is too high for cgroups, + * avoid it at the cost of forcing all readers into the slow path. + */ + rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); + mutex_lock(&cgroup_mutex); /* Add init_css_set to the hash table */ diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index be922c9f3d37..e358313a0d6c 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -82,6 +82,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) rsp->gp_type = type; } +/** + * Must be called after rcu_sync_init() and before first use. + * + * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() + * pairs turn into NO-OPs. 
+ */ +void rcu_sync_enter_start(struct rcu_sync *rsp) +{ + rsp->gp_count++; + rsp->gp_state = GP_PASSED; +} + /** * rcu_sync_enter() - Force readers onto slowpath * @rsp: Pointer to rcu_sync structure to use for synchronization From e91f1799ff2cc3883907b5f3e141507f9716ff0e Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 10 Aug 2016 15:43:06 -0400 Subject: [PATCH 0258/3220] RFC: FROMLIST: cgroup: reduce read locked section of cgroup_threadgroup_rwsem during fork cgroup_threadgroup_rwsem is acquired in read mode during process exit and fork. It is also grabbed in write mode during __cgroups_proc_write(). I've recently run into a scenario with lots of memory pressure and OOM and I am beginning to see systemd __switch_to+0x1f8/0x350 __schedule+0x30c/0x990 schedule+0x48/0xc0 percpu_down_write+0x114/0x170 __cgroup_procs_write.isra.12+0xb8/0x3c0 cgroup_file_write+0x74/0x1a0 kernfs_fop_write+0x188/0x200 __vfs_write+0x6c/0xe0 vfs_write+0xc0/0x230 SyS_write+0x6c/0x110 system_call+0x38/0xb4 This thread is waiting on the reader of cgroup_threadgroup_rwsem to exit. The reader itself is under memory pressure and has gone into reclaim after fork. There are times the reader also ends up waiting on oom_lock as well. __switch_to+0x1f8/0x350 __schedule+0x30c/0x990 schedule+0x48/0xc0 jbd2_log_wait_commit+0xd4/0x180 ext4_evict_inode+0x88/0x5c0 evict+0xf8/0x2a0 dispose_list+0x50/0x80 prune_icache_sb+0x6c/0x90 super_cache_scan+0x190/0x210 shrink_slab.part.15+0x22c/0x4c0 shrink_zone+0x288/0x3c0 do_try_to_free_pages+0x1dc/0x590 try_to_free_pages+0xdc/0x260 __alloc_pages_nodemask+0x72c/0xc90 alloc_pages_current+0xb4/0x1a0 page_table_alloc+0xc0/0x170 __pte_alloc+0x58/0x1f0 copy_page_range+0x4ec/0x950 copy_process.isra.5+0x15a0/0x1870 _do_fork+0xa8/0x4b0 ppc_clone+0x8/0xc In the meanwhile, all processes exiting/forking are blocked almost stalling the system. This patch moves the threadgroup_change_begin from before cgroup_fork() to just before cgroup_canfork(). There is no need to worry about threadgroup changes till the task is actually added to the threadgroup. This avoids having to call reclaim with cgroup_threadgroup_rwsem held. tj: Subject and description edits. Signed-off-by: Balbir Singh Acked-by: Zefan Li Cc: Oleg Nesterov Cc: Andrew Morton Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Tejun Heo [jstultz: Cherry-picked from: git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 568ac888215c7f] Change-Id: Ie8ece84fb613cf6a7b08cea1468473a8df2b9661 Signed-off-by: John Stultz --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 7ec6e9939b2c..d6a6da547e41 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1370,7 +1370,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; - threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1522,6 +1521,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; + threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked.
It should be noted the the new process's css_set can be changed @@ -1622,6 +1622,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_cancel_cgroup: cgroup_cancel_fork(p, cgrp_ss_priv); bad_fork_free_pid: + threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_io: @@ -1652,7 +1653,6 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif - threadgroup_change_end(current); delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); From ba52437821164deacf9af90d559f2e3f4888ff28 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 14 Jul 2016 11:38:40 -0400 Subject: [PATCH 0259/3220] BACKPORT: tcp: enable per-socket rate limiting of all 'challenge acks' (cherry picked from commit 083ae308280d13d187512b9babe3454342a7987e) The per-socket rate limit for 'challenge acks' was introduced in the context of limiting ack loops: commit f2b2c582e824 ("tcp: mitigate ACK loops for connections as tcp_sock") And I think it can be extended to rate limit all 'challenge acks' on a per-socket basis. Since we have the global tcp_challenge_ack_limit, this patch allows for tcp_challenge_ack_limit to be set to a large value and effectively rely on the per-socket limit, or set tcp_challenge_ack_limit to a lower value and still prevents a single connections from consuming the entire challenge ack quota. It further moves in the direction of eliminating the global limit at some point, as Eric Dumazet has suggested. This a follow-up to: Subject: tcp: make challenge acks less predictable Cc: Eric Dumazet Cc: David S. Miller Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Yue Cao Signed-off-by: Jason Baron Signed-off-by: David S. Miller Change-Id: I622d5ae96e9387e775a0196c892d8d0e1a5564a7 Signed-off-by: Amit Pundir --- net/ipv4/tcp_input.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2014a701663a..2bf1110aa2ae 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3391,6 +3391,23 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 return flag; } +static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, + u32 *last_oow_ack_time) +{ + if (*last_oow_ack_time) { + s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); + + if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + NET_INC_STATS_BH(net, mib_idx); + return true; /* rate-limited: don't send yet! */ + } + } + + *last_oow_ack_time = tcp_time_stamp; + + return false; /* not rate-limited: go ahead, send dupack now! */ +} + /* Return true if we're currently rate-limiting out-of-window ACKs and * thus shouldn't send a dupack right now. We rate-limit dupacks in * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS @@ -3404,21 +3421,9 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, /* Data packets without SYNs are not likely part of an ACK loop. */ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && !tcp_hdr(skb)->syn) - goto not_rate_limited; + return false; - if (*last_oow_ack_time) { - s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); - - if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { - NET_INC_STATS_BH(net, mib_idx); - return true; /* rate-limited: don't send yet! */ - } - } - - *last_oow_ack_time = tcp_time_stamp; - -not_rate_limited: - return false; /* not rate-limited: go ahead, send dupack now! 
*/ + return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); } /* RFC 5961 7 [ACK Throttling] */ @@ -3431,9 +3436,9 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) u32 count, now; /* First check our per-socket dupack rate limit. */ - if (tcp_oow_rate_limited(sock_net(sk), skb, - LINUX_MIB_TCPACKSKIPPEDCHALLENGE, - &tp->last_oow_ack_time)) + if (__tcp_oow_rate_limited(sock_net(sk), + LINUX_MIB_TCPACKSKIPPEDCHALLENGE, + &tp->last_oow_ack_time)) return; /* Then check host-wide RFC 5961 rate limit. */ From 258a8f519a47a2b3c0a42940182715fc0de690e7 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 29 Aug 2016 17:31:10 -0700 Subject: [PATCH 0260/3220] UPSTREAM: Input: powermate - fix oops with malicious USB descriptors The powermate driver expects at least one valid USB endpoint in its probe function. If given malicious descriptors that specify 0 for the number of endpoints, it will crash. Validate the number of endpoints on the interface before using them. The full report for this issue can be found here: http://seclists.org/bugtraq/2016/Mar/85 BUG: 28242610 Reported-by: Ralf Spenneberg Signed-off-by: Josh Boyer Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: I1cb956a35f3bba73324240d5bd0a029f49d3c456 --- drivers/input/misc/powermate.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/misc/powermate.c b/drivers/input/misc/powermate.c index 63b539d3daba..84909a12ff36 100644 --- a/drivers/input/misc/powermate.c +++ b/drivers/input/misc/powermate.c @@ -307,6 +307,9 @@ static int powermate_probe(struct usb_interface *intf, const struct usb_device_i int error = -ENOMEM; interface = intf->cur_altsetting; + if (interface->desc.bNumEndpoints < 1) + return -EINVAL; + endpoint = &interface->endpoint[0].desc; if (!usb_endpoint_is_int_in(endpoint)) return -EIO; From 125d90ca2fd862ecb119a085d95f813a6263d218 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 29 Aug 2016 17:33:52 -0700 Subject: [PATCH 0261/3220] UPSTREAM: USB: cypress_m8: add endpoint sanity check commit c55aee1bf0e6b6feec8b2927b43f7a09a6d5f754 upstream. An attack using missing endpoints exists. 
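As in the other endpoint fixes in this series, the idiom is to validate the descriptors once at probe time and refuse to bind, rather than discovering a missing URB later in open(). A minimal sketch of the pattern (hypothetical driver; only the 4.4-era usb_serial_port fields also used in the diff below are assumed):

	#include <linux/errno.h>
	#include <linux/usb/serial.h>

	static int example_port_probe(struct usb_serial_port *port)
	{
		/*
		 * Forged descriptors that declare too few endpoints leave
		 * these URBs unallocated; fail the probe instead of
		 * dereferencing a NULL pointer on first open().
		 */
		if (!port->interrupt_out_urb || !port->interrupt_in_urb)
			return -ENODEV;
		return 0;
	}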
CVE-2016-3137 BUG: 28242610 Signed-off-by: Oliver Neukum Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: I1cc7957a5924175d24f12fdc41162ece67c907e5 --- drivers/usb/serial/cypress_m8.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c index 01bf53392819..244acb1299a9 100644 --- a/drivers/usb/serial/cypress_m8.c +++ b/drivers/usb/serial/cypress_m8.c @@ -447,6 +447,11 @@ static int cypress_generic_port_probe(struct usb_serial_port *port) struct usb_serial *serial = port->serial; struct cypress_private *priv; + if (!port->interrupt_out_urb || !port->interrupt_in_urb) { + dev_err(&port->dev, "required endpoint is missing\n"); + return -ENODEV; + } + priv = kzalloc(sizeof(struct cypress_private), GFP_KERNEL); if (!priv) return -ENOMEM; @@ -606,12 +611,6 @@ static int cypress_open(struct tty_struct *tty, struct usb_serial_port *port) cypress_set_termios(tty, port, &priv->tmp_termios); /* setup the port and start reading from the device */ - if (!port->interrupt_in_urb) { - dev_err(&port->dev, "%s - interrupt_in_urb is empty!\n", - __func__); - return -1; - } - usb_fill_int_urb(port->interrupt_in_urb, serial->dev, usb_rcvintpipe(serial->dev, port->interrupt_in_endpointAddress), port->interrupt_in_urb->transfer_buffer, From ee5c7a9739c6ddd0d54385c313b5d6f87652cd63 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 30 Aug 2016 13:33:55 -0700 Subject: [PATCH 0262/3220] UPSTREAM: USB: mct_u232: add sanity checking in probe commit 4e9a0b05257f29cf4b75f3209243ed71614d062e upstream. An attack using the lack of sanity checking in probe is known. This patch checks for the existence of a second port. CVE-2016-3136 BUG: 28242610 Signed-off-by: Oliver Neukum [johan: add error message ] Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: I284ad648c2087c34a098d67e0cc6d948a568413c --- drivers/usb/serial/mct_u232.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/usb/serial/mct_u232.c b/drivers/usb/serial/mct_u232.c index fd707d6a10e2..89726f702202 100644 --- a/drivers/usb/serial/mct_u232.c +++ b/drivers/usb/serial/mct_u232.c @@ -376,14 +376,21 @@ static void mct_u232_msr_to_state(struct usb_serial_port *port, static int mct_u232_port_probe(struct usb_serial_port *port) { + struct usb_serial *serial = port->serial; struct mct_u232_private *priv; + /* check first to simplify error handling */ + if (!serial->port[1] || !serial->port[1]->interrupt_in_urb) { + dev_err(&port->dev, "expected endpoint missing\n"); + return -ENODEV; + } + priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; /* Use second interrupt-in endpoint for reading. */ - priv->read_urb = port->serial->port[1]->interrupt_in_urb; + priv->read_urb = serial->port[1]->interrupt_in_urb; priv->read_urb->context = port; spin_lock_init(&priv->lock); From ae6790121ceb4608e1a1dec10e285c8d661463d5 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 30 Aug 2016 13:35:32 -0700 Subject: [PATCH 0263/3220] UPSTREAM: USB: usb_driver_claim_interface: add sanity checking commit 0b818e3956fc1ad976bee791eadcbb3b5fec5bfd upstream. Attacks that trick drivers into passing a NULL pointer to usb_driver_claim_interface() using forged descriptors are known. This thwarts them by sanity checking. 
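In practice the NULL arrives via a sibling-interface lookup on a forged configuration; a hedged sketch of the calling pattern being hardened (hypothetical probe fragment; usb_ifnum_to_if() returns NULL when the numbered interface does not exist):

	#include <linux/usb.h>

	static int example_claim_data_intf(struct usb_driver *drv,
					   struct usb_device *udev, void *priv)
	{
		/* NULL if the forged configuration omits interface 1. */
		struct usb_interface *data = usb_ifnum_to_if(udev, 1);

		/*
		 * Previously the first dereference (&iface->dev) inside
		 * usb_driver_claim_interface() would oops; with this patch
		 * a NULL interface is rejected with -ENODEV instead.
		 */
		return usb_driver_claim_interface(drv, data, priv);
	}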
BUG: 28242610 Signed-off-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: Ib43ec5edb156985a9db941785a313f6801df092a --- drivers/usb/core/driver.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index 56593a9a8726..2057d91d8336 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -502,11 +502,15 @@ static int usb_unbind_interface(struct device *dev) int usb_driver_claim_interface(struct usb_driver *driver, struct usb_interface *iface, void *priv) { - struct device *dev = &iface->dev; + struct device *dev; struct usb_device *udev; int retval = 0; int lpm_disable_error; + if (!iface) + return -ENODEV; + + dev = &iface->dev; if (dev->driver) return -EBUSY; From b24e99b3ae1e3860d374d0ed497a371100533e83 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 30 Aug 2016 13:37:07 -0700 Subject: [PATCH 0264/3220] UPSTREAM: USB: iowarrior: fix oops with malicious USB descriptors commit 4ec0ef3a82125efc36173062a50624550a900ae0 upstream. The iowarrior driver expects at least one valid endpoint. If given malicious descriptors that specify 0 for the number of endpoints, it will crash in the probe function. Ensure there is at least one endpoint on the interface before using it. The full report of this issue can be found here: http://seclists.org/bugtraq/2016/Mar/87 BUG: 28242610 Reported-by: Ralf Spenneberg Signed-off-by: Josh Boyer Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: If5161c23928e9ef77cb3359cba9b36622b1908df --- drivers/usb/misc/iowarrior.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index c6bfd13f6c92..1950e87b4219 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -787,6 +787,12 @@ static int iowarrior_probe(struct usb_interface *interface, iface_desc = interface->cur_altsetting; dev->product_id = le16_to_cpu(udev->descriptor.idProduct); + if (iface_desc->desc.bNumEndpoints < 1) { + dev_err(&interface->dev, "Invalid number of endpoints\n"); + retval = -EINVAL; + goto error; + } + /* set up the endpoint information */ for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { endpoint = &iface_desc->endpoint[i].desc; From 6087158f06e3cc66897cb4571cb673ac9fffb12d Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 30 Aug 2016 13:39:02 -0700 Subject: [PATCH 0265/3220] UPSTREAM: USB: cdc-acm: more sanity checking commit 8835ba4a39cf53f705417b3b3a94eb067673f2c9 upstream. An attack has become available which pretends to be a quirky device circumventing normal sanity checks and crashes the kernel by an insufficient number of interfaces. This patch adds a check to the code path for quirky devices. 
BUG: 28242610 Signed-off-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: I9a5f7f3c704b65e866335054f470451fcfae9d1c --- drivers/usb/class/cdc-acm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 26ca4f910cb0..a7732f80a912 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1113,6 +1113,9 @@ static int acm_probe(struct usb_interface *intf, if (quirks == NO_UNION_NORMAL) { data_interface = usb_ifnum_to_if(usb_dev, 1); control_interface = usb_ifnum_to_if(usb_dev, 0); + /* we would crash */ + if (!data_interface || !control_interface) + return -ENODEV; goto skip_normal_probe; } From 4547c0f93518197834f7fc11c6558b566b7e14c8 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 31 Aug 2016 09:52:16 -0700 Subject: [PATCH 0266/3220] ANDROID: rcu_sync: Export rcu_sync_lockdep_assert x86_64:allmodconfig fails to build with the following error. ERROR: "rcu_sync_lockdep_assert" [kernel/locking/locktorture.ko] undefined! Introduced by commit 3228c5eb7af2 ("RFC: FROMLIST: locking/percpu-rwsem: Optimize readers and reduce global impact"). The applied upstream version exports the missing symbol, so let's do the same. Change-Id: If4e516715c3415fe8c82090f287174857561550d Fixes: 3228c5eb7af2 ("RFC: FROMLIST: locking/percpu-rwsem: Optimize ...") Signed-off-by: Guenter Roeck --- kernel/rcu/sync.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index e358313a0d6c..b49cf3ac2d47 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -68,6 +68,7 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp) RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), "suspicious rcu_sync_is_idle() usage"); } +EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); #endif /** From a49dcf2e745c25a67a76d8a1e9658c16827c9885 Mon Sep 17 00:00:00 2001 From: Yongqin Liu Date: Thu, 1 Sep 2016 22:06:04 +0530 Subject: [PATCH 0267/3220] ANDROID: base-cfg: enable SECCOMP config Enable following seccomp configs CONFIG_SECCOMP=y CONFIG_SECCOMP_FILTER=y Otherwise we will get mediacode error like this on Android N: E /system/bin/mediaextractor: libminijail: prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER): Invalid argument Change-Id: I2477b6a2cfdded5c0ebf6ffbb6150b0e5fe2ba12 Signed-off-by: Yongqin Liu Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 6496bb3961a2..288bab2d858d 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -141,6 +141,8 @@ CONFIG_PROFILING=y CONFIG_QUOTA=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y +CONFIG_SECCOMP=y +CONFIG_SECCOMP_FILTER=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y From af4104512cdb48d5eede620871d6a964a8680253 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 6 Apr 2016 14:06:48 +0100 Subject: [PATCH 0268/3220] UPSTREAM: assoc_array: don't call compare_object() on a node (cherry picked from commit 8d4a2ec1e0b41b0cf9a0c5cd4511da7f8e4f3de2) Changes since V1: fixed the description and added KASan warning. In assoc_array_insert_into_terminal_node(), we call the compare_object() method on all non-empty slots, even when they're not leaves, passing a pointer to an unexpected structure to compare_object(). Currently it causes an out-of-bound read access in keyring_compare_object detected by KASan (see below). 
The issue is easily reproduced with keyutils testsuite. Only call compare_object() when the slot is a leave. KASan warning: ================================================================== BUG: KASAN: slab-out-of-bounds in keyring_compare_object+0x213/0x240 at addr ffff880060a6f838 Read of size 8 by task keyctl/1655 ============================================================================= BUG kmalloc-192 (Not tainted): kasan: bad access detected ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in assoc_array_insert+0xfd0/0x3a60 age=69 cpu=1 pid=1647 ___slab_alloc+0x563/0x5c0 __slab_alloc+0x51/0x90 kmem_cache_alloc_trace+0x263/0x300 assoc_array_insert+0xfd0/0x3a60 __key_link_begin+0xfc/0x270 key_create_or_update+0x459/0xaf0 SyS_add_key+0x1ba/0x350 entry_SYSCALL_64_fastpath+0x12/0x76 INFO: Slab 0xffffea0001829b80 objects=16 used=8 fp=0xffff880060a6f550 flags=0x3fff8000004080 INFO: Object 0xffff880060a6f740 @offset=5952 fp=0xffff880060a6e5d1 Bytes b4 ffff880060a6f730: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f740: d1 e5 a6 60 00 88 ff ff 0e 00 00 00 00 00 00 00 ...`............ Object ffff880060a6f750: 02 cf 8e 60 00 88 ff ff 02 c0 8e 60 00 88 ff ff ...`.......`.... Object ffff880060a6f760: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f770: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f780: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f790: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7a0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7b0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7c0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7d0: 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7f0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ CPU: 0 PID: 1655 Comm: keyctl Tainted: G B 4.5.0-rc4-kasan+ #291 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 0000000000000000 000000001b2800b4 ffff880060a179e0 ffffffff81b60491 ffff88006c802900 ffff880060a6f740 ffff880060a17a10 ffffffff815e2969 ffff88006c802900 ffffea0001829b80 ffff880060a6f740 ffff880060a6e650 Call Trace: [] dump_stack+0x85/0xc4 [] print_trailer+0xf9/0x150 [] object_err+0x34/0x40 [] kasan_report_error+0x230/0x550 [] ? keyring_get_key_chunk+0x13e/0x210 [] __asan_report_load_n_noabort+0x5d/0x70 [] ? keyring_compare_object+0x213/0x240 [] keyring_compare_object+0x213/0x240 [] assoc_array_insert+0x86c/0x3a60 [] ? assoc_array_cancel_edit+0x70/0x70 [] ? __key_link_begin+0x20d/0x270 [] __key_link_begin+0xfc/0x270 [] key_create_or_update+0x459/0xaf0 [] ? trace_hardirqs_on+0xd/0x10 [] ? key_type_lookup+0xc0/0xc0 [] ? lookup_user_key+0x13d/0xcd0 [] ? memdup_user+0x53/0x80 [] SyS_add_key+0x1ba/0x350 [] ? key_get_type_from_user.constprop.6+0xa0/0xa0 [] ? retint_user+0x18/0x23 [] ? trace_hardirqs_on_caller+0x3fe/0x580 [] ? 
trace_hardirqs_on_thunk+0x17/0x19 [] entry_SYSCALL_64_fastpath+0x12/0x76 Memory state around the buggy address: ffff880060a6f700: fc fc fc fc fc fc fc fc 00 00 00 00 00 00 00 00 ffff880060a6f780: 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc fc >ffff880060a6f800: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ^ ffff880060a6f880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff880060a6f900: fc fc fc fc fc fc 00 00 00 00 00 00 00 00 00 00 ================================================================== Signed-off-by: Jerome Marchand Signed-off-by: David Howells cc: stable@vger.kernel.org Change-Id: I903935a221a5b9fb14cec14ef64bd2b6fa8eb222 Bug: 30513364 --- lib/assoc_array.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/assoc_array.c b/lib/assoc_array.c index 03dd576e6773..59fd7c0b119c 100644 --- a/lib/assoc_array.c +++ b/lib/assoc_array.c @@ -524,7 +524,9 @@ static bool assoc_array_insert_into_terminal_node(struct assoc_array_edit *edit, free_slot = i; continue; } - if (ops->compare_object(assoc_array_ptr_to_leaf(ptr), index_key)) { + if (assoc_array_ptr_is_leaf(ptr) && + ops->compare_object(assoc_array_ptr_to_leaf(ptr), + index_key)) { pr_devel("replace in slot %d\n", i); edit->leaf_p = &node->slots[i]; edit->dead_leaf = node->slots[i]; From 687549af9e5df34f8071c32aced7efae0b46fb58 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 29 Jul 2016 10:40:31 +0200 Subject: [PATCH 0269/3220] UPSTREAM: block: fix use-after-free in seq file (cherry picked from commit 77da160530dd1dc94f6ae15a981f24e5f0021e84) I got a KASAN report of use-after-free: ================================================================== BUG: KASAN: use-after-free in klist_iter_exit+0x61/0x70 at addr ffff8800b6581508 Read of size 8 by task trinity-c1/315 ============================================================================= BUG kmalloc-32 (Not tainted): kasan: bad access detected ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in disk_seqf_start+0x66/0x110 age=144 cpu=1 pid=315 ___slab_alloc+0x4f1/0x520 __slab_alloc.isra.58+0x56/0x80 kmem_cache_alloc_trace+0x260/0x2a0 disk_seqf_start+0x66/0x110 traverse+0x176/0x860 seq_read+0x7e3/0x11a0 proc_reg_read+0xbc/0x180 do_loop_readv_writev+0x134/0x210 do_readv_writev+0x565/0x660 vfs_readv+0x67/0xa0 do_preadv+0x126/0x170 SyS_preadv+0xc/0x10 do_syscall_64+0x1a1/0x460 return_from_SYSCALL_64+0x0/0x6a INFO: Freed in disk_seqf_stop+0x42/0x50 age=160 cpu=1 pid=315 __slab_free+0x17a/0x2c0 kfree+0x20a/0x220 disk_seqf_stop+0x42/0x50 traverse+0x3b5/0x860 seq_read+0x7e3/0x11a0 proc_reg_read+0xbc/0x180 do_loop_readv_writev+0x134/0x210 do_readv_writev+0x565/0x660 vfs_readv+0x67/0xa0 do_preadv+0x126/0x170 SyS_preadv+0xc/0x10 do_syscall_64+0x1a1/0x460 return_from_SYSCALL_64+0x0/0x6a CPU: 1 PID: 315 Comm: trinity-c1 Tainted: G B 4.7.0+ #62 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 ffffea0002d96000 ffff880119b9f918 ffffffff81d6ce81 ffff88011a804480 ffff8800b6581500 ffff880119b9f948 ffffffff8146c7bd ffff88011a804480 ffffea0002d96000 ffff8800b6581500 fffffffffffffff4 ffff880119b9f970 Call Trace: [] dump_stack+0x65/0x84 [] print_trailer+0x10d/0x1a0 [] object_err+0x2f/0x40 [] kasan_report_error+0x221/0x520 [] __asan_report_load8_noabort+0x3e/0x40 [] klist_iter_exit+0x61/0x70 [] class_dev_iter_exit+0x9/0x10 [] disk_seqf_stop+0x3a/0x50 [] seq_read+0x4b2/0x11a0 [] proc_reg_read+0xbc/0x180 [] 
do_loop_readv_writev+0x134/0x210 [] do_readv_writev+0x565/0x660 [] vfs_readv+0x67/0xa0 [] do_preadv+0x126/0x170 [] SyS_preadv+0xc/0x10 This problem can occur in the following situation: open() - pread() - .seq_start() - iter = kmalloc() // succeeds - seqf->private = iter - .seq_stop() - kfree(seqf->private) - pread() - .seq_start() - iter = kmalloc() // fails - .seq_stop() - class_dev_iter_exit(seqf->private) // boom! old pointer As the comment in disk_seqf_stop() says, stop is called even if start failed, so we need to reinitialise the private pointer to NULL when seq iteration stops. An alternative would be to set the private pointer to NULL when the kmalloc() in disk_seqf_start() fails. Cc: stable@vger.kernel.org Signed-off-by: Vegard Nossum Acked-by: Tejun Heo Signed-off-by: Jens Axboe Change-Id: I07b33f4b38341f60a37806cdd45b0a0c3ab4d84d Bug: 30942273 --- block/genhd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/genhd.c b/block/genhd.c index 817c67c9e45e..82bc52cad1c1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -831,6 +831,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v) if (iter) { class_dev_iter_exit(iter); kfree(iter); + seqf->private = NULL; } } From c7a407d41ed5d2c6aa0c1892e1d9ebee58d5a928 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 28 Jan 2016 09:22:44 -0200 Subject: [PATCH 0270/3220] UPSTREAM: [media] xc2028: avoid use after free (cherry picked from commit 8dfbcc4351a0b6d2f2d77f367552f48ffefafe18) If struct xc2028_config is passed without a firmware name, the following trouble may happen: [11009.907205] xc2028 5-0061: type set to XCeive xc2028/xc3028 tuner [11009.907491] ================================================================== [11009.907750] BUG: KASAN: use-after-free in strcmp+0x96/0xb0 at addr ffff8803bd78ab40 [11009.907992] Read of size 1 by task modprobe/28992 [11009.907994] ============================================================================= [11009.907997] BUG kmalloc-16 (Tainted: G W ): kasan: bad access detected [11009.907999] ----------------------------------------------------------------------------- [11009.908008] INFO: Allocated in xhci_urb_enqueue+0x214/0x14c0 [xhci_hcd] age=0 cpu=3 pid=28992 [11009.908012] ___slab_alloc+0x581/0x5b0 [11009.908014] __slab_alloc+0x51/0x90 [11009.908017] __kmalloc+0x27b/0x350 [11009.908022] xhci_urb_enqueue+0x214/0x14c0 [xhci_hcd] [11009.908026] usb_hcd_submit_urb+0x1e8/0x1c60 [11009.908029] usb_submit_urb+0xb0e/0x1200 [11009.908032] usb_serial_generic_write_start+0xb6/0x4c0 [11009.908035] usb_serial_generic_write+0x92/0xc0 [11009.908039] usb_console_write+0x38a/0x560 [11009.908045] call_console_drivers.constprop.14+0x1ee/0x2c0 [11009.908051] console_unlock+0x40d/0x900 [11009.908056] vprintk_emit+0x4b4/0x830 [11009.908061] vprintk_default+0x1f/0x30 [11009.908064] printk+0x99/0xb5 [11009.908067] kasan_report_error+0x10a/0x550 [11009.908070] __asan_report_load1_noabort+0x43/0x50 [11009.908074] INFO: Freed in xc2028_set_config+0x90/0x630 [tuner_xc2028] age=1 cpu=3 pid=28992 [11009.908077] __slab_free+0x2ec/0x460 [11009.908080] kfree+0x266/0x280 [11009.908083] xc2028_set_config+0x90/0x630 [tuner_xc2028] [11009.908086] xc2028_attach+0x310/0x8a0 [tuner_xc2028] [11009.908090] em28xx_attach_xc3028.constprop.7+0x1f9/0x30d [em28xx_dvb] [11009.908094] em28xx_dvb_init.part.3+0x8e4/0x5cf4 [em28xx_dvb] [11009.908098] em28xx_dvb_init+0x81/0x8a [em28xx_dvb] [11009.908101] em28xx_register_extension+0xd9/0x190 [em28xx] [11009.908105] em28xx_dvb_register+0x10/0x1000 [em28xx_dvb] 
[11009.908108] do_one_initcall+0x141/0x300 [11009.908111] do_init_module+0x1d0/0x5ad [11009.908114] load_module+0x6666/0x9ba0 [11009.908117] SyS_finit_module+0x108/0x130 [11009.908120] entry_SYSCALL_64_fastpath+0x16/0x76 [11009.908123] INFO: Slab 0xffffea000ef5e280 objects=25 used=25 fp=0x (null) flags=0x2ffff8000004080 [11009.908126] INFO: Object 0xffff8803bd78ab40 @offset=2880 fp=0x0000000000000001 [11009.908130] Bytes b4 ffff8803bd78ab30: 01 00 00 00 2a 07 00 00 9d 28 00 00 01 00 00 00 ....*....(...... [11009.908133] Object ffff8803bd78ab40: 01 00 00 00 00 00 00 00 b0 1d c3 6a 00 88 ff ff ...........j.... [11009.908137] CPU: 3 PID: 28992 Comm: modprobe Tainted: G B W 4.5.0-rc1+ #43 [11009.908140] Hardware name: /NUC5i7RYB, BIOS RYBDWi35.86A.0350.2015.0812.1722 08/12/2015 [11009.908142] ffff8803bd78a000 ffff8802c273f1b8 ffffffff81932007 ffff8803c6407a80 [11009.908148] ffff8802c273f1e8 ffffffff81556759 ffff8803c6407a80 ffffea000ef5e280 [11009.908153] ffff8803bd78ab40 dffffc0000000000 ffff8802c273f210 ffffffff8155ccb4 [11009.908158] Call Trace: [11009.908162] [] dump_stack+0x4b/0x64 [11009.908165] [] print_trailer+0xf9/0x150 [11009.908168] [] object_err+0x34/0x40 [11009.908171] [] kasan_report_error+0x230/0x550 [11009.908175] [] ? trace_hardirqs_off_caller+0x21/0x290 [11009.908179] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908182] [] __asan_report_load1_noabort+0x43/0x50 [11009.908185] [] ? __asan_register_globals+0x50/0xa0 [11009.908189] [] ? strcmp+0x96/0xb0 [11009.908192] [] strcmp+0x96/0xb0 [11009.908196] [] xc2028_set_config+0x15c/0x630 [tuner_xc2028] [11009.908200] [] xc2028_attach+0x310/0x8a0 [tuner_xc2028] [11009.908203] [] ? memset+0x28/0x30 [11009.908206] [] ? xc2028_set_config+0x630/0x630 [tuner_xc2028] [11009.908211] [] em28xx_attach_xc3028.constprop.7+0x1f9/0x30d [em28xx_dvb] [11009.908215] [] ? em28xx_dvb_init.part.3+0x37c/0x5cf4 [em28xx_dvb] [11009.908219] [] ? hauppauge_hvr930c_init+0x487/0x487 [em28xx_dvb] [11009.908222] [] ? lgdt330x_attach+0x1cc/0x370 [lgdt330x] [11009.908226] [] ? i2c_read_demod_bytes.isra.2+0x210/0x210 [lgdt330x] [11009.908230] [] ? ref_module.part.15+0x10/0x10 [11009.908233] [] ? module_assert_mutex_or_preempt+0x80/0x80 [11009.908238] [] em28xx_dvb_init.part.3+0x8e4/0x5cf4 [em28xx_dvb] [11009.908242] [] ? em28xx_attach_xc3028.constprop.7+0x30d/0x30d [em28xx_dvb] [11009.908245] [] ? string+0x14d/0x1f0 [11009.908249] [] ? symbol_string+0xff/0x1a0 [11009.908253] [] ? uuid_string+0x6f0/0x6f0 [11009.908257] [] ? __kernel_text_address+0x7e/0xa0 [11009.908260] [] ? print_context_stack+0x7f/0xf0 [11009.908264] [] ? __module_address+0xb6/0x360 [11009.908268] [] ? is_ftrace_trampoline+0x99/0xe0 [11009.908271] [] ? __kernel_text_address+0x7e/0xa0 [11009.908275] [] ? debug_check_no_locks_freed+0x290/0x290 [11009.908278] [] ? dump_trace+0x11b/0x300 [11009.908282] [] ? em28xx_register_extension+0x23/0x190 [em28xx] [11009.908285] [] ? trace_hardirqs_off_caller+0x21/0x290 [11009.908289] [] ? trace_hardirqs_on_caller+0x16/0x590 [11009.908292] [] ? trace_hardirqs_on+0xd/0x10 [11009.908296] [] ? em28xx_register_extension+0x23/0x190 [em28xx] [11009.908299] [] ? mutex_trylock+0x400/0x400 [11009.908302] [] ? do_one_initcall+0x131/0x300 [11009.908306] [] ? call_rcu_sched+0x17/0x20 [11009.908309] [] ? put_object+0x48/0x70 [11009.908314] [] em28xx_dvb_init+0x81/0x8a [em28xx_dvb] [11009.908317] [] em28xx_register_extension+0xd9/0x190 [em28xx] [11009.908320] [] ? 
0xffffffffa0150000 [11009.908324] [] em28xx_dvb_register+0x10/0x1000 [em28xx_dvb] [11009.908327] [] do_one_initcall+0x141/0x300 [11009.908330] [] ? try_to_run_init_process+0x40/0x40 [11009.908333] [] ? trace_hardirqs_on_caller+0x16/0x590 [11009.908337] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908340] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908343] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908346] [] ? __asan_register_globals+0x87/0xa0 [11009.908350] [] do_init_module+0x1d0/0x5ad [11009.908353] [] load_module+0x6666/0x9ba0 [11009.908356] [] ? symbol_put_addr+0x50/0x50 [11009.908361] [] ? em28xx_dvb_init.part.3+0x5989/0x5cf4 [em28xx_dvb] [11009.908366] [] ? module_frob_arch_sections+0x20/0x20 [11009.908369] [] ? open_exec+0x50/0x50 [11009.908374] [] ? ns_capable+0x5b/0xd0 [11009.908377] [] SyS_finit_module+0x108/0x130 [11009.908379] [] ? SyS_init_module+0x1f0/0x1f0 [11009.908383] [] ? lockdep_sys_exit_thunk+0x12/0x14 [11009.908394] [] entry_SYSCALL_64_fastpath+0x16/0x76 [11009.908396] Memory state around the buggy address: [11009.908398] ffff8803bd78aa00: 00 00 fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908401] ffff8803bd78aa80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908403] >ffff8803bd78ab00: fc fc fc fc fc fc fc fc 00 00 fc fc fc fc fc fc [11009.908405] ^ [11009.908407] ffff8803bd78ab80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908409] ffff8803bd78ac00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908411] ================================================================== In order to avoid it, let's set the cached value of the firmware name to NULL after freeing it. While here, return an error if the memory allocation fails. Signed-off-by: Mauro Carvalho Chehab Change-Id: I945c841dcfb45de2056267e4aa50bbe176b527cf Bug: 30946097 --- drivers/media/tuners/tuner-xc2028.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/tuners/tuner-xc2028.c b/drivers/media/tuners/tuner-xc2028.c index 4e941f00b600..082ff5608455 100644 --- a/drivers/media/tuners/tuner-xc2028.c +++ b/drivers/media/tuners/tuner-xc2028.c @@ -1403,11 +1403,12 @@ static int xc2028_set_config(struct dvb_frontend *fe, void *priv_cfg) * in order to avoid troubles during device release. */ kfree(priv->ctrl.fname); + priv->ctrl.fname = NULL; memcpy(&priv->ctrl, p, sizeof(priv->ctrl)); if (p->fname) { priv->ctrl.fname = kstrdup(p->fname, GFP_KERNEL); if (priv->ctrl.fname == NULL) - rc = -ENOMEM; + return -ENOMEM; } /* From 30439e51fc6f9ab169f7cb9e8e88faf7fb27ae99 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 3 Feb 2016 13:34:00 -0200 Subject: [PATCH 0271/3220] UPSTREAM: [media] xc2028: unlock on error in xc2028_set_config() (cherry picked from commit 210bd104c6acd31c3c6b8b075b3f12d4a9f6b60d) We have to unlock before returning -ENOMEM. 
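The fix below restores the usual goto-unlock idiom, so every failure path drops the mutex before returning; the general shape, as a self-contained sketch (hypothetical structure and function names, not the xc2028 code itself):

	#include <linux/errno.h>
	#include <linux/mutex.h>
	#include <linux/slab.h>
	#include <linux/string.h>

	struct cfg_priv {
		struct mutex lock;
		char *fname;
	};

	static int cfg_set_fname(struct cfg_priv *priv, const char *fname)
	{
		int rc = 0;

		mutex_lock(&priv->lock);
		kfree(priv->fname);
		priv->fname = kstrdup(fname, GFP_KERNEL);
		if (!priv->fname) {
			rc = -ENOMEM;
			goto unlock;	/* never return with the lock held */
		}
		/* further updates, still under the lock */
	unlock:
		mutex_unlock(&priv->lock);
		return rc;
	}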
Fixes: 8dfbcc4351a0 ('[media] xc2028: avoid use after free') Signed-off-by: Dan Carpenter Signed-off-by: Mauro Carvalho Chehab Change-Id: I7b6ba9fde5c6e29467e6de23d398af2fe56e2547 Bug: 30946097 --- drivers/media/tuners/tuner-xc2028.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/media/tuners/tuner-xc2028.c b/drivers/media/tuners/tuner-xc2028.c index 082ff5608455..317ef63ee789 100644 --- a/drivers/media/tuners/tuner-xc2028.c +++ b/drivers/media/tuners/tuner-xc2028.c @@ -1407,8 +1407,10 @@ static int xc2028_set_config(struct dvb_frontend *fe, void *priv_cfg) memcpy(&priv->ctrl, p, sizeof(priv->ctrl)); if (p->fname) { priv->ctrl.fname = kstrdup(p->fname, GFP_KERNEL); - if (priv->ctrl.fname == NULL) - return -ENOMEM; + if (priv->ctrl.fname == NULL) { + rc = -ENOMEM; + goto unlock; + } } /* @@ -1440,6 +1442,7 @@ static int xc2028_set_config(struct dvb_frontend *fe, void *priv_cfg) } else priv->state = XC2028_WAITING_FIRMWARE; } +unlock: mutex_unlock(&priv->lock); return rc; From a4e59955acf0ae25d805e0a0d251ac8f4b1808e7 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 5 May 2016 16:22:26 -0700 Subject: [PATCH 0272/3220] UPSTREAM: proc: prevent accessing /proc//environ until it's ready (cherry picked from commit 8148a73c9901a8794a50f950083c00ccf97d43b3) If /proc//environ gets read before the envp[] array is fully set up in create_{aout,elf,elf_fdpic,flat}_tables(), we might end up trying to read more bytes than are actually written, as env_start will already be set but env_end will still be zero, making the range calculation underflow, allowing to read beyond the end of what has been written. Fix this as it is done for /proc//cmdline by testing env_end for zero. It is, apparently, intentionally set last in create_*_tables(). This bug was found by the PaX size_overflow plugin that detected the arithmetic underflow of 'this_len = env_end - (env_start + src)' when env_end is still zero. The expected consequence is that userland trying to access /proc//environ of a not yet fully set up process may get inconsistent data as we're in the middle of copying in the environment variables. Fixes: https://forums.grsecurity.net/viewtopic.php?f=3&t=4363 Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=116461 Signed-off-by: Mathias Krause Cc: Emese Revfy Cc: Pax Team Cc: Al Viro Cc: Mateusz Guzik Cc: Alexey Dobriyan Cc: Cyrill Gorcunov Cc: Jarod Wilson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Change-Id: Ia2f58d48c15478ed4b6e237b63e704c70ff21e96 Bug: 30951939 --- fs/proc/base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 49348349e156..df715a095328 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -954,7 +954,8 @@ static ssize_t environ_read(struct file *file, char __user *buf, int ret = 0; struct mm_struct *mm = file->private_data; - if (!mm) + /* Ensure the process spawned far enough to have an environment. */ + if (!mm || !mm->env_end) return 0; page = (char *)__get_free_page(GFP_TEMPORARY); From 8e975e71f5123f4d2567a98f01641f7c02f8d2a2 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Fri, 2 Sep 2016 10:13:21 +0530 Subject: [PATCH 0273/3220] ANDROID: base-cfg: drop SECCOMP_FILTER config Don't need to set SECCOMP_FILTER explicitly since CONFIG_SECCOMP=y will select that config anyway. 
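Strictly speaking SECCOMP_FILTER is defaulted on rather than selected: the 4.4-era arch/Kconfig declares it roughly as below (abridged, exact wording assumed), so it enables itself whenever CONFIG_SECCOMP=y and the arch/network dependencies are already met:

	config SECCOMP_FILTER
		def_bool y
		depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET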
Fixes: a49dcf2e745c ("ANDROID: base-cfg: enable SECCOMP config") Change-Id: Iff18ed4d2db5a55b9f9480d5ecbeef7b818b3837 Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 288bab2d858d..34fa64a669ac 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -142,7 +142,6 @@ CONFIG_QUOTA=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y CONFIG_SECCOMP=y -CONFIG_SECCOMP_FILTER=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y From d25e3081baa6f22564c1d87b7650c07c8c0b12b4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Aug 2016 05:56:26 -0700 Subject: [PATCH 0274/3220] UPSTREAM: tcp: fix use after free in tcp_xmit_retransmit_queue() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry picked from commit bb1fceca22492109be12640d49f5ea5a544c6bb4) When tcp_sendmsg() allocates a fresh and empty skb, it puts it at the tail of the write queue using tcp_add_write_queue_tail() Then it attempts to copy user data into this fresh skb. If the copy fails, we undo the work and remove the fresh skb. Unfortunately, this undo lacks the change done to tp->highest_sack and we can leave a dangling pointer (to a freed skb) Later, tcp_xmit_retransmit_queue() can dereference this pointer and access freed memory. For regular kernels where memory is not unmapped, this might cause SACK bugs because tcp_highest_sack_seq() is buggy, returning garbage instead of tp->snd_nxt, but with various debug features like CONFIG_DEBUG_PAGEALLOC, this can crash the kernel. This bug was found by Marco Grassi thanks to syzkaller. Fixes: 6859d49475d4 ("[TCP]: Abstract tp->highest_sack accessing & point to next skb") Reported-by: Marco Grassi Signed-off-by: Eric Dumazet Cc: Ilpo Järvinen Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Reviewed-by: Cong Wang Signed-off-by: David S. 
Miller Change-Id: I58bb02d6e4e399612e8580b9e02d11e661df82f5 Bug: 31183296 --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index bcce99a951f8..30d1194ff1c5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1513,6 +1513,8 @@ static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unli { if (sk->sk_send_head == skb_unlinked) sk->sk_send_head = NULL; + if (tcp_sk(sk)->highest_sack == skb_unlinked) + tcp_sk(sk)->highest_sack = NULL; } static inline void tcp_init_send_head(struct sock *sk) From 87518a45ae570ada6a1e404331a6cc910014e1a9 Mon Sep 17 00:00:00 2001 From: Mohamad Ayyash Date: Wed, 11 May 2016 13:18:35 -0700 Subject: [PATCH 0275/3220] BACKPORT: Don't show empty tag stats for unprivileged uids BUG: 27577101 BUG: 27532522 Signed-off-by: Mohamad Ayyash --- net/netfilter/xt_qtaguid.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index e2e7d54f9bb1..3bf0c59dab2f 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1946,7 +1946,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) ); f_count = atomic_long_read( &sock_tag_entry->socket->file->f_count); - seq_printf(m, "sock=%p tag=0x%llx (uid=%u) pid=%u " + seq_printf(m, "sock=%pK tag=0x%llx (uid=%u) pid=%u " "f_count=%lu\n", sock_tag_entry->sk, sock_tag_entry->tag, uid, @@ -2548,8 +2548,7 @@ static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, uid_t stat_uid = get_uid_from_tag(tag); struct proc_print_info *ppi = m->private; /* Detailed tags are not available to everybody */ - if (get_atag_from_tag(tag) && !can_read_other_uid_stats( - make_kuid(&init_user_ns,stat_uid))) { + if (!can_read_other_uid_stats(make_kuid(&init_user_ns,stat_uid))) { CT_DEBUG("qtaguid: stats line: " "%s 0x%llx %u: insufficient priv " "from pid=%u tgid=%u uid=%u stats.gid=%u\n", From 8ac959cc06c8715cdc64aa88993b3a8b1ef61eaa Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2015 09:57:27 -0800 Subject: [PATCH 0276/3220] UPSTREAM: Add 'unsafe' user access functions for batched accesses The naming is meant to discourage random use: the helper functions are not really any more "unsafe" than the traditional double-underscore functions (which need the address range checking), but they do need even more infrastructure around them, and should not be used willy-nilly. In addition to checking the access range, these user access functions require that you wrap the user access with a "user_acess_{begin,end}()" around it. That allows architectures that implement kernel user access control (x86: SMAP, arm64: PAN) to do the user access control in the wrapping user_access_begin/end part, and then batch up the actual user space accesses using the new interfaces. The main (and hopefully only) use for these are for core generic access helpers, initially just the generic user string functions (strnlen_user() and strncpy_from_user()). 
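The intended calling convention, as a minimal sketch (hypothetical caller; note that at this point in the series the accessors still return an error value; the jump-label interface arrives in a later patch):

	#include <linux/errno.h>
	#include <linux/uaccess.h>

	static long read_two_words(const unsigned long __user *p,
				   unsigned long *out)
	{
		unsigned long a, b;

		/* The address range check is still the caller's job. */
		if (!access_ok(VERIFY_READ, p, 2 * sizeof(*p)))
			return -EFAULT;

		user_access_begin();		/* e.g. STAC on x86/SMAP */
		if (unsafe_get_user(a, p) ||
		    unsafe_get_user(b, p + 1)) {
			user_access_end();
			return -EFAULT;
		}
		user_access_end();		/* e.g. CLAC on x86/SMAP */

		*out = a + b;
		return 0;
	}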
Signed-off-by: Linus Torvalds Change-Id: Ic64efea41f97171bdbdabe3e531489aebd9b6fac (cherry picked from commit 5b24a7a2aa2040c8c50c3b71122901d01661ff78) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 25 +++++++++++++++++++++++++ include/linux/uaccess.h | 7 +++++++ 2 files changed, 32 insertions(+) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 09b1b0ab94b7..6c4c39b137e5 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -745,5 +745,30 @@ copy_to_user(void __user *to, const void *from, unsigned long n) #undef __copy_from_user_overflow #undef __copy_to_user_overflow +/* + * The "unsafe" user accesses aren't really "unsafe", but the naming + * is a big fat warning: you have to not only do the access_ok() + * checking before using them, but you have to surround them with the + * user_access_begin/end() pair. + */ +#define user_access_begin() __uaccess_begin() +#define user_access_end() __uaccess_end() + +#define unsafe_put_user(x, ptr) \ +({ \ + int __pu_err; \ + __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ + __builtin_expect(__pu_err, 0); \ +}) + +#define unsafe_get_user(x, ptr) \ +({ \ + int __gu_err; \ + unsigned long __gu_val; \ + __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ + __builtin_expect(__gu_err, 0); \ +}) + #endif /* _ASM_X86_UACCESS_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 558129af828a..349557825428 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -111,4 +111,11 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); #define probe_kernel_address(addr, retval) \ probe_kernel_read(&retval, addr, sizeof(retval)) +#ifndef user_access_begin +#define user_access_begin() do { } while (0) +#define user_access_end() do { } while (0) +#define unsafe_get_user(x, ptr) __get_user(x, ptr) +#define unsafe_put_user(x, ptr) __put_user(x, ptr) +#endif + #endif /* __LINUX_UACCESS_H__ */ From c1932b10476e4d60f314bc6111101d1d101599b3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2015 10:05:19 -0800 Subject: [PATCH 0277/3220] UPSTREAM: Use the new batched user accesses in generic user string handling This converts the generic user string functions to use the batched user access functions. It makes a big difference on Skylake, which is the first x86 microarchitecture to implement SMAP. The STAC/CLAC instructions are not very fast, and doing them for each access inside the loop that copies strings from user space (which is what the pathname handling does for every pathname the kernel uses, for example) is very inefficient. 
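Schematically, the win is hoisting the single STAC/CLAC pair out of the per-word loop; a hedged sketch of the shape (hypothetical helper, not the exact lib/strnlen_user.c code; the caller is assumed to have validated the range with access_ok()):

	#include <linux/uaccess.h>

	/* Count how many words are readable at p, using one access window. */
	static long count_readable_words(const unsigned long __user *p,
					 long max)
	{
		unsigned long c;
		long res = 0;

		user_access_begin();	/* one STAC for the whole loop */
		while (res < max) {
			if (unsafe_get_user(c, p + res))
				break;	/* fault: stop counting */
			res++;
		}
		user_access_end();	/* one CLAC */
		return res;
	}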
Signed-off-by: Linus Torvalds Change-Id: Ic39a686b4bb1ad9cd16ad0887bb669beed6fe8aa (cherry picked from commit 9fd4470ff4974c41b1db43c3b355b9085af9c12a) Signed-off-by: Sami Tolvanen --- lib/strncpy_from_user.c | 11 ++++++++--- lib/strnlen_user.c | 18 ++++++++++++++---- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index e0af6ff73d14..33840324138c 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -39,7 +39,7 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long unsigned long c, data; /* Fall back to byte-at-a-time if we get a page fault */ - if (unlikely(__get_user(c,(unsigned long __user *)(src+res)))) + if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) break; *(unsigned long *)(dst+res) = c; if (has_zero(c, &data, &constants)) { @@ -55,7 +55,7 @@ byte_at_a_time: while (max) { char c; - if (unlikely(__get_user(c,src+res))) + if (unlikely(unsafe_get_user(c,src+res))) return -EFAULT; dst[res] = c; if (!c) @@ -107,7 +107,12 @@ long strncpy_from_user(char *dst, const char __user *src, long count) src_addr = (unsigned long)src; if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; - return do_strncpy_from_user(dst, src, count, max); + long retval; + + user_access_begin(); + retval = do_strncpy_from_user(dst, src, count, max); + user_access_end(); + return retval; } return -EFAULT; } diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 3a5f2b366d84..2625943625d7 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -45,7 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, src -= align; max += align; - if (unlikely(__get_user(c,(unsigned long __user *)src))) + if (unlikely(unsafe_get_user(c,(unsigned long __user *)src))) return 0; c |= aligned_byte_mask(align); @@ -61,7 +61,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, if (unlikely(max <= sizeof(unsigned long))) break; max -= sizeof(unsigned long); - if (unlikely(__get_user(c,(unsigned long __user *)(src+res)))) + if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) return 0; } res -= align; @@ -112,7 +112,12 @@ long strnlen_user(const char __user *str, long count) src_addr = (unsigned long)str; if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; - return do_strnlen_user(str, count, max); + long retval; + + user_access_begin(); + retval = do_strnlen_user(str, count, max); + user_access_end(); + return retval; } return 0; } @@ -141,7 +146,12 @@ long strlen_user(const char __user *str) src_addr = (unsigned long)str; if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; - return do_strnlen_user(str, ~0ul, max); + long retval; + + user_access_begin(); + retval = do_strnlen_user(str, ~0ul, max); + user_access_end(); + return retval; } return 0; } From fed752db02277e34e92262daac24ad4216e79c56 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 21:28:47 +0100 Subject: [PATCH 0278/3220] BACKPORT: ARM: 8583/1: mm: fix location of _etext The _etext position is defined to be the end of the kernel text code, and should not include any part of the data segments. This interferes with things that might check memory ranges and expect executable code up to _etext. Just to be conservative, leave the kernel resource as it was, using __init_begin instead of _etext as the end mark. 
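The intended ordering, as a toy linker-script sketch mirroring what this and the following arm64 patch establish (illustrative only, not the literal vmlinux.lds.S):

	SECTIONS
	{
		.text : { *(.text) }		/* executable code */
		_etext = .;			/* ends the text: code only, no rodata */
		.rodata : { *(.rodata) }	/* read-only, non-executable data */
		__init_begin = .;		/* kernel_code resource now ends at __init_begin - 1 */
	}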
Signed-off-by: Kees Cook Signed-off-by: Russell King Change-Id: Ida514d1359dbe6f782f562ce29b4ba09ae72bfc0 (cherry picked from commit 14c4a533e0996f95a0a64dfd0b6252d788cebc74) Signed-off-by: Sami Tolvanen --- arch/arm/kernel/setup.c | 2 +- arch/arm/kernel/vmlinux.lds.S | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 20edd349d379..bf63b4693457 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -772,7 +772,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) struct resource *res; kernel_code.start = virt_to_phys(_text); - kernel_code.end = virt_to_phys(_etext - 1); + kernel_code.end = virt_to_phys(__init_begin - 1); kernel_data.start = virt_to_phys(_sdata); kernel_data.end = virt_to_phys(_end - 1); diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 8b60fde5ce48..be2ab6d3b91f 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -120,6 +120,8 @@ SECTIONS #ifdef CONFIG_DEBUG_RODATA . = ALIGN(1< Date: Thu, 23 Jun 2016 15:53:17 +0200 Subject: [PATCH 0279/3220] BACKPORT: arm64: mm: fix location of _etext As Kees Cook notes in the ARM counterpart of this patch [0]: The _etext position is defined to be the end of the kernel text code, and should not include any part of the data segments. This interferes with things that might check memory ranges and expect executable code up to _etext. In particular, Kees is referring to the HARDENED_USERCOPY patch set [1], which rejects attempts to call copy_to_user() on kernel ranges containing executable code, but does allow access to the .rodata segment. Regardless of whether one may or may not agree with the distinction, it makes sense for _etext to have the same meaning across architectures. So let's put _etext where it belongs, between .text and .rodata, and fix up existing references to use __init_begin instead, which unlike _end_rodata includes the exception and notes sections as well. The _etext references in kaslr.c are left untouched, since its references to [_stext, _etext) are meant to capture potential jump instruction targets, and so disregarding .rodata is actually an improvement here. 
[0] http://article.gmane.org/gmane.linux.kernel/2245084 [1] http://thread.gmane.org/gmane.linux.kernel.hardened.devel/2502 Reported-by: Kees Cook Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Reviewed-by: Kees Cook Signed-off-by: Catalin Marinas Change-Id: I8f6582525217b9ca324f6a382ea52d30ce1d0dbd (cherry picked from commit 9fdc14c55cd6579d619ccd9d40982e0805e62b6d) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/setup.c | 2 +- arch/arm64/kernel/vmlinux.lds.S | 3 ++- arch/arm64/mm/mmu.c | 3 +-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 8119479147db..7f0e7226795e 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -201,7 +201,7 @@ static void __init request_standard_resources(void) struct resource *res; kernel_code.start = virt_to_phys(_text); - kernel_code.end = virt_to_phys(_etext - 1); + kernel_code.end = virt_to_phys(__init_begin - 1); kernel_data.start = virt_to_phys(_sdata); kernel_data.end = virt_to_phys(_end - 1); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 71426a78db12..f472809a7b45 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -114,11 +114,12 @@ SECTIONS } ALIGN_DEBUG_RO + _etext = .; /* End of text section */ + RO_DATA(PAGE_SIZE) EXCEPTION_TABLE(8) NOTES ALIGN_DEBUG_RO - _etext = .; /* End of text and rodata section */ ALIGN_DEBUG_RO_MIN(PAGE_SIZE) __init_begin = .; diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 873e363048c6..9a8dc4a79d8b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -337,7 +337,6 @@ static void __init __map_memblock(phys_addr_t start, phys_addr_t end) end - kernel_x_end, PAGE_KERNEL); } - } #else static void __init __map_memblock(phys_addr_t start, phys_addr_t end) @@ -425,7 +424,7 @@ static void __init fixup_executable(void) void mark_rodata_ro(void) { create_mapping_late(__pa(_stext), (unsigned long)_stext, - (unsigned long)_etext - (unsigned long)_stext, + (unsigned long)__init_begin - (unsigned long)_stext, PAGE_KERNEL_ROX); } From 5f1b3400f0ccc2368dc444b5d9a7c93be2e700da Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 8 Aug 2016 13:02:01 -0700 Subject: [PATCH 0280/3220] UPSTREAM: unsafe_[get|put]_user: change interface to use a error target label When I initially added the unsafe_[get|put]_user() helpers in commit 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses"), I made the mistake of modeling the interface on our traditional __[get|put]_user() functions, which return zero on success, or -EFAULT on failure. That interface is fairly easy to use, but it's actually fairly nasty for good code generation, since it essentially forces the caller to check the error value for each access. In particular, since the error handling is already internally implemented with an exception handler, and we already use "asm goto" for various other things, we could fairly easily make the error cases just jump directly to an error label instead, and avoid the need for explicit checking after each operation. So switch the interface to pass in an error label, rather than checking the error value in the caller. Best do it now before we start growing more users (the signal handling code in particular would be a good place to use the new interface). So rather than if (unsafe_get_user(x, ptr)) ... handle error .. 
the interface is now unsafe_get_user(x, ptr, label); where an error during the user mode fetch will now just cause a jump to 'label' in the caller. Right now the actual _implementation_ of this all still ends up being a "if (err) goto label", and does not take advantage of any exception label tricks, but for "unsafe_put_user()" in particular it should be fairly straightforward to convert to using the exception table model. Note that "unsafe_get_user()" is much harder to convert to a clever exception table model, because current versions of gcc do not allow the use of "asm goto" (for the exception) with output values (for the actual value to be fetched). But that is hopefully not a limitation in the long term. [ Also note that it might be a good idea to switch unsafe_get_user() to actually _return_ the value it fetches from user space, but this commit only changes the error handling semantics ] Signed-off-by: Linus Torvalds Change-Id: Ib905a84a04d46984320f6fd1056da4d72f3d6b53 (cherry picked from commit 1bd4403d86a1c06cb6cc9ac87664a0c9d3413d51) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 16 ++++++++-------- include/linux/uaccess.h | 4 ++-- lib/strncpy_from_user.c | 8 ++++---- lib/strnlen_user.c | 7 +++---- 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 6c4c39b137e5..5abdd0221fb4 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -754,21 +754,21 @@ copy_to_user(void __user *to, const void *from, unsigned long n) #define user_access_begin() __uaccess_begin() #define user_access_end() __uaccess_end() -#define unsafe_put_user(x, ptr) \ -({ \ +#define unsafe_put_user(x, ptr, err_label) \ +do { \ int __pu_err; \ __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ - __builtin_expect(__pu_err, 0); \ -}) + if (unlikely(__pu_err)) goto err_label; \ +} while (0) -#define unsafe_get_user(x, ptr) \ -({ \ +#define unsafe_get_user(x, ptr, err_label) \ +do { \ int __gu_err; \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - __builtin_expect(__gu_err, 0); \ -}) + if (unlikely(__gu_err)) goto err_label; \ +} while (0) #endif /* _ASM_X86_UACCESS_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 349557825428..f30c187ed785 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -114,8 +114,8 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); #ifndef user_access_begin #define user_access_begin() do { } while (0) #define user_access_end() do { } while (0) -#define unsafe_get_user(x, ptr) __get_user(x, ptr) -#define unsafe_put_user(x, ptr) __put_user(x, ptr) +#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0) +#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0) #endif #endif /* __LINUX_UACCESS_H__ */ diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 33840324138c..5a003a2ebd96 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -39,8 +39,8 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long unsigned long c, data; /* Fall back to byte-at-a-time if we get a page fault */ - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - break; + unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time); + *(unsigned long 
*)(dst+res) = c; if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); @@ -55,8 +55,7 @@ byte_at_a_time: while (max) { char c; - if (unlikely(unsafe_get_user(c,src+res))) - return -EFAULT; + unsafe_get_user(c,src+res, efault); dst[res] = c; if (!c) return res; @@ -75,6 +74,7 @@ byte_at_a_time: * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's an EFAULT. */ +efault: return -EFAULT; } diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 2625943625d7..8e105ed4df12 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -45,8 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, src -= align; max += align; - if (unlikely(unsafe_get_user(c,(unsigned long __user *)src))) - return 0; + unsafe_get_user(c, (unsigned long __user *)src, efault); c |= aligned_byte_mask(align); for (;;) { @@ -61,8 +60,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, if (unlikely(max <= sizeof(unsigned long))) break; max -= sizeof(unsigned long); - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - return 0; + unsafe_get_user(c, (unsigned long __user *)(src+res), efault); } res -= align; @@ -77,6 +75,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's 0. */ +efault: return 0; } From 1e701cdc5bbce1f5561ee25ff8709a18dc0b2282 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 19 Jul 2016 15:00:04 -0700 Subject: [PATCH 0281/3220] UPSTREAM: mm: Add is_migrate_cma_page Code such as hardened user copy[1] needs a way to tell if a page is CMA or not. Add is_migrate_cma_page in a similar way to is_migrate_isolate_page. [1]http://article.gmane.org/gmane.linux.kernel.mm/155238 Signed-off-by: Laura Abbott Signed-off-by: Kees Cook Change-Id: I1f9aa13d8d063038fa70b93282a836648fbb4f6d (cherry picked from commit 7c15d9bb8231f998ae7dc0b72415f5215459f7fb) Signed-off-by: Sami Tolvanen --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e23a9e704536..bab4053fb795 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -65,8 +65,10 @@ enum { #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) +# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) #else # define is_migrate_cma(migratetype) false +# define is_migrate_cma_page(_page) false #endif #define for_each_migratetype_order(order, type) \ From c30d7340ee0b167a45cd8d6d5c48add8f62db9a5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jul 2016 16:19:48 -0700 Subject: [PATCH 0282/3220] BACKPORT: mm: Implement stack frame object validation This creates per-architecture function arch_within_stack_frames() that should validate if a given object is contained by a kernel stack frame. Initial implementation is on x86. This is based on code from PaX. 
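Before the implementation below, a sketch of how a caller is meant to consume the three-way result; this mirrors what mm/usercopy.c does later in this series, though object_ok_on_stack() itself is a hypothetical name:

static bool object_ok_on_stack(const void *obj, unsigned long len)
{
	const void *stack = task_stack_page(current);
	const void *stackend = stack + THREAD_SIZE;

	switch (arch_within_stack_frames(stack, stackend, obj, len)) {
	case 1:		/* wholly inside one frame's args/locals */
		return true;
	case 0:		/* no frame-pointer info: cannot judge, accept */
		return true;
	default:	/* -1: crosses a frame boundary or is off-stack */
		return false;
	}
}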
Signed-off-by: Kees Cook Change-Id: I1f3b299bb8991d65dcdac6af85d633d4b7776df1 (cherry picked from commit 0f60a8efe4005ab5e65ce000724b04d4ca04a199) Signed-off-by: Sami Tolvanen --- arch/Kconfig | 9 ++++++ arch/x86/Kconfig | 1 + arch/x86/include/asm/thread_info.h | 44 ++++++++++++++++++++++++++++++ include/linux/thread_info.h | 9 ++++++ 4 files changed, 63 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 31a318a56d98..98f64ad1caf1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -423,6 +423,15 @@ config CC_STACKPROTECTOR_STRONG endchoice +config HAVE_ARCH_WITHIN_STACK_FRAMES + bool + help + An architecture should select this if it can walk the kernel stack + frames to determine if an object is part of either the arguments + or local variables (i.e. that it excludes saved return addresses, + and similar) by implementing an inline arch_within_stack_frames(), + which is used by CONFIG_HARDENED_USERCOPY. + config HAVE_CONTEXT_TRACKING bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ae3c83e8bde8..4786de7750b1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -88,6 +88,7 @@ config X86 select HAVE_ARCH_SOFT_DIRTY if X86_64 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_BPF_JIT if X86_64 select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c7b551028740..0c977fc124a7 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -177,6 +177,50 @@ static inline unsigned long current_stack_pointer(void) return sp; } +/* + * Walks up the stack frames to make sure that the specified object is + * entirely contained by a single stack frame. + * + * Returns: + * 1 if within a frame + * -1 if placed across a frame boundary (or outside stack) + * 0 unable to determine (no frame pointers, etc) + */ +static inline int arch_within_stack_frames(const void * const stack, + const void * const stackend, + const void *obj, unsigned long len) +{ +#if defined(CONFIG_FRAME_POINTER) + const void *frame = NULL; + const void *oldframe; + + oldframe = __builtin_frame_address(1); + if (oldframe) + frame = __builtin_frame_address(2); + /* + * low ----------------------------------------------> high + * [saved bp][saved ip][args][local vars][saved bp][saved ip] + * ^----------------^ + * allow copies only within here + */ + while (stack <= frame && frame < stackend) { + /* + * If obj + len extends past the last frame, this + * check won't pass and the next frame will be 0, + * causing us to bail out and correctly report + * the copy as invalid. + */ + if (obj + len <= frame) + return obj >= oldframe + 2 * sizeof(void *) ? 
1 : -1; + oldframe = frame; + frame = *(const void * const *)frame; + } + return -1; +#else + return 0; +#endif +} + #else /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_64 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ff307b548ed3..5ecb68e86968 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -145,6 +145,15 @@ static inline bool test_and_clear_restore_sigmask(void) #error "no set_restore_sigmask() provided and default one won't work" #endif +#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES +static inline int arch_within_stack_frames(const void * const stack, + const void * const stackend, + const void *obj, unsigned long len) +{ + return 0; +} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */ From d677b3104d3c8e927211f20083a6c2e97e003b87 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 7 Jun 2016 11:05:33 -0700 Subject: [PATCH 0283/3220] BACKPORT: mm: Hardened usercopy This is the start of porting PAX_USERCOPY into the mainline kernel. This is the first set of features, controlled by CONFIG_HARDENED_USERCOPY. The work is based on code by PaX Team and Brad Spengler, and an earlier port from Casey Schaufler. Additional non-slab page tests are from Rik van Riel. This patch contains the logic for validating several conditions when performing copy_to_user() and copy_from_user() on the kernel object being copied to/from: - address range doesn't wrap around - address range isn't NULL or zero-allocated (with a non-zero copy size) - if on the slab allocator: - object size must be less than or equal to copy size (when check is implemented in the allocator, which appear in subsequent patches) - otherwise, object must not span page allocations (excepting Reserved and CMA ranges) - if on the stack - object must not extend before/after the current process stack - object must be contained by a valid stack frame (when there is arch/build support for identifying stack frames) - object must not overlap with kernel text Signed-off-by: Kees Cook Tested-by: Valdis Kletnieks Tested-by: Michael Ellerman Change-Id: Iff3b5f1ddb04acd99ccf9a9046c7797363962b2a (cherry picked from commit f5509cc18daa7f82bcc553be70df2117c8eedc16) Signed-off-by: Sami Tolvanen --- include/linux/slab.h | 12 ++ include/linux/thread_info.h | 15 ++ mm/Makefile | 4 + mm/usercopy.c | 269 ++++++++++++++++++++++++++++++++++++ security/Kconfig | 28 ++++ 5 files changed, 328 insertions(+) create mode 100644 mm/usercopy.c diff --git a/include/linux/slab.h b/include/linux/slab.h index 2037a861e367..4ef384b172e0 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -144,6 +144,18 @@ void kfree(const void *); void kzfree(const void *); size_t ksize(const void *); +#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page); +#else +static inline const char *__check_heap_object(const void *ptr, + unsigned long n, + struct page *page) +{ + return NULL; +} +#endif + /* * Some archs want to perform DMA into kmalloc caches and need a guaranteed * alignment larger than the alignment of a 64-bit integer. 
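Both allocator-side implementations of __check_heap_object() added later in this series come down to one size invariant. Stated on its own (span_ok() is an illustrative name, not kernel code):

/*
 * offset: where the copy starts inside the heap object
 * n:      number of bytes the copy touches
 * size:   the allocator's usable size for that object
 *
 * Written so that "offset + n" cannot overflow.
 */
static bool span_ok(unsigned long offset, unsigned long n, unsigned long size)
{
	return offset <= size && n <= size - offset;
}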
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 5ecb68e86968..0ae29ff9ccfd 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -154,6 +154,21 @@ static inline int arch_within_stack_frames(const void * const stack, } #endif +#ifdef CONFIG_HARDENED_USERCOPY +extern void __check_object_size(const void *ptr, unsigned long n, + bool to_user); + +static inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ + __check_object_size(ptr, n, to_user); +} +#else +static inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ } +#endif /* CONFIG_HARDENED_USERCOPY */ + #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */ diff --git a/mm/Makefile b/mm/Makefile index 2ed43191fc3b..8b532c94008f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,6 +5,9 @@ KASAN_SANITIZE_slab_common.o := n KASAN_SANITIZE_slub.o := n +# Since __builtin_frame_address does work as used, disable the warning. +CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address) + mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ @@ -81,3 +84,4 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o +obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o diff --git a/mm/usercopy.c b/mm/usercopy.c new file mode 100644 index 000000000000..816feccebeb0 --- /dev/null +++ b/mm/usercopy.c @@ -0,0 +1,269 @@ +/* + * This implements the various checks for CONFIG_HARDENED_USERCOPY*, + * which are designed to protect kernel memory from needless exposure + * and overwrite under many unintended conditions. This code is based + * on PAX_USERCOPY, which is: + * + * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source + * Security Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <asm/sections.h> + +enum { + BAD_STACK = -1, + NOT_STACK = 0, + GOOD_FRAME, + GOOD_STACK, +}; + +/* + * Checks if a given pointer and length is contained by the current + * stack frame (if possible). + * + * Returns: + * NOT_STACK: not at all on the stack + * GOOD_FRAME: fully within a valid stack frame + * GOOD_STACK: fully on the stack (when can't do frame-checking) + * BAD_STACK: error condition (invalid stack position or bad stack frame) + */ +static noinline int check_stack_object(const void *obj, unsigned long len) +{ + const void * const stack = task_stack_page(current); + const void * const stackend = stack + THREAD_SIZE; + int ret; + + /* Object is not on the stack at all. */ + if (obj + len <= stack || stackend <= obj) + return NOT_STACK; + + /* + * Reject: object partially overlaps the stack (passing the + * check above means at least one end is within the stack, + * so if this check fails, the other end is outside the stack). + */ + if (obj < stack || stackend < obj + len) + return BAD_STACK; + + /* Check if object is safely within a valid frame. 
*/ + ret = arch_within_stack_frames(stack, stackend, obj, len); + if (ret) + return ret; + + return GOOD_STACK; +} + +static void report_usercopy(const void *ptr, unsigned long len, + bool to_user, const char *type) +{ + pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n", + to_user ? "exposure" : "overwrite", + to_user ? "from" : "to", ptr, type ? : "unknown", len); + /* + * For greater effect, it would be nice to do do_group_exit(), + * but BUG() actually hooks all the lock-breaking and per-arch + * Oops code, so that is used here instead. + */ + BUG(); +} + +/* Returns true if any portion of [ptr,ptr+n) overlaps with [low,high). */ +static bool overlaps(const void *ptr, unsigned long n, unsigned long low, + unsigned long high) +{ + unsigned long check_low = (uintptr_t)ptr; + unsigned long check_high = check_low + n; + + /* Does not overlap if entirely above or entirely below. */ + if (check_low >= high || check_high < low) + return false; + + return true; +} + +/* Is this address range in the kernel text area? */ +static inline const char *check_kernel_text_object(const void *ptr, + unsigned long n) +{ + unsigned long textlow = (unsigned long)_stext; + unsigned long texthigh = (unsigned long)_etext; + unsigned long textlow_linear, texthigh_linear; + + if (overlaps(ptr, n, textlow, texthigh)) + return "<kernel text>"; + + /* + * Some architectures have virtual memory mappings with a secondary + * mapping of the kernel text, i.e. there is more than one virtual + * kernel address that points to the kernel image. It is usually + * when there is a separate linear physical memory mapping, in that + * __pa() is not just the reverse of __va(). This can be detected + * and checked: + */ + textlow_linear = (unsigned long)__va(__pa(textlow)); + /* No different mapping: we're done. */ + if (textlow_linear == textlow) + return NULL; + + /* Check the secondary mapping... */ + texthigh_linear = (unsigned long)__va(__pa(texthigh)); + if (overlaps(ptr, n, textlow_linear, texthigh_linear)) + return "<linear kernel text>"; + + return NULL; +} + +static inline const char *check_bogus_address(const void *ptr, unsigned long n) +{ + /* Reject if object wraps past end of memory. */ + if (ptr + n < ptr) + return "<wrapped address>"; + + /* Reject if NULL or ZERO-allocation. */ + if (ZERO_OR_NULL_PTR(ptr)) + return "<null>"; + + return NULL; +} + +static inline const char *check_heap_object(const void *ptr, unsigned long n, + bool to_user) +{ + struct page *page, *endpage; + const void *end = ptr + n - 1; + bool is_reserved, is_cma; + + /* + * Some architectures (arm64) return true for virt_addr_valid() on + * vmalloced addresses. Work around this by checking for vmalloc + * first. + */ + if (is_vmalloc_addr(ptr)) + return NULL; + + if (!virt_addr_valid(ptr)) + return NULL; + + page = virt_to_head_page(ptr); + + /* Check slab allocator for flags and size. */ + if (PageSlab(page)) + return __check_heap_object(ptr, n, page); + + /* + * Sometimes the kernel data regions are not marked Reserved (see + * check below). And sometimes [_sdata,_edata) does not cover + * rodata and/or bss, so check each range explicitly. + */ + + /* Allow reads of kernel rodata region (if not marked as Reserved). */ + if (ptr >= (const void *)__start_rodata && + end <= (const void *)__end_rodata) { + if (!to_user) + return "<rodata>"; + return NULL; + } + + /* Allow kernel data region (if not marked as Reserved). */ + if (ptr >= (const void *)_sdata && end <= (const void *)_edata) + return NULL; + + /* Allow kernel bss region (if not marked as Reserved. 
*/ + if (ptr >= (const void *)__bss_start && + end <= (const void *)__bss_stop) + return NULL; + + /* Is the object wholly within one base page? */ + if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) == + ((unsigned long)end & (unsigned long)PAGE_MASK))) + return NULL; + + /* Allow if start and end are inside the same compound page. */ + endpage = virt_to_head_page(end); + if (likely(endpage == page)) + return NULL; + + /* + * Reject if range is entirely either Reserved (i.e. special or + * device memory), or CMA. Otherwise, reject since the object spans + * several independently allocated pages. + */ + is_reserved = PageReserved(page); + is_cma = is_migrate_cma_page(page); + if (!is_reserved && !is_cma) + goto reject; + + for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) { + page = virt_to_head_page(ptr); + if (is_reserved && !PageReserved(page)) + goto reject; + if (is_cma && !is_migrate_cma_page(page)) + goto reject; + } + + return NULL; + +reject: + return "<spans multiple pages>"; +} + +/* + * Validates that the given object is: + * - not bogus address + * - known-safe heap or stack object + * - not in kernel text + */ +void __check_object_size(const void *ptr, unsigned long n, bool to_user) +{ + const char *err; + + /* Skip all tests if size is zero. */ + if (!n) + return; + + /* Check for invalid addresses. */ + err = check_bogus_address(ptr, n); + if (err) + goto report; + + /* Check for bad heap object. */ + err = check_heap_object(ptr, n, to_user); + if (err) + goto report; + + /* Check for bad stack object. */ + switch (check_stack_object(ptr, n)) { + case NOT_STACK: + /* Object is not touching the current process stack. */ + break; + case GOOD_FRAME: + case GOOD_STACK: + /* + * Object is either in the correct frame (when it + * is possible to check) or just generally on the + * process stack (when frame checking not available). + */ + return; + default: + err = "<process stack>"; + goto report; + } + + /* Check for object in kernel to avoid text exposure. */ + err = check_kernel_text_object(ptr, n); + if (!err) + return; + +report: + report_usercopy(ptr, n, to_user, err); +} +EXPORT_SYMBOL(__check_object_size); diff --git a/security/Kconfig b/security/Kconfig index 30a2603e8c85..aba6a8d4d1f4 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -127,6 +127,34 @@ config LSM_MMAP_MIN_ADDR this low address space will need the permission specific to the systems running LSM. +config HAVE_HARDENED_USERCOPY_ALLOCATOR + bool + help + The heap allocator implements __check_heap_object() for + validating memory ranges against heap object sizes in + support of CONFIG_HARDENED_USERCOPY. + +config HAVE_ARCH_HARDENED_USERCOPY + bool + help + The architecture supports CONFIG_HARDENED_USERCOPY by + calling check_object_size() just before performing the + userspace copies in the low level implementation of + copy_to_user() and copy_from_user(). + +config HARDENED_USERCOPY + bool "Harden memory copies between kernel and userspace" + depends on HAVE_ARCH_HARDENED_USERCOPY + select BUG + help + This option checks for obviously wrong memory regions when + copying memory to/from the kernel (via copy_to_user() and + copy_from_user() functions) by rejecting memory ranges that + are larger than the specified heap object, span multiple + separately allocated pages, are not on the process stack, + or are part of the kernel text. This kills entire classes + of heap overflow exploits and similar kernel memory exposures. 
+ source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig From 9c61fcc2b39760263436b13b97071ae6e8eeb5f2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:04:01 -0700 Subject: [PATCH 0284/3220] BACKPORT: x86/uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on x86. This is done both in copy_*_user() and __copy_*_user() because copy_*_user() actually calls down to _copy_*_user() and not __copy_*_user(). Based on code from PaX and grsecurity. Signed-off-by: Kees Cook Tested-by: Valdis Kletnieks Change-Id: I260db1d4572bdd2f779200aca99d03a170658440 (cherry picked from commit 5b710f34e194c6b7710f69fdb5d798fdf35b98c1) Signed-off-by: Sami Tolvanen --- arch/x86/Kconfig | 1 + arch/x86/include/asm/uaccess.h | 10 ++++++---- arch/x86/include/asm/uaccess_32.h | 2 ++ arch/x86/include/asm/uaccess_64.h | 2 ++ 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4786de7750b1..9e3b1e57499a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -79,6 +79,7 @@ config X86 select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_AOUT if X86_32 select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 5abdd0221fb4..32a047ce0806 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -714,9 +714,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n) * case, and do only runtime checking for non-constant sizes. */ - if (likely(sz < 0 || sz >= n)) + if (likely(sz < 0 || sz >= n)) { + check_object_size(to, n, false); n = _copy_from_user(to, from, n); - else if(__builtin_constant_p(n)) + } else if (__builtin_constant_p(n)) copy_from_user_overflow(); else __copy_from_user_overflow(sz, n); @@ -732,9 +733,10 @@ copy_to_user(void __user *to, const void *from, unsigned long n) might_fault(); /* See the comment in copy_from_user() above. 
*/ - if (likely(sz < 0 || sz >= n)) + if (likely(sz < 0 || sz >= n)) { + check_object_size(from, n, true); n = _copy_to_user(to, from, n); - else if(__builtin_constant_p(n)) + } else if (__builtin_constant_p(n)) copy_to_user_overflow(); else __copy_to_user_overflow(sz, n); diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index f5dcb5204dcd..2ffc9188cc70 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -43,6 +43,7 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero static __always_inline unsigned long __must_check __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { + check_object_size(from, n, true); if (__builtin_constant_p(n)) { unsigned long ret; @@ -143,6 +144,7 @@ static __always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); + check_object_size(to, n, false); if (__builtin_constant_p(n)) { unsigned long ret; diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index f2f9b39b274a..512ffd621fc3 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -53,6 +53,7 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) { int ret = 0; + check_object_size(dst, size, false); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { @@ -103,6 +104,7 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) { int ret = 0; + check_object_size(src, size, true); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { From 073fcda55b357777cc3edaa2e67060cd8d78dbb3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:06:53 -0700 Subject: [PATCH 0285/3220] BACKPORT: ARM: uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on arm. Based on code from PaX and grsecurity. 
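The x86 hunks above apply one pattern that the ARM and arm64 enablement patches repeat: hang a one-line check_object_size() call in front of the existing copy routine. Its shape, as a hypothetical wrapper rather than any one architecture's real code:

static inline unsigned long
demo_copy_from_user(void *to, const void __user *from, unsigned long n)
{
	/* to_user == false: "to" is the kernel object about to be written */
	check_object_size(to, n, false);
	return _copy_from_user(to, from, n);	/* unchanged low-level copy */
}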
Signed-off-by: Kees Cook Change-Id: I03a44ca7a8c56832f15a6a74ac32e9330df3ac3b (cherry picked from commit dfd45b6103c973bfcea2341d89e36faf947dbc33) Signed-off-by: Sami Tolvanen --- arch/arm/Kconfig | 1 + arch/arm/include/asm/uaccess.h | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index d3159ff4de5b..586134e01928 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -33,6 +33,7 @@ config ARM select HARDIRQS_SW_RESEND select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT) select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 select HAVE_ARCH_MMAP_RND_BITS if MMU diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 35c9db857ebe..7fb59199c6bb 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -496,7 +496,10 @@ arm_copy_from_user(void *to, const void __user *from, unsigned long n); static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n) { - unsigned int __ua_flags = uaccess_save_and_enable(); + unsigned int __ua_flags; + + check_object_size(to, n, false); + __ua_flags = uaccess_save_and_enable(); n = arm_copy_from_user(to, from, n); uaccess_restore(__ua_flags); return n; @@ -511,11 +514,15 @@ static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) { #ifndef CONFIG_UACCESS_WITH_MEMCPY - unsigned int __ua_flags = uaccess_save_and_enable(); + unsigned int __ua_flags; + + check_object_size(from, n, true); + __ua_flags = uaccess_save_and_enable(); n = arm_copy_to_user(to, from, n); uaccess_restore(__ua_flags); return n; #else + check_object_size(from, n, true); return arm_copy_to_user(to, from, n); #endif } From 2f40fdd6bc0e1beca522657d3c9242e7b25917a2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:59:42 -0700 Subject: [PATCH 0286/3220] BACKPORT: arm64/uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on arm64. As done by KASAN in -next, renames the low-level functions to __arch_copy_*_user() so a static inline can do additional work before the copy. 
Signed-off-by: Kees Cook Change-Id: I1286cae8e6ffcf12ea54ddd62f1a6d2ce742c8d0 (cherry picked from commit faf5b63e294151d6ac24ca6906d6f221bd3496cd) Signed-off-by: Sami Tolvanen --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/uaccess.h | 29 ++++++++++++++++++++++------- arch/arm64/kernel/arm64ksyms.c | 4 ++-- arch/arm64/lib/copy_from_user.S | 4 ++-- arch/arm64/lib/copy_to_user.S | 4 ++-- 5 files changed, 29 insertions(+), 13 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 579ab688312d..3ad9d9ea374b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -48,6 +48,7 @@ config ARM64 select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_BITREVERSE + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48) select HAVE_ARCH_KGDB diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index b2ede967fe7d..2a44b9795532 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -247,24 +247,39 @@ do { \ -EFAULT; \ }) -extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n); -extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n); +extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); +extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n); extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n); extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n); +static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n) +{ + check_object_size(to, n, false); + return __arch_copy_from_user(to, from, n); +} + +static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) +{ + check_object_size(from, n, true); + return __arch_copy_to_user(to, from, n); +} + static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { - if (access_ok(VERIFY_READ, from, n)) - n = __copy_from_user(to, from, n); - else /* security hole - plug it */ + if (access_ok(VERIFY_READ, from, n)) { + check_object_size(to, n, false); + n = __arch_copy_from_user(to, from, n); + } else /* security hole - plug it */ memset(to, 0, n); return n; } static inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { - if (access_ok(VERIFY_WRITE, to, n)) - n = __copy_to_user(to, from, n); + if (access_ok(VERIFY_WRITE, to, n)) { + check_object_size(from, n, true); + n = __arch_copy_to_user(to, from, n); + } return n; } diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index 3b6d8cc9dfe0..c654df05b7d7 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -33,8 +33,8 @@ EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); /* user mem (segment) */ -EXPORT_SYMBOL(__copy_from_user); -EXPORT_SYMBOL(__copy_to_user); +EXPORT_SYMBOL(__arch_copy_from_user); +EXPORT_SYMBOL(__arch_copy_to_user); EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(__copy_in_user); diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 4699cd74f87e..281e75db899a 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ 
-66,7 +66,7 @@ .endm end .req x5 -ENTRY(__copy_from_user) +ENTRY(__arch_copy_from_user) ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) add end, x0, x2 @@ -75,7 +75,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) mov x0, #0 // Nothing to copy ret -ENDPROC(__copy_from_user) +ENDPROC(__arch_copy_from_user) .section .fixup,"ax" .align 2 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 7512bbbc07ac..db4d187de61f 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -65,7 +65,7 @@ .endm end .req x5 -ENTRY(__copy_to_user) +ENTRY(__arch_copy_to_user) ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) add end, x0, x2 @@ -74,7 +74,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) mov x0, #0 ret -ENDPROC(__copy_to_user) +ENDPROC(__arch_copy_to_user) .section .fixup,"ax" .align 2 From 103aa7df683e7811e88e82f52c2c3f16e419a7b1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:20:59 -0700 Subject: [PATCH 0287/3220] UPSTREAM: mm: SLAB hardened usercopy support Under CONFIG_HARDENED_USERCOPY, this adds object size checking to the SLAB allocator to catch any copies that may span objects. Based on code from PaX and grsecurity. Signed-off-by: Kees Cook Tested-by: Valdis Kletnieks Change-Id: Ib910a71fdc2ab808e1a45b6d33e9bae1681a1f4a (cherry picked from commit 04385fc5e8fffed84425d909a783c0f0c587d847) Signed-off-by: Sami Tolvanen --- init/Kconfig | 1 + mm/slab.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 235c7a2c0d20..fa031a140397 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1719,6 +1719,7 @@ choice config SLAB bool "SLAB" + select HAVE_HARDENED_USERCOPY_ALLOCATOR help The regular slab allocator that is established and known to work well in all environments. It organizes cache hot objects in diff --git a/mm/slab.c b/mm/slab.c index 4765c97ce690..24a615d42d74 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4228,6 +4228,36 @@ static int __init slab_proc_init(void) module_init(slab_proc_init); #endif +#ifdef CONFIG_HARDENED_USERCOPY +/* + * Rejects objects that are incorrectly sized. + * + * Returns NULL if check passes, otherwise const char * to name of cache + * to indicate an error. + */ +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page) +{ + struct kmem_cache *cachep; + unsigned int objnr; + unsigned long offset; + + /* Find and validate object. */ + cachep = page->slab_cache; + objnr = obj_to_index(cachep, page, (void *)ptr); + BUG_ON(objnr >= cachep->num); + + /* Find offset within object. */ + offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep); + + /* Allow address range falling entirely within object size. */ + if (offset <= cachep->object_size && n <= cachep->object_size - offset) + return NULL; + + return cachep->name; +} +#endif /* CONFIG_HARDENED_USERCOPY */ + /** * ksize - get the actual amount of memory allocated for a given object * @objp: Pointer to the object From 3a9c8260c66ce387754641134089ba6ea5793bd5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:24:05 -0700 Subject: [PATCH 0288/3220] UPSTREAM: mm: SLUB hardened usercopy support Under CONFIG_HARDENED_USERCOPY, this adds object size checking to the SLUB allocator to catch any copies that may span objects. Includes a redzone handling fix discovered by Michael Ellerman. 
Based on code from PaX and grsecurity. Signed-off-by: Kees Cook Tested-by: Michael Ellerman Reviewed-by: Laura Abbott Change-Id: I52dc6fb3a3492b937d52b5cf9c046bf03dc40a3a (cherry picked from commit ed18adc1cdd00a5c55a20fbdaed4804660772281) Signed-off-by: Sami Tolvanen --- init/Kconfig | 1 + mm/slub.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index fa031a140397..e1d1d6936f92 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1727,6 +1727,7 @@ config SLAB config SLUB bool "SLUB (Unqueued Allocator)" + select HAVE_HARDENED_USERCOPY_ALLOCATOR help SLUB is a slab allocator that minimizes cache line usage instead of managing queues of cached objects (SLAB approach). diff --git a/mm/slub.c b/mm/slub.c index 46997517406e..470f5e561727 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3585,6 +3585,46 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) EXPORT_SYMBOL(__kmalloc_node); #endif +#ifdef CONFIG_HARDENED_USERCOPY +/* + * Rejects objects that are incorrectly sized. + * + * Returns NULL if check passes, otherwise const char * to name of cache + * to indicate an error. + */ +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page) +{ + struct kmem_cache *s; + unsigned long offset; + size_t object_size; + + /* Find object and usable object size. */ + s = page->slab_cache; + object_size = slab_ksize(s); + + /* Reject impossible pointers. */ + if (ptr < page_address(page)) + return s->name; + + /* Find offset within object. */ + offset = (ptr - page_address(page)) % s->size; + + /* Adjust for redzone and reject if within the redzone. */ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) { + if (offset < s->red_left_pad) + return s->name; + offset -= s->red_left_pad; + } + + /* Allow address range falling entirely within object size. */ + if (offset <= object_size && n <= object_size - offset) + return NULL; + + return s->name; +} +#endif /* CONFIG_HARDENED_USERCOPY */ + static size_t __ksize(const void *object) { struct page *page; From 43243ce74a56007bf71e4fb3310f475fd4441ce4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Aug 2016 12:15:22 -0700 Subject: [PATCH 0289/3220] UPSTREAM: usercopy: avoid potentially undefined behavior in pointer math check_bogus_address() checked for pointer overflow using this expression, where 'ptr' has type 'const void *': ptr + n < ptr Since pointer wraparound is undefined behavior, gcc at -O2 by default treats it like the following, which would not behave as intended: (long)n < 0 Fortunately, this doesn't currently happen for kernel code because kernel code is compiled with -fno-strict-overflow. But the expression should be fixed anyway to use well-defined integer arithmetic, since it could be treated differently by different compilers in the future or could be reported by tools checking for undefined behavior. Signed-off-by: Eric Biggers Signed-off-by: Kees Cook Change-Id: I73b13be651cf35c03482f2014bf2c3dd291518ab (cherry picked from commit 7329a655875a2f4bd6984fe8a7e00a6981e802f3) Signed-off-by: Sami Tolvanen --- mm/usercopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index 816feccebeb0..f5c4c3a6f2df 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -125,7 +125,7 @@ static inline const char *check_kernel_text_object(const void *ptr, unsigned long n) { /* Reject if object wraps past end of memory. 
*/ - if (ptr + n < ptr) + if ((unsigned long)ptr + n < (unsigned long)ptr) return ""; /* Reject if NULL or ZERO-allocation. */ From 19787e3f1cd696356b4ec02fa3a345dcb4aaf3dc Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 22 Aug 2016 11:53:59 -0500 Subject: [PATCH 0290/3220] UPSTREAM: usercopy: fix overlap check for kernel text When running with a local patch which moves the '_stext' symbol to the very beginning of the kernel text area, I got the following panic with CONFIG_HARDENED_USERCOPY: usercopy: kernel memory exposure attempt detected from ffff88103dfff000 () (4096 bytes) ------------[ cut here ]------------ kernel BUG at mm/usercopy.c:79! invalid opcode: 0000 [#1] SMP ... CPU: 0 PID: 4800 Comm: cp Not tainted 4.8.0-rc3.after+ #1 Hardware name: Dell Inc. PowerEdge R720/0X3D66, BIOS 2.5.4 01/22/2016 task: ffff880817444140 task.stack: ffff880816274000 RIP: 0010:[] __check_object_size+0x76/0x413 RSP: 0018:ffff880816277c40 EFLAGS: 00010246 RAX: 000000000000006b RBX: ffff88103dfff000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff88081f80dfa8 RDI: ffff88081f80dfa8 RBP: ffff880816277c90 R08: 000000000000054c R09: 0000000000000000 R10: 0000000000000005 R11: 0000000000000006 R12: 0000000000001000 R13: ffff88103e000000 R14: ffff88103dffffff R15: 0000000000000001 FS: 00007fb9d1750800(0000) GS:ffff88081f800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000021d2000 CR3: 000000081a08f000 CR4: 00000000001406f0 Stack: ffff880816277cc8 0000000000010000 000000043de07000 0000000000000000 0000000000001000 ffff880816277e60 0000000000001000 ffff880816277e28 000000000000c000 0000000000001000 ffff880816277ce8 ffffffff8136c3a6 Call Trace: [] copy_page_to_iter_iovec+0xa6/0x1c0 [] copy_page_to_iter+0x16/0x90 [] generic_file_read_iter+0x3e3/0x7c0 [] ? xfs_file_buffered_aio_write+0xad/0x260 [xfs] [] ? down_read+0x12/0x40 [] xfs_file_buffered_aio_read+0x51/0xc0 [xfs] [] xfs_file_read_iter+0x62/0xb0 [xfs] [] __vfs_read+0xdf/0x130 [] vfs_read+0x8e/0x140 [] SyS_read+0x55/0xc0 [] do_syscall_64+0x67/0x160 [] entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:[<00007fb9d0c33c00>] 0x7fb9d0c33c00 RSP: 002b:00007ffc9c262f28 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: fffffffffff8ffff RCX: 00007fb9d0c33c00 RDX: 0000000000010000 RSI: 00000000021c3000 RDI: 0000000000000004 RBP: 00000000021c3000 R08: 0000000000000000 R09: 00007ffc9c264d6c R10: 00007ffc9c262c50 R11: 0000000000000246 R12: 0000000000010000 R13: 00007ffc9c2630b0 R14: 0000000000000004 R15: 0000000000010000 Code: 81 48 0f 44 d0 48 c7 c6 90 4d a3 81 48 c7 c0 bb b3 a2 81 48 0f 44 f0 4d 89 e1 48 89 d9 48 c7 c7 68 16 a3 81 31 c0 e8 f4 57 f7 ff <0f> 0b 48 8d 90 00 40 00 00 48 39 d3 0f 83 22 01 00 00 48 39 c3 RIP [] __check_object_size+0x76/0x413 RSP The checked object's range [ffff88103dfff000, ffff88103e000000) is valid, so there shouldn't have been a BUG. The hardened usercopy code got confused because the range's ending address is the same as the kernel's text starting address at 0xffff88103e000000. The overlap check is slightly off. 
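With concrete (made-up) numbers the off-by-one is easy to see; both ranges are half-open, so an object that ends exactly where the text begins shares no byte with it. A standalone userspace check:

#include <assert.h>
#include <stdbool.h>

static bool overlaps_old(unsigned long lo, unsigned long hi,
			 unsigned long low, unsigned long high)
{
	return !(lo >= high || hi < low);	/* pre-fix comparison */
}

static bool overlaps_new(unsigned long lo, unsigned long hi,
			 unsigned long low, unsigned long high)
{
	return !(lo >= high || hi <= low);	/* fixed comparison */
}

int main(void)
{
	/* object [0x1000, 0x2000) vs kernel text [0x2000, 0x3000) */
	assert(overlaps_old(0x1000, 0x2000, 0x2000, 0x3000));	/* false positive */
	assert(!overlaps_new(0x1000, 0x2000, 0x2000, 0x3000));	/* correct */
	return 0;
}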
Fixes: f5509cc18daa ("mm: Hardened usercopy") Signed-off-by: Josh Poimboeuf Signed-off-by: Kees Cook Change-Id: I839dbf4ddbb4d9874026a42abed557eb9b3f8bef (cherry picked from commit 94cd97af690dd9537818dc9841d0ec68bb1dd877) Signed-off-by: Sami Tolvanen --- mm/usercopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index f5c4c3a6f2df..f78015e8b1e5 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -84,7 +84,7 @@ static bool overlaps(const void *ptr, unsigned long n, unsigned long low, unsigned long check_high = check_low + n; /* Does not overlap if entirely above or entirely below. */ - if (check_low >= high || check_high < low) + if (check_low >= high || check_high <= low) return false; return true; From 65733da1dd5c3643e310d371bdb2420a6f3d8e97 Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Thu, 25 Aug 2016 18:31:01 -0700 Subject: [PATCH 0291/3220] Android: MMC/UFS IO Latency Histograms. This patch adds a new sysfs node (latency_hist) and reports IO (svc time) latency histograms. Disabled by default, can be enabled by echoing 1 into latency_hist, stats can be cleared by writing 2 into latency_hist. This commit fixes the 32 bit build breakage in the previous commit. Tested on both 32 bit and 64 bit arm devices. Bug: 30677035 Change-Id: I9a615a16616d80f87e75676ac4d078a5c429dcf9 Signed-off-by: Mohan Srinivasan --- block/blk-core.c | 82 +++++++++++++++++++++++++++++++++++++++ drivers/mmc/core/core.c | 66 ++++++++++++++++++++++++++++++- drivers/mmc/core/host.c | 6 ++- drivers/mmc/core/host.h | 5 +++ drivers/scsi/ufs/ufshcd.c | 81 ++++++++++++++++++++++++++++++++++++++ drivers/scsi/ufs/ufshcd.h | 3 ++ include/linux/blkdev.h | 76 ++++++++++++++++++++++++++++++++++++ include/linux/mmc/core.h | 2 + include/linux/mmc/host.h | 4 ++ 9 files changed, 322 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 33e2f62d5062..840713279726 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -40,6 +40,8 @@ #include "blk.h" #include "blk-mq.h" +#include <linux/math64.h> + EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); @@ -3539,3 +3541,83 @@ int __init blk_dev_init(void) return 0; } + +/* + * Blk IO latency support. We want this to be as cheap as possible, so doing + * this lockless (and avoiding atomics), a few off-by-a-few errors in this + * code are not harmful, and we don't want to do anything that is + * perf-impactful. + * TODO : If necessary, we can make the histograms per-cpu and aggregate + * them when printing them out. 
+ */ +void +blk_zero_latency_hist(struct io_latency_state *s) +{ + memset(s->latency_y_axis_read, 0, + sizeof(s->latency_y_axis_read)); + memset(s->latency_y_axis_write, 0, + sizeof(s->latency_y_axis_write)); + s->latency_reads_elems = 0; + s->latency_writes_elems = 0; +} + +ssize_t +blk_latency_hist_show(struct io_latency_state *s, char *buf) +{ + int i; + int bytes_written = 0; + u_int64_t num_elem, elem; + int pct; + + num_elem = s->latency_reads_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Read Latency Histogram (n = %llu):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_read[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_read[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + num_elem = s->latency_writes_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Write Latency Histogram (n = %llu):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_write[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_write[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + return bytes_written; +} diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 9fab52559a8c..b503249950df 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -183,6 +183,17 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) pr_debug("%s: %d bytes transferred: %d\n", mmc_hostname(host), mrq->data->bytes_xfered, mrq->data->error); + if (mrq->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = ktime_us_delta(completion, + mrq->io_start); + blk_update_latency_hist(&host->io_lat_s, + (mrq->data->flags & MMC_DATA_READ), + delta_us); + } trace_mmc_blk_rw_end(cmd->opcode, cmd->arg, mrq->data); } @@ -627,6 +638,11 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host, } if (!err && areq) { + if (host->latency_hist_enabled) { + areq->mrq->io_start = ktime_get(); + areq->mrq->lat_hist_enabled = 1; + } else + areq->mrq->lat_hist_enabled = 0; trace_mmc_blk_rw_start(areq->mrq->cmd->opcode, areq->mrq->cmd->arg, areq->mrq->data); @@ -1964,7 +1980,7 @@ void mmc_init_erase(struct mmc_card *card) } static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card, - unsigned int arg, unsigned int qty) + unsigned int arg, unsigned int qty) { unsigned int erase_timeout; @@ -2907,6 +2923,54 @@ static void __exit mmc_exit(void) destroy_workqueue(workqueue); } +static ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + + return blk_latency_hist_show(&host->io_lat_s, buf); +} + +/* + * Values permitted 0, 1, 2. 
+ * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&host->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + host->latency_hist_enabled = value; + return count; +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +void +mmc_latency_hist_sysfs_init(struct mmc_host *host) +{ + if (device_create_file(&host->class_dev, &dev_attr_latency_hist)) + dev_err(&host->class_dev, + "Failed to create latency_hist sysfs entry\n"); +} + +void +mmc_latency_hist_sysfs_exit(struct mmc_host *host) +{ + device_remove_file(&host->class_dev, &dev_attr_latency_hist); +} + subsys_initcall(mmc_init); module_exit(mmc_exit); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index fcf7829c759e..17068839c74b 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -32,8 +32,6 @@ #include "slot-gpio.h" #include "pwrseq.h" -#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) - static DEFINE_IDR(mmc_host_idr); static DEFINE_SPINLOCK(mmc_host_lock); @@ -394,6 +392,8 @@ int mmc_add_host(struct mmc_host *host) mmc_add_host_debugfs(host); #endif + mmc_latency_hist_sysfs_init(host); + mmc_start_host(host); if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY)) register_pm_notifier(&host->pm_notify); @@ -422,6 +422,8 @@ void mmc_remove_host(struct mmc_host *host) mmc_remove_host_debugfs(host); #endif + mmc_latency_hist_sysfs_exit(host); + device_del(&host->class_dev); led_trigger_unregister_simple(host->led); diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h index 992bf5397633..bf38533406fd 100644 --- a/drivers/mmc/core/host.h +++ b/drivers/mmc/core/host.h @@ -12,6 +12,8 @@ #define _MMC_CORE_HOST_H #include <linux/mmc/host.h> +#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) + int mmc_register_host_class(void); void mmc_unregister_host_class(void); @@ -21,5 +23,8 @@ void mmc_retune_hold(struct mmc_host *host); void mmc_retune_release(struct mmc_host *host); int mmc_retune(struct mmc_host *host); +void mmc_latency_hist_sysfs_init(struct mmc_host *host); +void mmc_latency_hist_sysfs_exit(struct mmc_host *host); + #endif diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 85cd2564c157..4167bdbf0ecf 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -39,6 +39,7 @@ #include <linux/async.h> #include <linux/devfreq.h> +#include <linux/blkdev.h> #include "ufshcd.h" #include "unipro.h" @@ -1332,6 +1333,17 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) clear_bit_unlock(tag, &hba->lrb_in_use); goto out; } + + /* IO svc time latency histogram */ + if (hba != NULL && cmd->request != NULL) { + if (hba->latency_hist_enabled && + (cmd->request->cmd_type == REQ_TYPE_FS)) { + cmd->request->lat_hist_io_start = ktime_get(); + cmd->request->lat_hist_enabled = 1; + } else + cmd->request->lat_hist_enabled = 0; + } + WARN_ON(hba->clk_gating.state != CLKS_ON); lrbp = &hba->lrb[tag]; @@ -3160,6 +3172,7 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) u32 tr_doorbell; int result; int index; + struct request *req; /* Resetting interrupt aggregation counters first and reading the * 
DOOR_BELL afterward allows us to handle all the completed requests. @@ -3184,6 +3197,22 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) /* Mark completed command as NULL in LRB */ lrbp->cmd = NULL; clear_bit_unlock(index, &hba->lrb_in_use); + req = cmd->request; + if (req) { + /* Update IO svc time latency histogram */ + if (req->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = ktime_us_delta(completion, + req->lat_hist_io_start); + /* rq_data_dir() => true if WRITE */ + blk_update_latency_hist(&hba->io_lat_s, + (rq_data_dir(req) == READ), + delta_us); + } + } /* Do not touch lrbp after scsi done */ cmd->scsi_done(cmd); __ufshcd_release(hba); @@ -5327,6 +5356,54 @@ out: } EXPORT_SYMBOL(ufshcd_shutdown); +/* + * Values permitted 0, 1, 2. + * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&hba->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + hba->latency_hist_enabled = value; + return count; +} + +static ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + + return blk_latency_hist_show(&hba->io_lat_s, buf); +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +static void +ufshcd_init_latency_hist(struct ufs_hba *hba) +{ + if (device_create_file(hba->dev, &dev_attr_latency_hist)) + dev_err(hba->dev, "Failed to create latency_hist sysfs entry\n"); +} + +static void +ufshcd_exit_latency_hist(struct ufs_hba *hba) +{ + device_remove_file(hba->dev, &dev_attr_latency_hist); +} + /** * ufshcd_remove - de-allocate SCSI host and host memory space * data structure memory @@ -5342,6 +5419,7 @@ void ufshcd_remove(struct ufs_hba *hba) scsi_host_put(hba->host); ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); if (ufshcd_is_clkscaling_enabled(hba)) devfreq_remove_device(hba->devfreq); ufshcd_hba_exit(hba); @@ -5639,6 +5717,8 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) /* Hold auto suspend until async scan completes */ pm_runtime_get_sync(dev); + ufshcd_init_latency_hist(hba); + /* * The device-initialize-sequence hasn't been invoked yet. * Set the device to power-off state @@ -5653,6 +5733,7 @@ out_remove_scsi_host: scsi_remove_host(hba->host); exit_gating: ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); out_disable: hba->is_irq_enabled = false; scsi_host_put(host); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 2570d9477b37..f3780cf7d895 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -532,6 +532,9 @@ struct ufs_hba { struct devfreq *devfreq; struct ufs_clk_scaling clk_scaling; bool is_sys_suspended; + + int latency_hist_enabled; + struct io_latency_state io_lat_s; }; /* Returns true if clocks can be gated. 
Otherwise false */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c70e3588a48c..faf2c0093527 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -197,6 +197,9 @@ struct request { /* for bidi */ struct request *next_rq; + + ktime_t lat_hist_io_start; + int lat_hist_enabled; }; static inline unsigned short req_get_ioprio(struct request *req) @@ -1645,6 +1648,79 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); extern long bdev_direct_access(struct block_device *, sector_t, void __pmem **addr, unsigned long *pfn, long size); + +/* + * X-axis for IO latency histogram support. + */ +static const u_int64_t latency_x_axis_us[] = { + 100, + 200, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1000, + 1200, + 1400, + 1600, + 1800, + 2000, + 2500, + 3000, + 4000, + 5000, + 6000, + 7000, + 9000, + 10000 +}; + +#define BLK_IO_LAT_HIST_DISABLE 0 +#define BLK_IO_LAT_HIST_ENABLE 1 +#define BLK_IO_LAT_HIST_ZERO 2 + +struct io_latency_state { + u_int64_t latency_y_axis_read[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_reads_elems; + u_int64_t latency_y_axis_write[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_writes_elems; +}; + +static inline void +blk_update_latency_hist(struct io_latency_state *s, + int read, + u_int64_t delta_us) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++) { + if (delta_us < (u_int64_t)latency_x_axis_us[i]) { + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + break; + } + } + if (i == ARRAY_SIZE(latency_x_axis_us)) { + /* Overflowed the histogram */ + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + } + if (read) + s->latency_reads_elems++; + else + s->latency_writes_elems++; +} + +void blk_zero_latency_hist(struct io_latency_state *s); +ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf); + #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 37967b6da03c..3349f0676acb 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -136,6 +136,8 @@ struct mmc_request { struct completion completion; void (*done)(struct mmc_request *);/* completion function */ struct mmc_host *host; + ktime_t io_start; + int lat_hist_enabled; }; struct mmc_card; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 40025b28c1fb..e4862f7cdede 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -379,6 +380,9 @@ struct mmc_host { } embedded_sdio_data; #endif + int latency_hist_enabled; + struct io_latency_state io_lat_s; + unsigned long private[0] ____cacheline_aligned; }; From ef1cf0cfecdb16c520e32628ef8e6d9fa86fc08f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 19 Aug 2016 12:47:01 -0700 Subject: [PATCH 0292/3220] UPSTREAM: Make the hardened user-copy code depend on having a hardened allocator The kernel test robot reported a usercopy failure in the new hardened sanity checks, due to a page-crossing copy of the FPU state into the task structure. This happened because the kernel test robot was testing with SLOB, which doesn't actually do the required book-keeping for slab allocations, and as a result the hardening code didn't realize that the task struct allocation was one single allocation - and the sanity checks fail. 
Since SLOB doesn't even claim to support hardening (and you really shouldn't use it), the straightforward solution is to just make the usercopy hardening code depend on the allocator supporting it. Reported-by: kernel test robot Cc: Kees Cook Signed-off-by: Linus Torvalds Change-Id: I37d51f866f873341bf7d5297249899b852e1c6ce (cherry picked from commit 6040e57658eee6eb1315a26119101ca832d1f854) Signed-off-by: Sami Tolvanen --- security/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/security/Kconfig b/security/Kconfig index aba6a8d4d1f4..2b42c225de28 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -145,6 +145,7 @@ config HAVE_ARCH_HARDENED_USERCOPY config HARDENED_USERCOPY bool "Harden memory copies between kernel and userspace" depends on HAVE_ARCH_HARDENED_USERCOPY + depends on HAVE_HARDENED_USERCOPY_ALLOCATOR select BUG help This option checks for obviously wrong memory regions when From 0715ce3b882b9982b6acdbf9e46fccd1e6f180d7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2015 09:45:09 -0800 Subject: [PATCH 0293/3220] UPSTREAM: x86: reorganize SMAP handling in user space accesses This reorganizes how we do the stac/clac instructions in the user access code. Instead of adding the instructions directly to the same inline asm that does the actual user level access and exception handling, add them at a higher level. This is mainly preparation for the next step, where we will expose an interface to allow users to mark several accesses together as being user space accesses, but it does already clean up some code: - the inlined trivial cases of copy_in_user() now do stac/clac just once over the accesses: they used to do one pair around the user space read, and another pair around the write-back. - the {get,put}_user_ex() macros that are used with the catch/try handling don't do any stac/clac at all, because that happens in the try/catch surrounding them. Other than those two cleanups that happened naturally from the re-organization, this should not make any difference. Yet. Signed-off-by: Linus Torvalds Change-Id: Iaad8756bc8e95876e2e2a7d7bbd333fc478ab441 (cherry picked from commit 11f1a4b9755f5dbc3e822a96502ebe9b044b14d8) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 53 +++++++++++------ arch/x86/include/asm/uaccess_64.h | 94 ++++++++++++++++++++++--------- 2 files changed, 101 insertions(+), 46 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 32a047ce0806..be439e246d91 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -134,6 +134,9 @@ extern int __get_user_4(void); extern int __get_user_8(void); extern int __get_user_bad(void); +#define __uaccess_begin() stac() +#define __uaccess_end() clac() + /* * This is a type: either unsigned long, if the argument fits into * that type, or otherwise unsigned long long. 
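/*
 * Editorial sketch, not part of the patch: the two helpers above replace
 * the per-asm ASM_STAC/ASM_CLAC pairs, so a whole access sequence is
 * bracketed once by the caller. The helper name fetch_user_u32() is
 * hypothetical; __get_user_size() is the existing macro the patch builds on.
 */
static inline int fetch_user_u32(u32 *dst, const u32 __user *src)
{
	int err;

	__uaccess_begin();	/* stac(): open the SMAP window */
	__get_user_size(*dst, src, sizeof(*dst), err, -EFAULT);
	__uaccess_end();	/* clac(): close it again on every path */
	return err;
}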
@@ -193,10 +196,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) #ifdef CONFIG_X86_32 #define __put_user_asm_u64(x, addr, err, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: movl %%eax,0(%2)\n" \ "2: movl %%edx,4(%2)\n" \ - "3: " ASM_CLAC "\n" \ + "3:" \ ".section .fixup,\"ax\"\n" \ "4: movl %3,%0\n" \ " jmp 3b\n" \ @@ -207,10 +210,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) : "A" (x), "r" (addr), "i" (errret), "0" (err)) #define __put_user_asm_ex_u64(x, addr) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ - "3: " ASM_CLAC "\n" \ + "3:" \ _ASM_EXTABLE_EX(1b, 2b) \ _ASM_EXTABLE_EX(2b, 3b) \ : : "A" (x), "r" (addr)) @@ -304,6 +307,10 @@ do { \ } \ } while (0) +/* + * This doesn't do __uaccess_begin/end - the exception handling + * around it must do that. + */ #define __put_user_size_ex(x, ptr, size) \ do { \ __chk_user_ptr(ptr); \ @@ -358,9 +365,9 @@ do { \ } while (0) #define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: mov"itype" %2,%"rtype"1\n" \ - "2: " ASM_CLAC "\n" \ + "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " xor"itype" %"rtype"1,%"rtype"1\n" \ @@ -370,6 +377,10 @@ do { \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) +/* + * This doesn't do __uaccess_begin/end - the exception handling + * around it must do that. + */ #define __get_user_size_ex(x, ptr, size) \ do { \ __chk_user_ptr(ptr); \ @@ -400,7 +411,9 @@ do { \ #define __put_user_nocheck(x, ptr, size) \ ({ \ int __pu_err; \ + __uaccess_begin(); \ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ + __uaccess_end(); \ __builtin_expect(__pu_err, 0); \ }) @@ -408,7 +421,9 @@ do { \ ({ \ int __gu_err; \ unsigned long __gu_val; \ + __uaccess_begin(); \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ + __uaccess_end(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ __builtin_expect(__gu_err, 0); \ }) @@ -423,9 +438,9 @@ struct __large_struct { unsigned long buf[100]; }; * aliasing issues. */ #define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: mov"itype" %"rtype"1,%2\n" \ - "2: " ASM_CLAC "\n" \ + "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " jmp 2b\n" \ @@ -445,11 +460,11 @@ struct __large_struct { unsigned long buf[100]; }; */ #define uaccess_try do { \ current_thread_info()->uaccess_err = 0; \ - stac(); \ + __uaccess_begin(); \ barrier(); #define uaccess_catch(err) \ - clac(); \ + __uaccess_end(); \ (err) |= (current_thread_info()->uaccess_err ? 
-EFAULT : 0); \ } while (0) @@ -547,12 +562,13 @@ extern void __cmpxchg_wrong_size(void) __typeof__(ptr) __uval = (uval); \ __typeof__(*(ptr)) __old = (old); \ __typeof__(*(ptr)) __new = (new); \ + __uaccess_begin(); \ switch (size) { \ case 1: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -566,9 +582,9 @@ extern void __cmpxchg_wrong_size(void) } \ case 2: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -582,9 +598,9 @@ extern void __cmpxchg_wrong_size(void) } \ case 4: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -601,9 +617,9 @@ extern void __cmpxchg_wrong_size(void) if (!IS_ENABLED(CONFIG_X86_64)) \ __cmpxchg_wrong_size(); \ \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -618,6 +634,7 @@ extern void __cmpxchg_wrong_size(void) default: \ __cmpxchg_wrong_size(); \ } \ + __uaccess_end(); \ *__uval = __old; \ __ret; \ }) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 512ffd621fc3..2957c8237c28 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -57,35 +57,49 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { - case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src, + case 1: + __uaccess_begin(); + __get_user_asm(*(u8 *)dst, (u8 __user *)src, ret, "b", "b", "=q", 1); + __uaccess_end(); return ret; - case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src, + case 2: + __uaccess_begin(); + __get_user_asm(*(u16 *)dst, (u16 __user *)src, ret, "w", "w", "=r", 2); + __uaccess_end(); return ret; - case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src, + case 4: + __uaccess_begin(); + __get_user_asm(*(u32 *)dst, (u32 __user *)src, ret, "l", "k", "=r", 4); + __uaccess_end(); return ret; - case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src, + case 8: + __uaccess_begin(); + __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 8); + __uaccess_end(); return ret; case 10: + __uaccess_begin(); __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 10); - if (unlikely(ret)) - return ret; - __get_user_asm(*(u16 *)(8 + (char *)dst), - (u16 __user *)(8 + (char __user *)src), - ret, "w", "w", "=r", 2); + if (likely(!ret)) + __get_user_asm(*(u16 *)(8 + (char *)dst), + (u16 __user *)(8 + (char __user *)src), + ret, "w", "w", "=r", 2); + __uaccess_end(); return ret; case 16: + __uaccess_begin(); __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 16); - if (unlikely(ret)) - return ret; - __get_user_asm(*(u64 *)(8 + (char *)dst), - (u64 __user *)(8 + (char __user *)src), - ret, "q", "", "=r", 8); + if (likely(!ret)) + __get_user_asm(*(u64 *)(8 + (char *)dst), + (u64 __user *)(8 + (char __user *)src), + ret, "q", "", "=r", 8); + __uaccess_end(); return ret; default: return copy_user_generic(dst, (__force void *)src, size); 
@@ -108,35 +122,51 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { - case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst, + case 1: + __uaccess_begin(); + __put_user_asm(*(u8 *)src, (u8 __user *)dst, ret, "b", "b", "iq", 1); + __uaccess_end(); return ret; - case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst, + case 2: + __uaccess_begin(); + __put_user_asm(*(u16 *)src, (u16 __user *)dst, ret, "w", "w", "ir", 2); + __uaccess_end(); return ret; - case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst, + case 4: + __uaccess_begin(); + __put_user_asm(*(u32 *)src, (u32 __user *)dst, ret, "l", "k", "ir", 4); + __uaccess_end(); return ret; - case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, + case 8: + __uaccess_begin(); + __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 8); + __uaccess_end(); return ret; case 10: + __uaccess_begin(); __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 10); - if (unlikely(ret)) - return ret; - asm("":::"memory"); - __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, - ret, "w", "w", "ir", 2); + if (likely(!ret)) { + asm("":::"memory"); + __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, + ret, "w", "w", "ir", 2); + } + __uaccess_end(); return ret; case 16: + __uaccess_begin(); __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 16); - if (unlikely(ret)) - return ret; - asm("":::"memory"); - __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, - ret, "q", "", "er", 8); + if (likely(!ret)) { + asm("":::"memory"); + __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, + ret, "q", "", "er", 8); + } + __uaccess_end(); return ret; default: return copy_user_generic((__force void *)dst, src, size); @@ -162,39 +192,47 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) switch (size) { case 1: { u8 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u8 __user *)src, ret, "b", "b", "=q", 1); if (likely(!ret)) __put_user_asm(tmp, (u8 __user *)dst, ret, "b", "b", "iq", 1); + __uaccess_end(); return ret; } case 2: { u16 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u16 __user *)src, ret, "w", "w", "=r", 2); if (likely(!ret)) __put_user_asm(tmp, (u16 __user *)dst, ret, "w", "w", "ir", 2); + __uaccess_end(); return ret; } case 4: { u32 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u32 __user *)src, ret, "l", "k", "=r", 4); if (likely(!ret)) __put_user_asm(tmp, (u32 __user *)dst, ret, "l", "k", "ir", 4); + __uaccess_end(); return ret; } case 8: { u64 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u64 __user *)src, ret, "q", "", "=r", 8); if (likely(!ret)) __put_user_asm(tmp, (u64 __user *)dst, ret, "q", "", "er", 8); + __uaccess_end(); return ret; } default: From a5eeb9b4123eee6aa88bb7a43dba919641abeeb6 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:55:12 -0700 Subject: [PATCH 0294/3220] UPSTREAM: mm/slub: support left redzone SLUB already has a redzone debugging feature. But it is only positioned at the end of object (aka right redzone) so it cannot catch left oob. Although current object's right redzone acts as left redzone of next object, first object in a slab cannot take advantage of this effect. This patch explicitly adds a left red zone to each object to detect left oob more precisely. Background: Someone complained to me that left OOB doesn't catch even if KASAN is enabled which does page allocation debugging. 
That page is out of our control so it would be allocated when left OOB happens and, in this case, we can't find OOB. Moreover, SLUB debugging feature can be enabled without page allocator debugging and, in this case, we will miss that OOB. Before trying to implement, I expected that changes would be too complex, but, it doesn't look that complex to me now. Almost changes are applied to debug specific functions so I feel okay. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Change-Id: Ib893a17ecabd692e6c402e864196bf89cd6781a5 (cherry picked from commit d86bd1bece6fc41d59253002db5441fe960a37f6) Signed-off-by: Sami Tolvanen --- include/linux/slub_def.h | 1 + mm/slub.c | 100 +++++++++++++++++++++++++++------------ 2 files changed, 72 insertions(+), 29 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 33885118523c..f4e857e920cd 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -81,6 +81,7 @@ struct kmem_cache { int reserved; /* Reserved bytes at the end of slabs */ const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ + int red_left_pad; /* Left redzone padding size */ #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif diff --git a/mm/slub.c b/mm/slub.c index 470f5e561727..cfd3579cbaeb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } +static inline void *fixup_red_left(struct kmem_cache *s, void *p) +{ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) + p += s->red_left_pad; + + return p; +} + static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) { #ifdef CONFIG_SLUB_CPU_PARTIAL @@ -224,24 +232,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -/* Verify that a pointer has an address that is valid within a slab page */ -static inline int check_valid_pointer(struct kmem_cache *s, - struct page *page, const void *object) -{ - void *base; - - if (!object) - return 1; - - base = page_address(page); - if (object < base || object >= base + page->objects * s->size || - (object - base) % s->size) { - return 0; - } - - return 1; -} - static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); @@ -271,12 +261,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) /* Loop over all objects in a slab */ #define for_each_object(__p, __s, __addr, __objects) \ - for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ - __p += (__s)->size) + for (__p = fixup_red_left(__s, __addr); \ + __p < (__addr) + (__objects) * (__s)->size; \ + __p += (__s)->size) #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ - for (__p = (__addr), __idx = 1; __idx <= __objects;\ - __p += (__s)->size, __idx++) + for (__p = fixup_red_left(__s, __addr), __idx = 1; \ + __idx <= __objects; \ + __p += (__s)->size, __idx++) /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) @@ -456,6 +448,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) set_bit(slab_index(p, s, addr), map); } +static inline int size_from_object(struct kmem_cache *s) +{ + if (s->flags & SLAB_RED_ZONE) + return s->size - s->red_left_pad; + + 
return s->size; +} + +static inline void *restore_red_left(struct kmem_cache *s, void *p) +{ + if (s->flags & SLAB_RED_ZONE) + p -= s->red_left_pad; + + return p; +} + /* * Debug settings: */ @@ -489,6 +497,26 @@ static inline void metadata_access_disable(void) /* * Object debugging */ + +/* Verify that a pointer has an address that is valid within a slab page */ +static inline int check_valid_pointer(struct kmem_cache *s, + struct page *page, void *object) +{ + void *base; + + if (!object) + return 1; + + base = page_address(page); + object = restore_red_left(s, object); + if (object < base || object >= base + page->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + static void print_section(char *text, u8 *addr, unsigned int length) { metadata_access_enable(); @@ -628,7 +656,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); - if (p > addr + 16) + if (s->flags & SLAB_RED_ZONE) + print_section("Redzone ", p - s->red_left_pad, s->red_left_pad); + else if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); print_section("Object ", p, min_t(unsigned long, s->object_size, @@ -645,9 +675,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); - if (off != s->size) + if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ - print_section("Padding ", p + off, s->size - off); + print_section("Padding ", p + off, size_from_object(s) - off); dump_stack(); } @@ -677,6 +707,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = object; + if (s->flags & SLAB_RED_ZONE) + memset(p - s->red_left_pad, val, s->red_left_pad); + if (s->flags & __OBJECT_POISON) { memset(p, POISON_FREE, s->object_size - 1); p[s->object_size - 1] = POISON_END; @@ -769,11 +802,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) /* We also have user information there */ off += 2 * sizeof(struct track); - if (s->size == off) + if (size_from_object(s) == off) return 1; return check_bytes_and_report(s, page, p, "Object padding", - p + off, POISON_INUSE, s->size - off); + p + off, POISON_INUSE, size_from_object(s) - off); } /* Check the pad bytes at the end of a slab page */ @@ -817,6 +850,10 @@ static int check_object(struct kmem_cache *s, struct page *page, u8 *endobject = object + s->object_size; if (s->flags & SLAB_RED_ZONE) { + if (!check_bytes_and_report(s, page, object, "Redzone", + object - s->red_left_pad, val, s->red_left_pad)) + return 0; + if (!check_bytes_and_report(s, page, object, "Redzone", endobject, val, s->inuse - s->object_size)) return 0; @@ -1468,7 +1505,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, p, NULL); } - page->freelist = start; + page->freelist = fixup_red_left(s, start); page->inuse = page->objects; page->frozen = 1; @@ -3283,7 +3320,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ size += 2 * sizeof(struct track); - if (flags & SLAB_RED_ZONE) + if (flags & SLAB_RED_ZONE) { /* * Add some empty padding so that we can catch * overwrites from earlier objects rather than let @@ -3292,6 +3329,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * of the object. 
*/ size += sizeof(void *); + + s->red_left_pad = sizeof(void *); + s->red_left_pad = ALIGN(s->red_left_pad, s->align); + size += s->red_left_pad; + } #endif /* From cd55f3c2be7a55bb708ba0f5fb02ae1c08cc1d33 Mon Sep 17 00:00:00 2001 From: Riley Andrews Date: Tue, 6 Sep 2016 15:16:25 -0700 Subject: [PATCH 0295/3220] cpuset: Make cpusets restore on hotplug This deliberately changes the behavior of the per-cpuset cpus file to not be affected by hotplug. When a cpu is offlined, it will be removed from the cpuset/cpus file. When a cpu is onlined, if the cpuset originally requested that the cpu be part of the cpuset, the cpu will be restored to the cpuset. The cpus files still have to be hierarchical, but the ranges no longer have to be drawn from the currently online cpus, just the physically present cpus. Change-Id: I22cdf33e7d312117bcefba1aeb0125e1ada289a9 Signed-off-by: Dmitry Shmidt --- kernel/cpuset.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0ff01c2a7222..3b4f27981778 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -99,6 +99,7 @@ struct cpuset { /* user-configured CPUs and Memory Nodes allow to tasks */ cpumask_var_t cpus_allowed; + cpumask_var_t cpus_requested; nodemask_t mems_allowed; /* effective CPUs and Memory Nodes allow to tasks */ @@ -385,7 +386,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) { - return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && + return cpumask_subset(p->cpus_requested, q->cpus_requested) && nodes_subset(p->mems_allowed, q->mems_allowed) && is_cpu_exclusive(p) <= is_cpu_exclusive(q) && is_mem_exclusive(p) <= is_mem_exclusive(q); @@ -485,7 +486,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) cpuset_for_each_child(c, css, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && - cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) + cpumask_intersects(trial->cpus_requested, c->cpus_requested)) goto out; if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && @@ -944,17 +945,18 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (!*buf) { cpumask_clear(trialcs->cpus_allowed); } else { - retval = cpulist_parse(buf, trialcs->cpus_allowed); + retval = cpulist_parse(buf, trialcs->cpus_requested); if (retval < 0) return retval; - if (!cpumask_subset(trialcs->cpus_allowed, - top_cpuset.cpus_allowed)) + if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask)) return -EINVAL; + + cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask); } /* Nothing to do if the cpus didn't change */ - if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) + if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested)) return 0; retval = validate_change(cs, trialcs); @@ -963,6 +965,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); + cpumask_copy(cs->cpus_requested, trialcs->cpus_requested); spin_unlock_irq(&callback_lock); /* use trialcs->cpus_allowed as a temp variable */ @@ -1731,7 +1734,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) switch (type) { case FILE_CPULIST: - seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_requested)); break; case FILE_MEMLIST: seq_printf(sf,
"%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); @@ -1920,11 +1923,14 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return ERR_PTR(-ENOMEM); if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) goto free_cs; + if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL)) + goto free_allowed; if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) - goto free_cpus; + goto free_requested; set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); + cpumask_clear(cs->cpus_requested); nodes_clear(cs->mems_allowed); cpumask_clear(cs->effective_cpus); nodes_clear(cs->effective_mems); @@ -1933,7 +1939,9 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return &cs->css; -free_cpus: +free_requested: + free_cpumask_var(cs->cpus_requested); +free_allowed: free_cpumask_var(cs->cpus_allowed); free_cs: kfree(cs); @@ -1996,6 +2004,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->mems_allowed = parent->mems_allowed; cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->cpus_requested, parent->cpus_requested); cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: @@ -2030,6 +2039,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->cpus_allowed); + free_cpumask_var(cs->cpus_requested); kfree(cs); } @@ -2096,8 +2106,11 @@ int __init cpuset_init(void) BUG(); if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) BUG(); + if (!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL)) + BUG(); cpumask_setall(top_cpuset.cpus_allowed); + cpumask_setall(top_cpuset.cpus_requested); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); nodes_setall(top_cpuset.effective_mems); @@ -2231,7 +2244,7 @@ retry: goto retry; } - cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus); + cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus); nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); From 96a970e08bdec910c7780e8f823b3595eef6ba8c Mon Sep 17 00:00:00 2001 From: Phil Turnbull Date: Tue, 2 Feb 2016 13:36:45 -0500 Subject: [PATCH 0296/3220] UPSTREAM: netfilter: nfnetlink: correctly validate length of batch messages (cherry picked from commit c58d6c93680f28ac58984af61d0a7ebf4319c241) If nlh->nlmsg_len is zero then an infinite loop is triggered because 'skb_pull(skb, msglen);' pulls zero bytes. The calculation in nlmsg_len() underflows if 'nlh->nlmsg_len < NLMSG_HDRLEN' which bypasses the length validation and will later trigger an out-of-bound read. If the length validation does fail then the malformed batch message is copied back to userspace. However, we cannot do this because the nlh->nlmsg_len can be invalid. This leads to an out-of-bounds read in netlink_ack: [ 41.455421] ================================================================== [ 41.456431] BUG: KASAN: slab-out-of-bounds in memcpy+0x1d/0x40 at addr ffff880119e79340 [ 41.456431] Read of size 4294967280 by task a.out/987 [ 41.456431] ============================================================================= [ 41.456431] BUG kmalloc-512 (Not tainted): kasan: bad access detected [ 41.456431] ----------------------------------------------------------------------------- ... 
[ 41.456431] Bytes b4 ffff880119e79310: 00 00 00 00 d5 03 00 00 b0 fb fe ff 00 00 00 00 ................ [ 41.456431] Object ffff880119e79320: 20 00 00 00 10 00 05 00 00 00 00 00 00 00 00 00 ............... [ 41.456431] Object ffff880119e79330: 14 00 0a 00 01 03 fc 40 45 56 11 22 33 10 00 05 .......@EV."3... [ 41.456431] Object ffff880119e79340: f0 ff ff ff 88 99 aa bb 00 14 00 0a 00 06 fe fb ................ ^^ start of batch nlmsg with nlmsg_len=4294967280 ... [ 41.456431] Memory state around the buggy address: [ 41.456431] ffff880119e79400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 41.456431] ffff880119e79480: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 41.456431] >ffff880119e79500: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc [ 41.456431] ^ [ 41.456431] ffff880119e79580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 41.456431] ffff880119e79600: fc fc fc fc fc fc fc fc fc fc fb fb fb fb fb fb [ 41.456431] ================================================================== Fix this with better validation of nlh->nlmsg_len and by setting NFNL_BATCH_FAILURE if any batch message fails length validation. CAP_NET_ADMIN is required to trigger the bugs. Fixes: 9ea2aa8b7dba ("netfilter: nfnetlink: validate nfnetlink header from batch") Signed-off-by: Phil Turnbull Signed-off-by: Pablo Neira Ayuso Change-Id: Id3e15c40cb464bf2791af907c235d8a316b2449c Bug: 30947055 --- net/netfilter/nfnetlink.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 77afe913d03d..9adedba78eea 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -326,10 +326,12 @@ replay: nlh = nlmsg_hdr(skb); err = 0; - if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) || - skb->len < nlh->nlmsg_len) { - err = -EINVAL; - goto ack; + if (nlh->nlmsg_len < NLMSG_HDRLEN || + skb->len < nlh->nlmsg_len || + nlmsg_len(nlh) < sizeof(struct nfgenmsg)) { + nfnl_err_reset(&err_list); + status |= NFNL_BATCH_FAILURE; + goto done; } /* Only requests are handled by the kernel */ From 6b93f8214eabf0f363eab5283c2ad18b5bc33135 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Fri, 27 Nov 2015 14:30:21 -0500 Subject: [PATCH 0297/3220] UPSTREAM: tty: Prevent ldisc drivers from re-using stale tty fields (cherry picked from commit dd42bf1197144ede075a9d4793123f7689e164bc) Line discipline drivers may mistakenly misuse ldisc-related fields when initializing. For example, a failure to initialize tty->receive_room in the N_GIGASET_M101 line discipline was recently found and fixed [1]. Now, the N_X25 line discipline has been discovered accessing the previous line discipline's already-freed private data [2]. Harden the ldisc interface against misuse by initializing relevant tty fields before instancing the new line discipline. [1] commit fd98e9419d8d622a4de91f76b306af6aa627aa9c Author: Tilman Schmidt Date: Tue Jul 14 00:37:13 2015 +0200 isdn/gigaset: reset tty->receive_room when attaching ser_gigaset [2] Report from Sasha Levin [ 634.336761] ================================================================== [ 634.338226] BUG: KASAN: use-after-free in x25_asy_open_tty+0x13d/0x490 at addr ffff8800a743efd0 [ 634.339558] Read of size 4 by task syzkaller_execu/8981 [ 634.340359] ============================================================================= [ 634.341598] BUG kmalloc-512 (Not tainted): kasan: bad access detected ...
[ 634.405018] Call Trace: [ 634.405277] dump_stack (lib/dump_stack.c:52) [ 634.405775] print_trailer (mm/slub.c:655) [ 634.406361] object_err (mm/slub.c:662) [ 634.406824] kasan_report_error (mm/kasan/report.c:138 mm/kasan/report.c:236) [ 634.409581] __asan_report_load4_noabort (mm/kasan/report.c:279) [ 634.411355] x25_asy_open_tty (drivers/net/wan/x25_asy.c:559 (discriminator 1)) [ 634.413997] tty_ldisc_open.isra.2 (drivers/tty/tty_ldisc.c:447) [ 634.414549] tty_set_ldisc (drivers/tty/tty_ldisc.c:567) [ 634.415057] tty_ioctl (drivers/tty/tty_io.c:2646 drivers/tty/tty_io.c:2879) [ 634.423524] do_vfs_ioctl (fs/ioctl.c:43 fs/ioctl.c:607) [ 634.427491] SyS_ioctl (fs/ioctl.c:622 fs/ioctl.c:613) [ 634.427945] entry_SYSCALL_64_fastpath (arch/x86/entry/entry_64.S:188) Cc: Tilman Schmidt Cc: Sasha Levin Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman Change-Id: Ibed6feadfb9706d478f93feec3b240aecfc64af3 Bug: 30951112 --- drivers/tty/tty_ldisc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index 629e3c865072..9bee25cfa0be 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -417,6 +417,10 @@ EXPORT_SYMBOL_GPL(tty_ldisc_flush); * they are not on hot paths so a little discipline won't do * any harm. * + * The line discipline-related tty_struct fields are reset to + * prevent the ldisc driver from re-using stale information for + * the new ldisc instance. + * * Locking: takes termios_rwsem */ @@ -425,6 +429,9 @@ static void tty_set_termios_ldisc(struct tty_struct *tty, int num) down_write(&tty->termios_rwsem); tty->termios.c_line = num; up_write(&tty->termios_rwsem); + + tty->disc_data = NULL; + tty->receive_room = 0; } /** From 65a2cd6d500d2620e5a5aa156b518d5fd25d492a Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Wed, 7 Sep 2016 17:39:42 -0700 Subject: [PATCH 0298/3220] Android: Fix build breakages. The IO latency histogram change broke allmodconfig and allnoconfig builds. This fixes those breakages. Change-Id: I9cdae655b40ed155468f3cef25cdb74bb56c4d3e Signed-off-by: Mohan Srinivasan --- block/blk-core.c | 2 ++ include/linux/mmc/host.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 840713279726..64f3455a83ef 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3560,6 +3560,7 @@ blk_zero_latency_hist(struct io_latency_state *s) s->latency_reads_elems = 0; s->latency_writes_elems = 0; } +EXPORT_SYMBOL(blk_zero_latency_hist); ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf) @@ -3621,3 +3622,4 @@ blk_latency_hist_show(struct io_latency_state *s, char *buf) } return bytes_written; } +EXPORT_SYMBOL(blk_latency_hist_show); diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index e4862f7cdede..97b2b0b1f99d 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -380,8 +380,10 @@ struct mmc_host { } embedded_sdio_data; #endif +#ifdef CONFIG_BLOCK int latency_hist_enabled; struct io_latency_state io_lat_s; +#endif unsigned long private[0] ____cacheline_aligned; }; From 9f139d106df7b6a1805b0e4f3b234cedf25d2bbb Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 1 Jul 2016 00:39:35 -0700 Subject: [PATCH 0299/3220] UPSTREAM: block: fix use-after-free in sys_ioprio_get() (cherry picked from commit 8ba8682107ee2ca3347354e018865d8e1967c5f4) get_task_ioprio() accesses the task->io_context without holding the task lock and thus can race with exit_io_context(), leading to a use-after-free. 
The reproducer below hits this within a few seconds on my 4-core QEMU VM: #define _GNU_SOURCE #include #include #include #include int main(int argc, char **argv) { pid_t pid, child; long nproc, i; /* ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); */ syscall(SYS_ioprio_set, 1, 0, 0x6000); nproc = sysconf(_SC_NPROCESSORS_ONLN); for (i = 0; i < nproc; i++) { pid = fork(); assert(pid != -1); if (pid == 0) { for (;;) { pid = fork(); assert(pid != -1); if (pid == 0) { _exit(0); } else { child = wait(NULL); assert(child == pid); } } } pid = fork(); assert(pid != -1); if (pid == 0) { for (;;) { /* ioprio_get(IOPRIO_WHO_PGRP, 0); */ syscall(SYS_ioprio_get, 2, 0); } } } for (;;) { /* ioprio_get(IOPRIO_WHO_PGRP, 0); */ syscall(SYS_ioprio_get, 2, 0); } return 0; } This gets us KASAN dumps like this: [ 35.526914] ================================================================== [ 35.530009] BUG: KASAN: out-of-bounds in get_task_ioprio+0x7b/0x90 at addr ffff880066f34e6c [ 35.530009] Read of size 2 by task ioprio-gpf/363 [ 35.530009] ============================================================================= [ 35.530009] BUG blkdev_ioc (Not tainted): kasan: bad access detected [ 35.530009] ----------------------------------------------------------------------------- [ 35.530009] Disabling lock debugging due to kernel taint [ 35.530009] INFO: Allocated in create_task_io_context+0x2b/0x370 age=0 cpu=0 pid=360 [ 35.530009] ___slab_alloc+0x55d/0x5a0 [ 35.530009] __slab_alloc.isra.20+0x2b/0x40 [ 35.530009] kmem_cache_alloc_node+0x84/0x200 [ 35.530009] create_task_io_context+0x2b/0x370 [ 35.530009] get_task_io_context+0x92/0xb0 [ 35.530009] copy_process.part.8+0x5029/0x5660 [ 35.530009] _do_fork+0x155/0x7e0 [ 35.530009] SyS_clone+0x19/0x20 [ 35.530009] do_syscall_64+0x195/0x3a0 [ 35.530009] return_from_SYSCALL_64+0x0/0x6a [ 35.530009] INFO: Freed in put_io_context+0xe7/0x120 age=0 cpu=0 pid=1060 [ 35.530009] __slab_free+0x27b/0x3d0 [ 35.530009] kmem_cache_free+0x1fb/0x220 [ 35.530009] put_io_context+0xe7/0x120 [ 35.530009] put_io_context_active+0x238/0x380 [ 35.530009] exit_io_context+0x66/0x80 [ 35.530009] do_exit+0x158e/0x2b90 [ 35.530009] do_group_exit+0xe5/0x2b0 [ 35.530009] SyS_exit_group+0x1d/0x20 [ 35.530009] entry_SYSCALL_64_fastpath+0x1a/0xa4 [ 35.530009] INFO: Slab 0xffffea00019bcd00 objects=20 used=4 fp=0xffff880066f34ff0 flags=0x1fffe0000004080 [ 35.530009] INFO: Object 0xffff880066f34e58 @offset=3672 fp=0x0000000000000001 [ 35.530009] ================================================================== Fix it by grabbing the task lock while we poke at the io_context. 
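The race is easiest to see as a timeline; the sketch below is editorial, but the locking matches the diff that follows, and exit_io_context() already takes the same task lock when it clears the pointer:

	/*
	 * Before (no lock on the read side):
	 *
	 *   CPU0: sys_ioprio_get              CPU1: do_exit -> exit_io_context
	 *     if (p->io_context)                task_lock(p);
	 *                                       ioc = p->io_context;
	 *                                       p->io_context = NULL;
	 *                                       task_unlock(p);
	 *                                       put_io_context(ioc);   frees ioc
	 *     ret = p->io_context->ioprio;      <-- use-after-free
	 *
	 * After: the reader holds the same lock, so it either sees the old
	 * io_context while it is still live, or sees NULL.
	 */
	task_lock(p);
	if (p->io_context)
		ret = p->io_context->ioprio;
	task_unlock(p);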
Cc: stable@vger.kernel.org Reported-by: Dmitry Vyukov Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe Change-Id: I3f5858cc9a1b9d4124ae7a6578660dec219d2c57 Bug: 30946378 --- block/ioprio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/ioprio.c b/block/ioprio.c index cc7800e9eb44..01b8116298a1 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -150,8 +150,10 @@ static int get_task_ioprio(struct task_struct *p) if (ret) goto out; ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + task_lock(p); if (p->io_context) ret = p->io_context->ioprio; + task_unlock(p); out: return ret; } From d942dd3b7c156b005bf01c33af729ab783182f4c Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 19 Jan 2016 12:34:58 +0100 Subject: [PATCH 0300/3220] UPSTREAM: HID: core: prevent out-of-bound readings (cherry picked from commit 50220dead1650609206efe91f0cc116132d59b3f) Plugging a Logitech DJ receiver with KASAN activated raises a bunch of out-of-bound readings. The fields are allocated up to MAX_USAGE, meaning that potentially, we do not have enough fields to fit the incoming values. Add checks and silence KASAN. Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina Change-Id: Iaf25e882a6696884439d7091b5fbb0b350d893d3 Bug: 30951261 --- drivers/hid/hid-core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index c6f7a694f67a..fa5f81bbc95f 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1251,6 +1251,7 @@ static void hid_input_field(struct hid_device *hid, struct hid_field *field, /* Ignore report if ErrorRollOver */ if (!(field->flags & HID_MAIN_ITEM_VARIABLE) && value[n] >= min && value[n] <= max && + value[n] - min < field->maxusage && field->usage[value[n] - min].hid == HID_UP_KEYBOARD + 1) goto exit; } @@ -1263,11 +1264,13 @@ static void hid_input_field(struct hid_device *hid, struct hid_field *field, } if (field->value[n] >= min && field->value[n] <= max + && field->value[n] - min < field->maxusage && field->usage[field->value[n] - min].hid && search(value, field->value[n], count)) hid_process_event(hid, field, &field->usage[field->value[n] - min], 0, interrupt); if (value[n] >= min && value[n] <= max + && value[n] - min < field->maxusage && field->usage[value[n] - min].hid && search(field->value, value[n], count)) hid_process_event(hid, field, &field->usage[value[n] - min], 1, interrupt); From 1a4f17ec07cc4a938b6695e92e4f2b2ec7c3d527 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 6 Sep 2016 11:56:01 -0700 Subject: [PATCH 0301/3220] UPSTREAM: x86/uaccess: force copy_*_user() to be inlined As already done with __copy_*_user(), mark copy_*_user() as __always_inline. Without this, the checks for things like __builtin_const_p() won't work consistently in either hardened usercopy nor the recent adjustments for detecting usercopy overflows at compile time. 
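Whether those checks fire depends on where the compiler expands the body. A hypothetical call site (not from the patch; buf and ubuf are illustrative) shows what forcing the inline buys:

	char buf[16];

	/*
	 * With copy_from_user() marked __always_inline, the body is expanded
	 * here, __builtin_constant_p(sizeof(buf)) evaluates to true, and the
	 * compile-time object-size check can run. Left as an out-of-line
	 * call, n would be an ordinary runtime parameter and the predicate
	 * would always be false.
	 */
	if (copy_from_user(buf, ubuf, sizeof(buf)))	/* n is the constant 16 */
		return -EFAULT;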
The change in kernel text size is detectable, but very small: text data bss dec hex filename 12118735 5768608 14229504 32116847 1ea106f vmlinux.before 12120207 5768608 14229504 32118319 1ea162f vmlinux.after Signed-off-by: Kees Cook Change-Id: I284c85c2a782145f46655a91d4f83874c90eba61 (cherry picked from commit e6971009a95a74f28c58bbae415c40effad1226c) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index be439e246d91..dbe64f27280e 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -706,7 +706,7 @@ __copy_from_user_overflow(int size, unsigned long count) #endif -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { int sz = __compiletime_object_size(to); @@ -742,7 +742,7 @@ copy_from_user(void *to, const void __user *from, unsigned long n) return n; } -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { int sz = __compiletime_object_size(from); From 1fd70f71e8f428f0c191a6b86e72e700dbe34bed Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 31 Aug 2016 16:04:21 -0700 Subject: [PATCH 0302/3220] BACKPORT: usercopy: fold builtin_const check into inline function Instead of having each caller of check_object_size() need to remember to check for a const size parameter, move the check into check_object_size() itself. This actually matches the original implementation in PaX, though this commit cleans up the now-redundant builtin_const() calls in the various architectures. Signed-off-by: Kees Cook Change-Id: I348809399c10ffa051251866063be674d064b9ff (cherry picked from 81409e9e28058811c9ea865345e1753f8f677e44) Signed-off-by: Sami Tolvanen --- include/linux/thread_info.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 0ae29ff9ccfd..eded095fe81e 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -161,7 +161,8 @@ extern void __check_object_size(const void *ptr, unsigned long n, static inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { - __check_object_size(ptr, n, to_user); + if (!__builtin_constant_p(n)) + __check_object_size(ptr, n, to_user); } #else static inline void check_object_size(const void *ptr, unsigned long n, From 93584e7e1b4ea543ed04cff643186946196eaebb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Sep 2016 09:39:32 -0700 Subject: [PATCH 0303/3220] UPSTREAM: usercopy: force check_object_size() inline Just for good measure, make sure that check_object_size() is always inlined too, as already done for copy_*_user() and __copy_*_user(). 
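Together with the previous patch, this makes the fast path fully decidable at compile time. A hypothetical caller (buffer and length names are illustrative) shows both halves:

	char name[32];

	/* Constant size: !__builtin_constant_p(n) is false, so the call to
	 * __check_object_size() is compiled out entirely. */
	check_object_size(name, sizeof(name), false);

	/* Runtime size: the check is kept, and __always_inline guarantees the
	 * constant-ness test is evaluated in this context rather than behind
	 * a call frame. */
	check_object_size(name, len, false);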
Suggested-by: Linus Torvalds Signed-off-by: Kees Cook Change-Id: Ibfdf4790d03fe426e68d9a864c55a0d1bbfb7d61 (cherry picked from commit a85d6b8242dc78ef3f4542a0f979aebcbe77fc4e) Signed-off-by: Sami Tolvanen --- include/linux/thread_info.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index eded095fe81e..4cf89517783a 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -158,8 +158,8 @@ static inline int arch_within_stack_frames(const void * const stack, extern void __check_object_size(const void *ptr, unsigned long n, bool to_user); -static inline void check_object_size(const void *ptr, unsigned long n, - bool to_user) +static __always_inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) { if (!__builtin_constant_p(n)) __check_object_size(ptr, n, to_user); From 92e04f8a13a8bb56a7544e56978663dc2e4f82cc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Sep 2016 09:54:34 -0700 Subject: [PATCH 0304/3220] UPSTREAM: usercopy: remove page-spanning test for now A custom allocator without __GFP_COMP that copies to userspace has been found in vmw_execbuf_process[1], so this disables the page-span checker by placing it behind a CONFIG for future work where such things can be tracked down later. [1] https://bugzilla.redhat.com/show_bug.cgi?id=1373326 Reported-by: Vinson Lee Fixes: f5509cc18daa ("mm: Hardened usercopy") Signed-off-by: Kees Cook Change-Id: I4177c0fb943f14a5faf5c70f5e54bf782c316f43 (cherry picked from commit 8e1f74ea02cf4562404c48c6882214821552c13f) Signed-off-by: Sami Tolvanen --- mm/usercopy.c | 61 +++++++++++++++++++++++++++--------------------- security/Kconfig | 11 +++++++++ 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index f78015e8b1e5..b34996a3860b 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -135,30 +135,15 @@ static inline const char *check_bogus_address(const void *ptr, unsigned long n) return NULL; } -static inline const char *check_heap_object(const void *ptr, unsigned long n, - bool to_user) +/* Checks for allocs that are marked in some way as spanning multiple pages. */ +static inline const char *check_page_span(const void *ptr, unsigned long n, + struct page *page, bool to_user) { - struct page *page, *endpage; +#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN const void *end = ptr + n - 1; + struct page *endpage; bool is_reserved, is_cma; - /* - * Some architectures (arm64) return true for virt_addr_valid() on - * vmalloced addresses. Work around this by checking for vmalloc - * first. - */ - if (is_vmalloc_addr(ptr)) - return NULL; - - if (!virt_addr_valid(ptr)) - return NULL; - - page = virt_to_head_page(ptr); - - /* Check slab allocator for flags and size. */ - if (PageSlab(page)) - return __check_heap_object(ptr, n, page); - /* * Sometimes the kernel data regions are not marked Reserved (see * check below). And sometimes [_sdata,_edata) does not cover @@ -187,7 +172,7 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, ((unsigned long)end & (unsigned long)PAGE_MASK))) return NULL; - /* Allow if start and end are inside the same compound page. */ + /* Allow if fully inside the same compound (__GFP_COMP) page. 
*/ endpage = virt_to_head_page(end); if (likely(endpage == page)) return NULL; @@ -200,20 +185,44 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, is_reserved = PageReserved(page); is_cma = is_migrate_cma_page(page); if (!is_reserved && !is_cma) - goto reject; + return ""; for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) { page = virt_to_head_page(ptr); if (is_reserved && !PageReserved(page)) - goto reject; + return ""; if (is_cma && !is_migrate_cma_page(page)) - goto reject; + return ""; } +#endif return NULL; +} -reject: - return ""; +static inline const char *check_heap_object(const void *ptr, unsigned long n, + bool to_user) +{ + struct page *page; + + /* + * Some architectures (arm64) return true for virt_addr_valid() on + * vmalloced addresses. Work around this by checking for vmalloc + * first. + */ + if (is_vmalloc_addr(ptr)) + return NULL; + + if (!virt_addr_valid(ptr)) + return NULL; + + page = virt_to_head_page(ptr); + + /* Check slab allocator for flags and size. */ + if (PageSlab(page)) + return __check_heap_object(ptr, n, page); + + /* Verify object does not incorrectly span multiple pages. */ + return check_page_span(ptr, n, page, to_user); } /* diff --git a/security/Kconfig b/security/Kconfig index 2b42c225de28..3aa60791f84d 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -156,6 +156,17 @@ config HARDENED_USERCOPY or are part of the kernel text. This kills entire classes of heap overflow exploits and similar kernel memory exposures. +config HARDENED_USERCOPY_PAGESPAN + bool "Refuse to copy allocations that span multiple pages" + depends on HARDENED_USERCOPY + depends on !COMPILE_TEST + help + When a multi-page allocation is done without __GFP_COMP, + hardened usercopy will reject attempts to copy it. There are, + however, several cases of this in the kernel that have not all + been removed. This config is intended to be used only while + trying to find such users. + source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig From a38338f1cd595f6a66870ba46cc6a4ae7378c084 Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Wed, 31 Aug 2016 08:09:04 -0700 Subject: [PATCH 0305/3220] FROMLIST: pstore: drop pmsg bounce buffer (from https://lkml.org/lkml/2016/9/1/428) (cherry pick from android-3.10 commit b58133100b38f2bf83cad2d7097417a3a196ed0b) Removing a bounce buffer copy operation in the pmsg driver path is always better. We also gain in overall performance by not requesting a vmalloc on every write as this can cause precious RT tasks, such as user facing media operation, to stall while memory is being reclaimed. Added a write_buf_user to the pstore functions, a backup platform write_buf_user that uses the small buffer that is part of the instance, and implemented a ramoops write_buf_user that only supports PSTORE_TYPE_PMSG. 
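From user space nothing changes: a write to the pmsg device still produces one persistent record, it just no longer round-trips through a vmalloc'd bounce buffer. A hypothetical smoke test (not part of the patch; assumes the usual /dev/pmsg0 node):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char msg[] = "boot-marker: pmsg write path";
		int fd = open("/dev/pmsg0", O_WRONLY);

		if (fd < 0)
			return 1;
		/* One write() == one record; the driver now copies the user
		 * buffer straight into the persistent ram zone. */
		if (write(fd, msg, strlen(msg)) != (ssize_t)strlen(msg)) {
			close(fd);
			return 1;
		}
		return close(fd) ? 1 : 0;
	}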
Signed-off-by: Mark Salyzyn Bug: 31057326 Change-Id: I4cdee1cd31467aa3e6c605bce2fbd4de5b0f8caa --- fs/pstore/platform.c | 36 +++++++++++++++++++++++++++++ fs/pstore/pmsg.c | 35 +++++----------------------- fs/pstore/ram.c | 19 +++++++++++++++ fs/pstore/ram_core.c | 47 ++++++++++++++++++++++++++++++++++++-- include/linux/pstore.h | 11 ++++++--- include/linux/pstore_ram.h | 7 ++++-- 6 files changed, 119 insertions(+), 36 deletions(-) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 588461bb2dd4..40a0fe0a4e05 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -431,6 +431,40 @@ static int pstore_write_compat(enum pstore_type_id type, size, psi); } +static int pstore_write_buf_user_compat(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, + const char __user *buf, + bool compressed, size_t size, + struct pstore_info *psi) +{ + unsigned long flags = 0; + size_t i, bufsize = size; + long ret = 0; + + if (unlikely(!access_ok(VERIFY_READ, buf, size))) + return -EFAULT; + if (bufsize > psinfo->bufsize) + bufsize = psinfo->bufsize; + spin_lock_irqsave(&psinfo->buf_lock, flags); + for (i = 0; i < size; ) { + size_t c = min(size - i, bufsize); + + ret = __copy_from_user(psinfo->buf, buf + i, c); + if (unlikely(ret != 0)) { + ret = -EFAULT; + break; + } + ret = psi->write_buf(type, reason, id, part, psinfo->buf, + compressed, c, psi); + if (unlikely(ret < 0)) + break; + i += c; + } + spin_unlock_irqrestore(&psinfo->buf_lock, flags); + return unlikely(ret < 0) ? ret : size; +} + /* * platform specific persistent storage driver registers with * us here. If pstore is already mounted, call the platform @@ -453,6 +487,8 @@ int pstore_register(struct pstore_info *psi) if (!psi->write) psi->write = pstore_write_compat; + if (!psi->write_buf_user) + psi->write_buf_user = pstore_write_buf_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); spin_unlock(&pstore_lock); diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index 7de20cd3797f..78f6176c020f 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -19,48 +19,25 @@ #include "internal.h" static DEFINE_MUTEX(pmsg_lock); -#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE) static ssize_t write_pmsg(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - size_t i, buffer_size; - char *buffer; + u64 id; + int ret; if (!count) return 0; + /* check outside lock, page in any data. write_buf_user also checks */ if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; - buffer_size = count; - if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE) - buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE; - buffer = vmalloc(buffer_size); - if (!buffer) - return -ENOMEM; - mutex_lock(&pmsg_lock); - for (i = 0; i < count; ) { - size_t c = min(count - i, buffer_size); - u64 id; - long ret; - - ret = __copy_from_user(buffer, buf + i, c); - if (unlikely(ret != 0)) { - mutex_unlock(&pmsg_lock); - vfree(buffer); - return -EFAULT; - } - psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c, - psinfo); - - i += c; - } - + ret = psinfo->write_buf_user(PSTORE_TYPE_PMSG, 0, &id, 0, buf, 0, count, + psinfo); mutex_unlock(&pmsg_lock); - vfree(buffer); - return count; + return ret ? 
ret : count; } static const struct file_operations pmsg_fops = { diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 414041342a99..5b10c2b4146c 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -331,6 +331,24 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, return 0; } +static int notrace ramoops_pstore_write_buf_user(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, + const char __user *buf, + bool compressed, size_t size, + struct pstore_info *psi) +{ + if (type == PSTORE_TYPE_PMSG) { + struct ramoops_context *cxt = psi->data; + + if (!cxt->mprz) + return -ENOMEM; + return persistent_ram_write_user(cxt->mprz, buf, size); + } + + return -EINVAL; +} + static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, struct timespec time, struct pstore_info *psi) { @@ -369,6 +387,7 @@ static struct ramoops_context oops_cxt = { .open = ramoops_pstore_open, .read = ramoops_pstore_read, .write_buf = ramoops_pstore_write_buf, + .write_buf_user = ramoops_pstore_write_buf_user, .erase = ramoops_pstore_erase, }, }; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 76c3f80efdfa..aa9afe573155 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -17,15 +17,16 @@ #include #include #include -#include #include #include +#include #include #include +#include #include #include +#include #include -#include #include struct persistent_ram_buffer { @@ -303,6 +304,16 @@ static void notrace persistent_ram_update(struct persistent_ram_zone *prz, persistent_ram_update_ecc(prz, start, count); } +static int notrace persistent_ram_update_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int start, unsigned int count) +{ + struct persistent_ram_buffer *buffer = prz->buffer; + int ret = unlikely(__copy_from_user(buffer->data + start, s, count)) ? + -EFAULT : 0; + persistent_ram_update_ecc(prz, start, count); + return ret; +} + void persistent_ram_save_old(struct persistent_ram_zone *prz) { struct persistent_ram_buffer *buffer = prz->buffer; @@ -356,6 +367,38 @@ int notrace persistent_ram_write(struct persistent_ram_zone *prz, return count; } +int notrace persistent_ram_write_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int count) +{ + int rem, ret = 0, c = count; + size_t start; + + if (unlikely(!access_ok(VERIFY_READ, s, count))) + return -EFAULT; + if (unlikely(c > prz->buffer_size)) { + s += c - prz->buffer_size; + c = prz->buffer_size; + } + + buffer_size_add(prz, c); + + start = buffer_start_add(prz, c); + + rem = prz->buffer_size - start; + if (unlikely(rem < c)) { + ret = persistent_ram_update_user(prz, s, start, rem); + s += rem; + c -= rem; + start = 0; + } + if (likely(!ret)) + ret = persistent_ram_update_user(prz, s, start, c); + + persistent_ram_update_header_ecc(prz); + + return unlikely(ret) ? 
ret : count; +} + size_t persistent_ram_old_size(struct persistent_ram_zone *prz) { return prz->old_log_size; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 831479f8df8f..5cae2c6c90ad 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -22,12 +22,13 @@ #ifndef _LINUX_PSTORE_H #define _LINUX_PSTORE_H -#include +#include +#include #include #include -#include #include -#include +#include +#include /* types */ enum pstore_type_id { @@ -67,6 +68,10 @@ struct pstore_info { enum kmsg_dump_reason reason, u64 *id, unsigned int part, const char *buf, bool compressed, size_t size, struct pstore_info *psi); + int (*write_buf_user)(enum pstore_type_id type, + enum kmsg_dump_reason reason, u64 *id, + unsigned int part, const char __user *buf, + bool compressed, size_t size, struct pstore_info *psi); int (*erase)(enum pstore_type_id type, u64 id, int count, struct timespec time, struct pstore_info *psi); diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h index 712757f320a4..45ac5a0d29ee 100644 --- a/include/linux/pstore_ram.h +++ b/include/linux/pstore_ram.h @@ -17,11 +17,12 @@ #ifndef __LINUX_PSTORE_RAM_H__ #define __LINUX_PSTORE_RAM_H__ +#include #include +#include #include #include #include -#include struct persistent_ram_buffer; struct rs_control; @@ -59,7 +60,9 @@ void persistent_ram_free(struct persistent_ram_zone *prz); void persistent_ram_zap(struct persistent_ram_zone *prz); int persistent_ram_write(struct persistent_ram_zone *prz, const void *s, - unsigned int count); + unsigned int count); +int persistent_ram_write_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int count); void persistent_ram_save_old(struct persistent_ram_zone *prz); size_t persistent_ram_old_size(struct persistent_ram_zone *prz); From 81330959ee15a682fbefcdd4f5350a979d8e8fdb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 8 Sep 2016 16:43:21 -0700 Subject: [PATCH 0306/3220] cpufreq: Kconfig: Fixup incorrect selection by CPU_FREQ_DEFAULT_GOV_SCHED The CPU_FREQ_DEFAULT_GOV_SCHED option is incorrectly selecting CPU_FREQ_GOV_INTERACTIVE, when it should be selecting CPU_FREQ_GOV_SCHED. Signed-off-by: John Stultz --- drivers/cpufreq/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index d423f51d3276..a2d63d86bb70 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -105,7 +105,7 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE config CPU_FREQ_DEFAULT_GOV_SCHED bool "sched" - select CPU_FREQ_GOV_INTERACTIVE + select CPU_FREQ_GOV_SCHED help Use the CPUfreq governor 'sched' as default. This scales cpu frequency using CPU utilization estimates from the From 1c81f45725d2cee3c11e1d213eb30cc7f8feea0a Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 25 Aug 2016 11:06:37 +0530 Subject: [PATCH 0307/3220] sched/walt: include missing header for arm_timer_read_counter() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include clocksource/arm_arch_timer.h to fix implicit function declaration of ‘arch_timer_read_counter’ build error for ARCH=arm. 
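For illustration, the failing pattern reduces to calling a function with no prototype in scope, which -Werror=implicit-function-declaration turns into a hard error. A minimal sketch (the wrapper name is hypothetical; only the include and the call mirror this patch):

    #include <clocksource/arm_arch_timer.h> /* declares arch_timer_read_counter() */

    static u64 walt_read_counter(void)      /* hypothetical helper */
    {
            /* without the include above, this call is an implicit
             * declaration and the ARCH=arm build fails */
            return arch_timer_read_counter();
    }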
Signed-off-by: Amit Pundir [jstultz: Cherry-picked from common/android-3.18] Signed-off-by: John Stultz --- kernel/sched/walt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index d9d09914ce30..07b7f84b37e2 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "sched.h" #include "walt.h" From 0f959e0a3261539eb14420d02a491188cc52f9e8 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 15 Mar 2016 12:14:49 +0100 Subject: [PATCH 0308/3220] BACKPORT: ALSA: usb-audio: Minor code cleanup in create_fixed_stream_quirk() (cherry picked from commit 902eb7fd1e4af3ac69b9b30f8373f118c92b9729) Just a minor code cleanup: unify the error paths. Signed-off-by: Takashi Iwai Change-Id: I8253a86235df2ac1258153c9e128fa158527567f Bug: 30952477 --- sound/usb/quirks.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index b6c0c8e3b450..a85b1953d83a 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -167,16 +167,12 @@ static int create_fixed_stream_quirk(struct snd_usb_audio *chip, stream = (fp->endpoint & USB_DIR_IN) ? SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK; err = snd_usb_add_audio_stream(chip, stream, fp); - if (err < 0) { - kfree(fp); - kfree(rate_table); - return err; - } + if (err < 0) + goto error; if (fp->iface != get_iface_desc(&iface->altsetting[0])->bInterfaceNumber || fp->altset_idx >= iface->num_altsetting) { - kfree(fp); - kfree(rate_table); - return -EINVAL; + err = -EINVAL; + goto error; } alts = &iface->altsetting[fp->altset_idx]; altsd = get_iface_desc(alts); @@ -190,6 +186,11 @@ static int create_fixed_stream_quirk(struct snd_usb_audio *chip, snd_usb_init_pitch(chip, fp->iface, alts, fp); snd_usb_init_sample_rate(chip, fp->iface, alts, fp, fp->rate_max); return 0; + + error: + kfree(fp); + kfree(rate_table); + return err; } static int create_auto_pcm_quirk(struct snd_usb_audio *chip, From 571763be0e7f0aab3aa3eb02bb315b63539cc10c Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Thu, 31 Mar 2016 12:05:43 -0400 Subject: [PATCH 0309/3220] UPSTREAM: ALSA: usb-audio: Fix double-free in error paths after snd_usb_add_audio_stream() call (cherry picked from commit 836b34a935abc91e13e63053d0a83b24dfb5ea78) create_fixed_stream_quirk(), snd_usb_parse_audio_interface() and create_uaxx_quirk() functions allocate the audioformat object by themselves and free it upon error before returning. However, once the object is linked to a stream, it's freed again in snd_usb_audio_pcm_free(), thus it'll be double-freed, eventually resulting in a memory corruption. This patch fixes these failures in the error paths by unlinking the audioformat object before freeing it. 
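The shape of the fix, as a short hedged sketch (the names here are illustrative stand-ins, not the driver's API): once an object is linked into a list that the release path walks and frees, an error path must unlink it before freeing it itself.

    struct fmt {                    /* stand-in for the audioformat */
            struct list_head list;
            /* ... */
    };

    fp = kzalloc(sizeof(*fp), GFP_KERNEL);
    INIT_LIST_HEAD(&fp->list);      /* keeps list_del() below safe */

    err = add_stream(chip, fp);     /* may link fp into a fmt_list */
    if (err < 0) {
            list_del(&fp->list);    /* unlink, or release frees fp again */
            kfree(fp);
            return err;
    }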
Based on a patch by Takashi Iwai [Note for stable backports: this patch requires the commit 902eb7fd1e4a ('ALSA: usb-audio: Minor code cleanup in create_fixed_stream_quirk()')] Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1283358 Reported-by: Ralf Spenneberg Cc: # see the note above Signed-off-by: Vladis Dronov Signed-off-by: Takashi Iwai Change-Id: I7073a17d8c99886d2f6ed7981892712ba7dd5873 Bug: 30952477 --- sound/usb/quirks.c | 4 ++++ sound/usb/stream.c | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index a85b1953d83a..79c1cccc606a 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -150,6 +150,7 @@ static int create_fixed_stream_quirk(struct snd_usb_audio *chip, usb_audio_err(chip, "cannot memdup\n"); return -ENOMEM; } + INIT_LIST_HEAD(&fp->list); if (fp->nr_rates > MAX_NR_RATES) { kfree(fp); return -EINVAL; @@ -188,6 +189,7 @@ static int create_fixed_stream_quirk(struct snd_usb_audio *chip, return 0; error: + list_del(&fp->list); /* unlink for avoiding double-free */ kfree(fp); kfree(rate_table); return err; @@ -463,6 +465,7 @@ static int create_uaxx_quirk(struct snd_usb_audio *chip, fp->ep_attr = get_endpoint(alts, 0)->bmAttributes; fp->datainterval = 0; fp->maxpacksize = le16_to_cpu(get_endpoint(alts, 0)->wMaxPacketSize); + INIT_LIST_HEAD(&fp->list); switch (fp->maxpacksize) { case 0x120: @@ -486,6 +489,7 @@ static int create_uaxx_quirk(struct snd_usb_audio *chip, ? SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK; err = snd_usb_add_audio_stream(chip, stream, fp); if (err < 0) { + list_del(&fp->list); /* unlink for avoiding double-free */ kfree(fp); return err; } diff --git a/sound/usb/stream.c b/sound/usb/stream.c index 8ee14f2365e7..3b23102230c0 100644 --- a/sound/usb/stream.c +++ b/sound/usb/stream.c @@ -316,7 +316,9 @@ static struct snd_pcm_chmap_elem *convert_chmap(int channels, unsigned int bits, /* * add this endpoint to the chip instance. * if a stream with the same endpoint already exists, append to it. - * if not, create a new pcm stream. + * if not, create a new pcm stream. note, fp is added to the substream + * fmt_list and will be freed on the chip instance release. do not free + * fp or do remove it from the substream fmt_list to avoid double-free. */ int snd_usb_add_audio_stream(struct snd_usb_audio *chip, int stream, @@ -677,6 +679,7 @@ int snd_usb_parse_audio_interface(struct snd_usb_audio *chip, int iface_no) * (fp->maxpacksize & 0x7ff); fp->attributes = parse_uac_endpoint_attributes(chip, alts, protocol, iface_no); fp->clock = clock; + INIT_LIST_HEAD(&fp->list); /* some quirks for attributes here */ @@ -725,6 +728,7 @@ int snd_usb_parse_audio_interface(struct snd_usb_audio *chip, int iface_no) dev_dbg(&dev->dev, "%u:%d: add audio endpoint %#x\n", iface_no, altno, fp->endpoint); err = snd_usb_add_audio_stream(chip, stream, fp); if (err < 0) { + list_del(&fp->list); /* unlink for avoiding double-free */ kfree(fp->rate_table); kfree(fp->chmap); kfree(fp); From 6c7e03dde14e5074134f47d9ec3dea7de24e4e58 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 29 Aug 2016 19:48:17 +0530 Subject: [PATCH 0310/3220] DEBUG: cpufreq: fix cpu_capacity tracing build for non-smp systems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cpu curr capacity can only be traced for SMP systems. 
Non-SMP builds will fail with: drivers/cpufreq/cpufreq.c: In function ‘cpufreq_freq_transition_begin’: drivers/cpufreq/cpufreq.c:438:22: error: implicit declaration of function ‘capacity_curr_of’ [-Werror=implicit-function-declaration] trace_cpu_capacity(capacity_curr_of(cpu), cpu); ^ Fixes: Change-Id: Icd0930d11068fcb7d2b6a9a48e7ed974904e1081 ("DEBUG: sched,cpufreq: add cpu_capacity change tracepoint") Signed-off-by: Amit Pundir [jstultz: Cherry-picked from common/android-3.18] Signed-off-by: John Stultz --- drivers/cpufreq/cpufreq.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 7264820e6443..7b728143440d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -29,7 +29,9 @@ #include #include #include +#ifdef CONFIG_SMP #include +#endif #include static LIST_HEAD(cpufreq_policy_list); @@ -474,7 +476,9 @@ static void cpufreq_notify_post_transition(struct cpufreq_policy *policy, void cpufreq_freq_transition_begin(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) { +#ifdef CONFIG_SMP int cpu; +#endif /* * Catch double invocations of _begin() which lead to self-deadlock. @@ -503,8 +507,10 @@ wait: spin_unlock(&policy->transition_lock); scale_freq_capacity(policy, freqs); +#ifdef CONFIG_SMP for_each_cpu(cpu, policy->cpus) trace_cpu_capacity(capacity_curr_of(cpu), cpu); +#endif cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); } From aeb4a3112e516ef93069acdc04b65e9ae5882d28 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Wed, 24 Aug 2016 11:52:17 +0530 Subject: [PATCH 0311/3220] sched/walt: use do_div instead of division operator Use do_div() instead of "/" operator to fix undefined references to "__aeabi_uldivmod" build error for ARCH=arm. Also in TP_fast_assign(), along with do_div() usage, replace "," with ";" which would have resulted in a syntax error (!), because '#define TP_fast_assign(args...) args' would have stripped off the "," and left white space between these two assignments after CPP phase. 
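For reference, a minimal sketch of the substitution (assuming, as do_div() requires, a 64-bit dividend and a 32-bit divisor):

    #include <asm/div64.h>

    u64 avg = demand << 10;
    /* 'avg = avg / walt_ravg_window;' would emit a call to
     * __aeabi_uldivmod on 32-bit ARM; do_div() divides in place
     * and returns the remainder instead */
    do_div(avg, walt_ravg_window);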
Signed-off-by: Amit Pundir
[jstultz: Cherry-picked from common/android-3.18]
Signed-off-by: John Stultz
---
 include/trace/events/sched.h | 3 ++-
 kernel/sched/sched.h         | 7 ++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index c50310a7fd6d..dffaffab4bc8 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -1057,7 +1057,8 @@ TRACE_EVENT(walt_update_history,
 		__entry->samples = samples;
 		__entry->evt = evt;
 		__entry->demand = p->ravg.demand;
-		__entry->walt_avg = (__entry->demand << 10) / walt_ravg_window,
+		__entry->walt_avg = (__entry->demand << 10);
+		do_div(__entry->walt_avg, walt_ravg_window);
 		__entry->pelt_avg = p->se.avg.util_avg;
 		memcpy(__entry->hist, p->ravg.sum_history,
 		       RAVG_HIST_SIZE_MAX * sizeof(u32));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4e2e44e3cc7b..06f15724a639 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1577,9 +1577,10 @@ static inline unsigned long __cpu_util(int cpu, int delta)
 	unsigned long capacity = capacity_orig_of(cpu);
 
 #ifdef CONFIG_SCHED_WALT
-	if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
-		util = (cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT) /
-			walt_ravg_window;
+	if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
+		util = cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT;
+		do_div(util, walt_ravg_window);
+	}
 #endif
 	delta += util;
 	if (delta < 0)

From ad607cd620b8caeebf3ed3336fa80921c588b27f Mon Sep 17 00:00:00 2001
From: Steve Muckle
Date: Wed, 4 May 2016 18:56:45 -0700
Subject: [PATCH 0312/3220] arm: Fix build error "conflicting types for 'scale_cpu_capacity'"

Commit "arm: Update arch_scale_cpu_capacity() to reflect change to
define" introduced a dependency on struct sched_domain in
arch/arm/include/asm/topology.h, but that structure is currently only
declared if CONFIG_CPU_FREQ is enabled, because only then is
include/linux/cpufreq.h, which declares it, pulled in.

Include <linux/cpufreq.h> regardless of CONFIG_CPU_FREQ so that
struct sched_domain is always declared.

Fixes: Change-Id: I372bd5e4c1e203428d72b18c8a806b06f3567ef6 ("arm: Update arch_scale_cpu_capacity() to reflect change to define")
Signed-off-by: Steve Muckle
Signed-off-by: Amit Pundir
[jstultz: Cherry-picked from android-3.18]
Signed-off-by: John Stultz
---
 arch/arm/include/asm/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index e3e596cbb1a7..d06064120694 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -3,6 +3,7 @@
 
 #ifdef CONFIG_ARM_CPU_TOPOLOGY
 
+#include <linux/cpufreq.h>
 #include <linux/cpumask.h>
 
 struct cputopo_arm {
@@ -25,7 +26,6 @@ void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);
 
 #ifdef CONFIG_CPU_FREQ
-#include <linux/cpufreq.h>
 #define arch_scale_freq_capacity cpufreq_scale_freq_capacity
 #endif
 #define arch_scale_cpu_capacity scale_cpu_capacity

From 23eeab46c8550123a5e66b7b11862118dbb05d69 Mon Sep 17 00:00:00 2001
From: Jon Medhurst
Date: Thu, 2 Jun 2016 12:18:08 +0000
Subject: [PATCH 0313/3220] arm: Fix #if/#ifdef typo in topology.c

Probably a typo in arch/arm/kernel/topology.c. This patch fixes the
warning:

  arch/arm/kernel/topology.c: In function 'scale_cpu_capacity':
  arch/arm/kernel/topology.c:47:5: warning: "CONFIG_CPU_FREQ" is not defined [-Wundef]

Fixes: Change-Id: If5e9e0ba8ff5a5d3236b373dbce8c72ea71b5e18 ("arm: Enable max freq invariant scheduler load-tracking and capacity support")
Signed-off-by: Jon Medhurst
Signed-off-by: Amit Pundir
[jstultz: Cherry-picked from android-3.18]
Signed-off-by: John Stultz
---
 arch/arm/kernel/topology.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index f5941004efba..4f2c51ef162d 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -44,7 +44,7 @@ static DEFINE_PER_CPU(unsigned long, cpu_scale);
 
 unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
-#if CONFIG_CPU_FREQ
+#ifdef CONFIG_CPU_FREQ
 	unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
 
 	return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;

From fb3cce0136514c65c2d1a9d2285a357cfb96d54c Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Wed, 24 Aug 2016 11:02:29 +0100
Subject: [PATCH 0314/3220] FIXUP: sched/tune: add fixes missing from a previous patch

The previous patch:

  e7ce26f - FIXUP: sched/tune: fix accounting for runnable tasks

squashed together patches of a series to fix SchedTune's accounting
issues. However, in the consolidation and cleanup of the series to merge
in the Android Common Kernel, we somehow missed a couple of important
changes:

1) the schedtune_exit function is no longer required, because e7ce26f
   fixes accounting of exiting tasks in a different way
2) the schedtune_initialized flag was not set at the end of
   schedtune_init_cgroups(), thus failing to enable SchedTune at boot.

This patch is thus to be considered an integration of e7ce26f.

Signed-off-by: Patrick Bellasi
[jstultz: Cherry-picked from android-3.18. It should be noted that some
of this patch was already applied in the 4.4 patches (schedtune_exit
doesn't exist for example), but this patch just ensures things are
totally synced up]
Signed-off-by: John Stultz
---
 kernel/sched/tune.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index bd7f319ce53e..505d7b35b0e1 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -715,7 +715,7 @@ schedtune_css_free(struct cgroup_subsys_state *css)
 struct cgroup_subsys schedtune_cgrp_subsys = {
 	.css_alloc	= schedtune_css_alloc,
 	.css_free	= schedtune_css_free,
-//	.allow_attach	= schedtune_allow_attach,
+	.allow_attach	= schedtune_allow_attach,
 	.can_attach	= schedtune_can_attach,
 	.cancel_attach	= schedtune_cancel_attach,
 	.legacy_cftypes	= files,
@@ -736,6 +736,8 @@ schedtune_init_cgroups(void)
 	pr_info("schedtune: configured to support %d boost groups\n",
 		BOOSTGROUPS_COUNT);
+
+	schedtune_initialized = true;
 }
 
 #else /* CONFIG_CGROUP_SCHEDTUNE */

From a85045c03470e88923ed98bb5917b289fa752030 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Wed, 24 Aug 2016 11:27:27 +0100
Subject: [PATCH 0315/3220] FIXUP: sched/tune: update accounting before CPU capacity

The SchedTune tasks accounting is used to identify how many tasks are
in a boostgroup and thus to bias the selection of an OPP based on the
maximum boost value of the active boostgroups.

The current implementation however updates the accounting after the CPU
capacity has been updated. This has two effects:
a) when we enqueue a boosted task, we do not immediately boost its CPU
b) when we dequeue a boosted task, we can keep a CPU boosted even if
   not required

This patch changes the order of the SchedTune accounting and SchedFreq
updates to ensure we always have an updated representation of which
boosted tasks are runnable on a CPU before updating its capacity.

Reported-by: Leo Yan
Signed-off-by: Patrick Bellasi
---
 kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c362be1df195..2cd3e2671f16 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4240,6 +4240,25 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+	/*
+	 * Update SchedTune accounting.
+	 *
+	 * We do it before updating the CPU capacity to ensure the
+	 * boost value of the current task is accounted for in the
+	 * selection of the OPP.
+	 *
+	 * We do it also in the case where we enqueue a throttled task;
+	 * we could argue that a throttled task should not boost a CPU,
+	 * however:
+	 * a) properly implementing CPU boosting considering throttled
+	 *    tasks will increase a lot the complexity of the solution
+	 * b) it's not easy to quantify the benefits introduced by
+	 *    such a more complex solution.
+	 * Thus, for the time being we go for the simple solution and boost
+	 * also for throttled RQs.
+	 */
+	schedtune_enqueue_task(p, cpu_of(rq));
+
 	if (!se) {
 		walt_inc_cumulative_runnable_avg(rq, p);
 		if (!task_new && !rq->rd->overutilized &&
@@ -4259,9 +4278,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			update_capacity_of(cpu_of(rq));
 	}
 
-	/* Update SchedTune accouting */
-	schedtune_enqueue_task(p, cpu_of(rq));
-
 #endif /* CONFIG_SMP */
 	hrtick_update(rq);
 }
@@ -4327,6 +4343,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+	/*
+	 * Update SchedTune accounting
+	 *
+	 * We do it before updating the CPU capacity to ensure the
+	 * boost value of the current task is accounted for in the
+	 * selection of the OPP.
+	 */
+	schedtune_dequeue_task(p, cpu_of(rq));
+
 	if (!se) {
 		walt_dec_cumulative_runnable_avg(rq, p);
 
@@ -4346,9 +4371,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		}
 	}
 
-	/* Update SchedTune accouting */
-	schedtune_dequeue_task(p, cpu_of(rq));
-
 #endif /* CONFIG_SMP */
 
 	hrtick_update(rq);

From 1510827f29ced43ac686c4625ce709e0dfb1292d Mon Sep 17 00:00:00 2001
From: Jungseung Lee
Date: Tue, 29 Dec 2015 04:47:00 +0800
Subject: [PATCH 0316/3220] UPSTREAM: ARM: 8494/1: mm: Enable PXN when running non-LPAE kernel on LPAE processor

The VMSA field of MMFR0 (bottom 4 bits) is incremented for each added
feature. PXN is supported if the value is >= 4 and LPAE is supported if
it is >= 5.

In case a kernel with CONFIG_ARM_LPAE disabled is used on a processor
that supports LPAE, we can still use PXN in short descriptors. So check
for >= 4 not == 4.

Signed-off-by: Jungseung Lee
Acked-by: Catalin Marinas
Signed-off-by: Ben Hutchings
Signed-off-by: Russell King
---
 arch/arm/mm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 4867f5daf82c..de9f8921e407 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -572,7 +572,7 @@ static void __init build_mem_type_table(void)
 	 * in the Short-descriptor translation table format descriptors.
*/ if (cpu_arch == CPU_ARCH_ARMv7 && - (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) == 4) { + (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) >= 4) { user_pmd_table |= PMD_PXNTABLE; } #endif From 702ea26d1f7d421a31831de98ea02e92c6782c20 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 19 Jul 2016 17:42:57 -0400 Subject: [PATCH 0317/3220] UPSTREAM: audit: fix a double fetch in audit_log_single_execve_arg() (cherry picked from commit 43761473c254b45883a64441dd0bc85a42f3645c) There is a double fetch problem in audit_log_single_execve_arg() where we first check the execve(2) argumnets for any "bad" characters which would require hex encoding and then re-fetch the arguments for logging in the audit record[1]. Of course this leaves a window of opportunity for an unsavory application to munge with the data. This patch reworks things by only fetching the argument data once[2] into a buffer where it is scanned and logged into the audit records(s). In addition to fixing the double fetch, this patch improves on the original code in a few other ways: better handling of large arguments which require encoding, stricter record length checking, and some performance improvements (completely unverified, but we got rid of some strlen() calls, that's got to be a good thing). As part of the development of this patch, I've also created a basic regression test for the audit-testsuite, the test can be tracked on GitHub at the following link: * https://github.com/linux-audit/audit-testsuite/issues/25 [1] If you pay careful attention, there is actually a triple fetch problem due to a strnlen_user() call at the top of the function. [2] This is a tiny white lie, we do make a call to strnlen_user() prior to fetching the argument data. I don't like it, but due to the way the audit record is structured we really have no choice unless we copy the entire argument at once (which would require a rather wasteful allocation). The good news is that with this patch the kernel no longer relies on this strnlen_user() value for anything beyond recording it in the log, we also update it with a trustworthy value whenever possible. Reported-by: Pengfei Wang Cc: Signed-off-by: Paul Moore Signed-off-by: Sasha Levin Change-Id: I10e979e94605e3cf8d461e3e521f8f9837228aa5 Bug: 30956807 --- kernel/auditsc.c | 332 +++++++++++++++++++++++------------------------ 1 file changed, 164 insertions(+), 168 deletions(-) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b86cc04959de..48f45987dc6c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include "audit.h" @@ -82,7 +83,8 @@ #define AUDITSC_SUCCESS 1 #define AUDITSC_FAILURE 2 -/* no execve audit message should be longer than this (userspace limits) */ +/* no execve audit message should be longer than this (userspace limits), + * see the note near the top of audit_log_execve_info() about this value */ #define MAX_EXECVE_AUDIT_LEN 7500 /* max length to print of cmdline/proctitle value during audit */ @@ -988,184 +990,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, return rc; } -/* - * to_send and len_sent accounting are very loose estimates. We aren't - * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being - * within about 500 bytes (next page boundary) - * - * why snprintf? an int is up to 12 digits long. if we just assumed when - * logging that a[%d]= was going to be 16 characters long we would be wasting - * space in every audit message. 
In one 7500 byte message we can log up to - * about 1000 min size arguments. That comes down to about 50% waste of space - * if we didn't do the snprintf to find out how long arg_num_len was. - */ -static int audit_log_single_execve_arg(struct audit_context *context, - struct audit_buffer **ab, - int arg_num, - size_t *len_sent, - const char __user *p, - char *buf) -{ - char arg_num_len_buf[12]; - const char __user *tmp_p = p; - /* how many digits are in arg_num? 5 is the length of ' a=""' */ - size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5; - size_t len, len_left, to_send; - size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; - unsigned int i, has_cntl = 0, too_long = 0; - int ret; - - /* strnlen_user includes the null we don't want to send */ - len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1; - - /* - * We just created this mm, if we can't find the strings - * we just copied into it something is _very_ wrong. Similar - * for strings that are too long, we should not have created - * any. - */ - if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) { - send_sig(SIGKILL, current, 0); - return -1; - } - - /* walk the whole argument looking for non-ascii chars */ - do { - if (len_left > MAX_EXECVE_AUDIT_LEN) - to_send = MAX_EXECVE_AUDIT_LEN; - else - to_send = len_left; - ret = copy_from_user(buf, tmp_p, to_send); - /* - * There is no reason for this copy to be short. We just - * copied them here, and the mm hasn't been exposed to user- - * space yet. - */ - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - has_cntl = audit_string_contains_control(buf, to_send); - if (has_cntl) { - /* - * hex messages get logged as 2 bytes, so we can only - * send half as much in each message - */ - max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2; - break; - } - len_left -= to_send; - tmp_p += to_send; - } while (len_left > 0); - - len_left = len; - - if (len > max_execve_audit_len) - too_long = 1; - - /* rewalk the argument actually logging the message */ - for (i = 0; len_left > 0; i++) { - int room_left; - - if (len_left > max_execve_audit_len) - to_send = max_execve_audit_len; - else - to_send = len_left; - - /* do we have space left to send this argument in this ab? */ - room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent; - if (has_cntl) - room_left -= (to_send * 2); - else - room_left -= to_send; - if (room_left < 0) { - *len_sent = 0; - audit_log_end(*ab); - *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); - if (!*ab) - return 0; - } - - /* - * first record needs to say how long the original string was - * so we can be sure nothing was lost. - */ - if ((i == 0) && (too_long)) - audit_log_format(*ab, " a%d_len=%zu", arg_num, - has_cntl ? 
2*len : len); - - /* - * normally arguments are small enough to fit and we already - * filled buf above when we checked for control characters - * so don't bother with another copy_from_user - */ - if (len >= max_execve_audit_len) - ret = copy_from_user(buf, p, to_send); - else - ret = 0; - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - - /* actually log it */ - audit_log_format(*ab, " a%d", arg_num); - if (too_long) - audit_log_format(*ab, "[%d]", i); - audit_log_format(*ab, "="); - if (has_cntl) - audit_log_n_hex(*ab, buf, to_send); - else - audit_log_string(*ab, buf); - - p += to_send; - len_left -= to_send; - *len_sent += arg_num_len; - if (has_cntl) - *len_sent += to_send * 2; - else - *len_sent += to_send; - } - /* include the null we didn't log */ - return len + 1; -} - static void audit_log_execve_info(struct audit_context *context, struct audit_buffer **ab) { - int i, len; - size_t len_sent = 0; - const char __user *p; + long len_max; + long len_rem; + long len_full; + long len_buf; + long len_abuf; + long len_tmp; + bool require_data; + bool encode; + unsigned int iter; + unsigned int arg; + char *buf_head; char *buf; + const char __user *p = (const char __user *)current->mm->arg_start; - p = (const char __user *)current->mm->arg_start; + /* NOTE: this buffer needs to be large enough to hold all the non-arg + * data we put in the audit record for this argument (see the + * code below) ... at this point in time 96 is plenty */ + char abuf[96]; - audit_log_format(*ab, "argc=%d", context->execve.argc); + /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the + * current value of 7500 is not as important as the fact that it + * is less than 8k, a setting of 7500 gives us plenty of wiggle + * room if we go over a little bit in the logging below */ + WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500); + len_max = MAX_EXECVE_AUDIT_LEN; - /* - * we need some kernel buffer to hold the userspace args. Just - * allocate one big one rather than allocating one of the right size - * for every single argument inside audit_log_single_execve_arg() - * should be <8k allocation so should be pretty safe. - */ - buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); - if (!buf) { + /* scratch buffer to hold the userspace args */ + buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); + if (!buf_head) { audit_panic("out of memory for argv string"); return; } + buf = buf_head; - for (i = 0; i < context->execve.argc; i++) { - len = audit_log_single_execve_arg(context, ab, i, - &len_sent, p, buf); - if (len <= 0) - break; - p += len; - } - kfree(buf); + audit_log_format(*ab, "argc=%d", context->execve.argc); + + len_rem = len_max; + len_buf = 0; + len_full = 0; + require_data = true; + encode = false; + iter = 0; + arg = 0; + do { + /* NOTE: we don't ever want to trust this value for anything + * serious, but the audit record format insists we + * provide an argument length for really long arguments, + * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but + * to use strncpy_from_user() to obtain this value for + * recording in the log, although we don't use it + * anywhere here to avoid a double-fetch problem */ + if (len_full == 0) + len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1; + + /* read more data from userspace */ + if (require_data) { + /* can we make more room in the buffer? 
*/ + if (buf != buf_head) { + memmove(buf_head, buf, len_buf); + buf = buf_head; + } + + /* fetch as much as we can of the argument */ + len_tmp = strncpy_from_user(&buf_head[len_buf], p, + len_max - len_buf); + if (len_tmp == -EFAULT) { + /* unable to copy from userspace */ + send_sig(SIGKILL, current, 0); + goto out; + } else if (len_tmp == (len_max - len_buf)) { + /* buffer is not large enough */ + require_data = true; + /* NOTE: if we are going to span multiple + * buffers force the encoding so we stand + * a chance at a sane len_full value and + * consistent record encoding */ + encode = true; + len_full = len_full * 2; + p += len_tmp; + } else { + require_data = false; + if (!encode) + encode = audit_string_contains_control( + buf, len_tmp); + /* try to use a trusted value for len_full */ + if (len_full < len_max) + len_full = (encode ? + len_tmp * 2 : len_tmp); + p += len_tmp + 1; + } + len_buf += len_tmp; + buf_head[len_buf] = '\0'; + + /* length of the buffer in the audit record? */ + len_abuf = (encode ? len_buf * 2 : len_buf + 2); + } + + /* write as much as we can to the audit log */ + if (len_buf > 0) { + /* NOTE: some magic numbers here - basically if we + * can't fit a reasonable amount of data into the + * existing audit buffer, flush it and start with + * a new buffer */ + if ((sizeof(abuf) + 8) > len_rem) { + len_rem = len_max; + audit_log_end(*ab); + *ab = audit_log_start(context, + GFP_KERNEL, AUDIT_EXECVE); + if (!*ab) + goto out; + } + + /* create the non-arg portion of the arg record */ + len_tmp = 0; + if (require_data || (iter > 0) || + ((len_abuf + sizeof(abuf)) > len_rem)) { + if (iter == 0) { + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d_len=%lu", + arg, len_full); + } + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d[%d]=", arg, iter++); + } else + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d=", arg); + WARN_ON(len_tmp >= sizeof(abuf)); + abuf[sizeof(abuf) - 1] = '\0'; + + /* log the arg in the audit record */ + audit_log_format(*ab, "%s", abuf); + len_rem -= len_tmp; + len_tmp = len_buf; + if (encode) { + if (len_abuf > len_rem) + len_tmp = len_rem / 2; /* encoding */ + audit_log_n_hex(*ab, buf, len_tmp); + len_rem -= len_tmp * 2; + len_abuf -= len_tmp * 2; + } else { + if (len_abuf > len_rem) + len_tmp = len_rem - 2; /* quotes */ + audit_log_n_string(*ab, buf, len_tmp); + len_rem -= len_tmp + 2; + /* don't subtract the "2" because we still need + * to add quotes to the remaining string */ + len_abuf -= len_tmp; + } + len_buf -= len_tmp; + buf += len_tmp; + } + + /* ready to move to the next argument? */ + if ((len_buf == 0) && !require_data) { + arg++; + iter = 0; + len_full = 0; + require_data = true; + encode = false; + } + } while (arg < context->execve.argc); + + /* NOTE: the caller handles the final audit_log_end() call */ + +out: + kfree(buf_head); } static void show_special(struct audit_context *context, int *call_panic) From 8a627afb7a4ffda3d6f87d9bcaa19b839d247f5d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 23 Feb 2016 14:58:52 -0800 Subject: [PATCH 0318/3220] UPSTREAM: x86: fix SMAP in 32-bit environments (cherry picked from commit de9e478b9d49f3a0214310d921450cf5bb4a21e6) In commit 11f1a4b9755f ("x86: reorganize SMAP handling in user space accesses") I changed how the stac/clac instructions were generated around the user space accesses, which then made it possible to do batched accesses efficiently for user string copies etc. 
However, in doing so, I completely spaced out, and didn't even think about the 32-bit case. And nobody really even seemed to notice, because SMAP doesn't even exist until modern Skylake processors, and you'd have to be crazy to run 32-bit kernels on a modern CPU. Which brings us to Andy Lutomirski. He actually tested the 32-bit kernel on new hardware, and noticed that it doesn't work. My bad. The trivial fix is to add the required uaccess begin/end markers around the raw accesses in . I feel a bit bad about this patch, just because that header file really should be cleaned up to avoid all the duplicated code in it, and this commit just expands on the problem. But this just fixes the bug without any bigger cleanup surgery. Reported-and-tested-by: Andy Lutomirski Signed-off-by: Linus Torvalds Change-Id: Ic044ebfe658a13179984111d062ca3a0b1404110 Signed-off-by: Amit Pundir --- arch/x86/include/asm/uaccess_32.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 2ffc9188cc70..c3724acd0fe4 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -49,20 +49,28 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __put_user_size(*(u8 *)from, (u8 __user *)to, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __put_user_size(*(u16 *)from, (u16 __user *)to, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __put_user_size(*(u32 *)from, (u32 __user *)to, 4, ret, 4); + __uaccess_end(); return ret; case 8: + __uaccess_begin(); __put_user_size(*(u64 *)from, (u64 __user *)to, 8, ret, 8); + __uaccess_end(); return ret; } } @@ -104,13 +112,19 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } @@ -150,13 +164,19 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } @@ -172,13 +192,19 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to, switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } From 5b432907fedfa6413bfe5971068853a786c4be16 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Nov 2015 13:26:19 +0000 Subject: [PATCH 0319/3220] UPSTREAM: arm64: mm: detect bad __create_mapping uses If a caller of __create_mapping provides a PA and VA which have different sub-page offsets, it is not clear which offset they expect to apply to the mapping, and is indicative of a bad caller. 
In some cases, the region we wish to map may validly have a sub-page offset in the physical and virtual addresses. For example, EFI runtime regions have 4K granularity, yet may be mapped by a 64K page kernel. So long as the physical and virtual offsets are the same, the region will be mapped at the expected VAs. Disallow calls with differing sub-page offsets, and WARN when they are encountered, so that we can detect and fix such cases. Cc: Laura Abbott Acked-by: Ard Biesheuvel Acked-by: Catalin Marinas Reviewed-by: Steve Capper Signed-off-by: Mark Rutland Signed-off-by: Will Deacon Bug: 30369029 Patchset: __create_mapping-fixes (cherry picked from commit cc5d2b3b95cdbb3fed4e38e667d17b9ac7250f7a) Signed-off-by: Jeff Vander Stoep Change-Id: I114a1265b10ff76daff385728d2125e618c313a1 --- arch/arm64/mm/mmu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9a8dc4a79d8b..52f03bb07429 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -251,6 +251,13 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, { unsigned long addr, length, end, next; + /* + * If the virtual and physical address don't have the same offset + * within a page, we cannot map the region as the caller expects. + */ + if (WARN_ON((phys ^ virt) & ~PAGE_MASK)) + return; + addr = virt & PAGE_MASK; length = PAGE_ALIGN(size + (virt & ~PAGE_MASK)); From ab8158d22085047097653a799fc9bae6e9920edf Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Nov 2015 13:26:20 +0000 Subject: [PATCH 0320/3220] UPSTREAM: arm64: mm: allow sections for unaligned bases Callees of __create_mapping may decide to create section mappings if sufficient low bits of the physical and virtual addresses they were passed are zero. While __create_mapping rounds the virtual base address down, it does not similarly round the physical base address down, and hence non-zero bits in the physical address can prevent use of a section mapping, even where a whole next-level table would be used instead. Round down the physical base address in __create_mapping to enable all callees to always create section mappings when such a mapping is possible. Cc: Laura Abbott Acked-by: Ard Biesheuvel Acked-by: Catalin Marinas Reviewed-by: Steve Capper Signed-off-by: Mark Rutland Signed-off-by: Will Deacon Bug: 30369029 Patchset: __create_mapping-fixes (cherry picked from commit 9c4e08a3022b6df90d31ef4007291faabfce5431) Signed-off-by: Jeff Vander Stoep Change-Id: Ic447350efdba3cd9f9e101c72183e04e39dd28d2 --- arch/arm64/mm/mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 52f03bb07429..4c381f0e77a7 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -258,6 +258,7 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, if (WARN_ON((phys ^ virt) & ~PAGE_MASK)) return; + phys &= PAGE_MASK; addr = virt & PAGE_MASK; length = PAGE_ALIGN(size + (virt & ~PAGE_MASK)); From 7152164d7b5fdfe68423ca1568fc227e92d3d6f7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Oct 2015 18:56:19 +0000 Subject: [PATCH 0321/3220] UPSTREAM: arm64: pgtable: implement pte_accessible() This patch implements the pte_accessible() macro, which can be used to test whether or not a given pte is a candidate for allocation in the TLB. 
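As a hedged sketch of the intended use, mirroring the generic ptep_clear_flush() pattern (simplified):

    pte = ptep_get_and_clear(mm, address, ptep);
    if (pte_accessible(mm, pte))
            /* only flush if the old pte may still sit in the TLB */
            flush_tlb_page(vma, address);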
Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit 76c714be0e5e60c935a53b31be58939510ba1d0f) Signed-off-by: Jeff Vander Stoep Change-Id: I249e2d15665870149dd17d1cdb3850008f5a56fd --- arch/arm64/include/asm/pgtable.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 63f52b55defe..824416a219e7 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -167,6 +167,16 @@ extern struct page *empty_zero_page; ((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) #define pte_valid_not_user(pte) \ ((pte_val(pte) & (PTE_VALID | PTE_USER)) == PTE_VALID) +#define pte_valid_young(pte) \ + ((pte_val(pte) & (PTE_VALID | PTE_AF)) == (PTE_VALID | PTE_AF)) + +/* + * Could the pte be present in the TLB? We must check mm_tlb_flush_pending + * so that we don't erroneously return false for pages that have been + * remapped as PROT_NONE but are yet to be flushed from the TLB. + */ +#define pte_accessible(mm, pte) \ + (mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid_young(pte)) static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot) { From 4277d7534ed7af7f103e20504dff3aead7799c99 Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Fri, 2 Sep 2016 09:37:24 +0100 Subject: [PATCH 0322/3220] UPSTREAM: brcmfmac: avoid potential stack overflow in brcmf_cfg80211_start_ap() commit ded89912156b1a47d940a0c954c43afbabd0c42c upstream User-space can choose to omit NL80211_ATTR_SSID and only provide raw IE TLV data. When doing so it can provide SSID IE with length exceeding the allowed size. The driver further processes this IE copying it into a local variable without checking the length. Hence stack can be corrupted and used as exploit. Cc: stable@vger.kernel.org # v4.4, v4.1 Reported-by: Daxing Guo Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c index deb5f78dcacc..786fff3d7466 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c @@ -4099,7 +4099,7 @@ brcmf_cfg80211_start_ap(struct wiphy *wiphy, struct net_device *ndev, (u8 *)&settings->beacon.head[ie_offset], settings->beacon.head_len - ie_offset, WLAN_EID_SSID); - if (!ssid_ie) + if (!ssid_ie || ssid_ie->len > IEEE80211_MAX_SSID_LEN) return -EINVAL; memcpy(ssid_le.SSID, ssid_ie->data, ssid_ie->len); From 1a8cf46bbfcc2eac1db8cfcc9c3b0596b9ee1d66 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Thu, 14 Apr 2016 17:01:17 +0200 Subject: [PATCH 0323/3220] UPSTREAM: usb: gadget: f_fs: Fix use-after-free (cherry picked from commit 38740a5b87d53ceb89eb2c970150f6e94e00373a) When using asynchronous read or write operations on the USB endpoints the issuer of the IO request is notified by calling the ki_complete() callback of the submitted kiocb when the URB has been completed. Calling this ki_complete() callback will free kiocb. Make sure that the structure is no longer accessed beyond that point, otherwise undefined behaviour might occur. 
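The general rule being applied, sketched briefly: a completion callback may free the object it is handed, so anything still needed afterwards has to be read out first.

    /* snapshot before completion: ki_complete() may free the kiocb */
    bool kiocb_has_eventfd = io_data->kiocb->ki_flags & IOCB_EVENTFD;

    io_data->kiocb->ki_complete(io_data->kiocb, ret, ret);
    /* io_data->kiocb must not be dereferenced past this point */

    if (io_data->ffs->ffs_eventfd && !kiocb_has_eventfd)
            eventfd_signal(io_data->ffs->ffs_eventfd, 1);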
Fixes: 2e4c7553cd6f ("usb: gadget: f_fs: add aio support") Cc: # v3.15+ Signed-off-by: Lars-Peter Clausen Signed-off-by: Felipe Balbi Change-Id: I3c7b643f6440c4fb6160a57c1058523030b46a6c Bug: 30950866 --- drivers/usb/gadget/function/f_fs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index cf43e9e18368..79d895c2dd71 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -646,6 +646,7 @@ static void ffs_user_copy_worker(struct work_struct *work) work); int ret = io_data->req->status ? io_data->req->status : io_data->req->actual; + bool kiocb_has_eventfd = io_data->kiocb->ki_flags & IOCB_EVENTFD; if (io_data->read && ret > 0) { use_mm(io_data->mm); @@ -657,13 +658,11 @@ static void ffs_user_copy_worker(struct work_struct *work) io_data->kiocb->ki_complete(io_data->kiocb, ret, ret); - if (io_data->ffs->ffs_eventfd && - !(io_data->kiocb->ki_flags & IOCB_EVENTFD)) + if (io_data->ffs->ffs_eventfd && !kiocb_has_eventfd) eventfd_signal(io_data->ffs->ffs_eventfd, 1); usb_ep_free_request(io_data->ep, io_data->req); - io_data->kiocb->private = NULL; if (io_data->read) kfree(io_data->to_free); kfree(io_data->buf); From 6b51f3d20c52982bd71028509ebb3a4a2389850b Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 20 Nov 2015 17:59:10 +0800 Subject: [PATCH 0324/3220] UPSTREAM: arm64: add __init/__initdata section marker to some functions/variables These functions/variables are not needed after booting, so mark them as __init or __initdata. Signed-off-by: Jisheng Zhang Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit a7c61a3452d39078919f0e1f493ff966fb64f0db) Signed-off-by: Jeff Vander Stoep Change-Id: I50a3362e186750e139d2440d2c1e1d49ace896e1 --- arch/arm64/kernel/armv8_deprecated.c | 6 +++--- arch/arm64/kernel/cpufeature.c | 9 +++++---- arch/arm64/kernel/fpsimd.c | 2 +- arch/arm64/mm/dma-mapping.c | 4 ++-- arch/arm64/mm/init.c | 6 +++--- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index 937f5e58a4d3..3e01207917b1 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -62,7 +62,7 @@ struct insn_emulation { }; static LIST_HEAD(insn_emulation); -static int nr_insn_emulated; +static int nr_insn_emulated __initdata; static DEFINE_RAW_SPINLOCK(insn_emulation_lock); static void register_emulation_hooks(struct insn_emulation_ops *ops) @@ -173,7 +173,7 @@ static int update_insn_emulation_mode(struct insn_emulation *insn, return ret; } -static void register_insn_emulation(struct insn_emulation_ops *ops) +static void __init register_insn_emulation(struct insn_emulation_ops *ops) { unsigned long flags; struct insn_emulation *insn; @@ -237,7 +237,7 @@ static struct ctl_table ctl_abi[] = { { } }; -static void register_insn_emulation_sysctl(struct ctl_table *table) +static void __init register_insn_emulation_sysctl(struct ctl_table *table) { unsigned long flags; int i = 0; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 0669c63281ea..5c90aa490a2b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -684,7 +684,7 @@ static const struct arm64_cpu_capabilities arm64_hwcaps[] = { {}, }; -static void cap_set_hwcap(const struct arm64_cpu_capabilities *cap) +static void __init cap_set_hwcap(const struct arm64_cpu_capabilities *cap) { switch (cap->hwcap_type) { case 
CAP_HWCAP: @@ -729,7 +729,7 @@ static bool __maybe_unused cpus_have_hwcap(const struct arm64_cpu_capabilities * return rc; } -static void setup_cpu_hwcaps(void) +static void __init setup_cpu_hwcaps(void) { int i; const struct arm64_cpu_capabilities *hwcaps = arm64_hwcaps; @@ -758,7 +758,8 @@ void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, * Run through the enabled capabilities and enable() it on all active * CPUs */ -static void enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) +static void __init +enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) { int i; @@ -897,7 +898,7 @@ static inline void set_sys_caps_initialised(void) #endif /* CONFIG_HOTPLUG_CPU */ -static void setup_feature_capabilities(void) +static void __init setup_feature_capabilities(void) { update_cpu_capabilities(arm64_features, "detected feature:"); enable_cpu_capabilities(arm64_features); diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 4c46c54a3ad7..acc1afd5c749 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -289,7 +289,7 @@ static struct notifier_block fpsimd_cpu_pm_notifier_block = { .notifier_call = fpsimd_cpu_pm_notifier, }; -static void fpsimd_pm_init(void) +static void __init fpsimd_pm_init(void) { cpu_pm_register_notifier(&fpsimd_cpu_pm_notifier_block); } diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 6ad554b01e95..7b71b7c47adb 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -40,7 +40,7 @@ static pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot, static struct gen_pool *atomic_pool; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; +static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; static int __init early_coherent_pool(char *p) { @@ -896,7 +896,7 @@ static int __iommu_attach_notifier(struct notifier_block *nb, return 0; } -static int register_iommu_dma_ops_notifier(struct bus_type *bus) +static int __init register_iommu_dma_ops_notifier(struct bus_type *bus) { struct notifier_block *nb = kzalloc(sizeof(*nb), GFP_KERNEL); int ret; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 325f89ff897e..d43715a14383 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -71,7 +71,7 @@ early_param("initrd", early_initrd); * currently assumes that for memory starting above 4G, 32-bit devices will * use a DMA offset. */ -static phys_addr_t max_zone_dma_phys(void) +static phys_addr_t __init max_zone_dma_phys(void) { phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, 32); return min(offset + (1ULL << 32), memblock_end_of_DRAM()); @@ -128,11 +128,11 @@ EXPORT_SYMBOL(pfn_valid); #endif #ifndef CONFIG_SPARSEMEM -static void arm64_memory_present(void) +static void __init arm64_memory_present(void) { } #else -static void arm64_memory_present(void) +static void __init arm64_memory_present(void) { struct memblock_region *reg; From f65f45205e5bae2464be051cb6deca598f62f3e8 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 2 Dec 2015 14:00:10 +0000 Subject: [PATCH 0325/3220] UPSTREAM: arm64: fix COMPAT_SHMLBA definition for large pages ARM glibc uses (4 * __getpagesize()) for SHMLBA, which is correct for 4KB pages and works fine for 64KB pages, but the kernel uses a hardcoded 16KB that is too small for 64KB page based kernels. This changes the definition to what user space sees when using 64KB pages. 
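To make the mismatch concrete (a worked illustration, not new behaviour): with 64KB kernel pages, user space computes SHMLBA as 4 * getpagesize() = 256KB, while the old hardcoded COMPAT_SHMLBA of 0x4000 (16KB) let the kernel pick shmat() addresses that user space does not consider SHMLBA-aligned. Defining it as (4 * PAGE_SIZE) restores agreement for every page size.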
Acked-by: Arnd Bergmann Signed-off-by: Yury Norov Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit b9b7aebb42d1b1392f3111de61136bb6cf3aae3f) Signed-off-by: Jeff Vander Stoep Change-Id: I698814d005a28c7fe3963cded9756f88660d4aa0 --- arch/arm64/include/asm/shmparam.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/shmparam.h b/arch/arm64/include/asm/shmparam.h index 4df608a8459e..e368a55ebd22 100644 --- a/arch/arm64/include/asm/shmparam.h +++ b/arch/arm64/include/asm/shmparam.h @@ -21,7 +21,7 @@ * alignment value. Since we don't have aliasing D-caches, the rest of * the time we can safely use PAGE_SIZE. */ -#define COMPAT_SHMLBA 0x4000 +#define COMPAT_SHMLBA (4 * PAGE_SIZE) #include From 9b49bf531ba02bcdbc6d4182a179a49dd30349d9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 23 Nov 2015 15:12:59 +0000 Subject: [PATCH 0326/3220] UPSTREAM: arm64: enable HAVE_IRQ_TIME_ACCOUNTING arm64 relies on the arm_arch_timer for sched_clock, so we can select HAVE_IRQ_TIME_ACCOUNTING and have the core sched-clock code enable the feature at runtime based on the rate. Reported-by: Mario Smarduch Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit 24da208db32ee1e4757ceaba898c47add8e5361e) Signed-off-by: Jeff Vander Stoep Change-Id: I849e010459dbbde0ac0d44d665dadcea9f8bf12d --- Documentation/features/time/irq-time-acct/arch-support.txt | 2 +- arch/arm64/Kconfig | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/features/time/irq-time-acct/arch-support.txt b/Documentation/features/time/irq-time-acct/arch-support.txt index e63316239938..4199ffecc0ff 100644 --- a/Documentation/features/time/irq-time-acct/arch-support.txt +++ b/Documentation/features/time/irq-time-acct/arch-support.txt @@ -9,7 +9,7 @@ | alpha: | .. | | arc: | TODO | | arm: | ok | - | arm64: | .. | + | arm64: | ok | | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3ad9d9ea374b..92f234a5579b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -73,6 +73,7 @@ config ARM64 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_GENERIC_DMA_COHERENT select HAVE_HW_BREAKPOINT if PERF_EVENTS + select HAVE_IRQ_TIME_ACCOUNTING select HAVE_MEMBLOCK select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS From 5bb08d2a552a3a7ba1e84527ea7987018fcc2f69 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 19 Nov 2015 17:48:31 +0000 Subject: [PATCH 0327/3220] UPSTREAM: arm64: spinlock: serialise spin_unlock_wait against concurrent lockers Boqun Feng reported a rather nasty ordering issue with spin_unlock_wait on architectures implementing spin_lock with LL/SC sequences and acquire semantics: | CPU 1 CPU 2 CPU 3 | ================== ==================== ============== | spin_unlock(&lock); | spin_lock(&lock): | r1 = *lock; // r1 == 0; | o = READ_ONCE(object); // reordered here | object = NULL; | smp_mb(); | spin_unlock_wait(&lock); | *lock = 1; | smp_mb(); | o->dead = true; | if (o) // true | BUG_ON(o->dead); // true!! The crux of the problem is that spin_unlock_wait(&lock) can return on CPU 1 whilst CPU 2 is in the process of taking the lock. 
This can be resolved by upgrading spin_unlock_wait to a LOCK operation, forcing it to serialise against a concurrent locker and giving it acquire semantics in the process (although it is not at all clear whether this is needed - different callers seem to assume different things about the barrier semantics and architectures are similarly disjoint in their implementations of the macro). This patch implements spin_unlock_wait using an LL/SC sequence with acquire semantics on arm64. For v8.1 systems with the LSE atomics, the exclusive writeback is omitted, since the spin_lock operation is indivisible and no intermediate state can be observed. Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit d86b8da04dfa4771a68bdbad6c424d40f22f0d14) Signed-off-by: Jeff Vander Stoep Change-Id: I27d88458c99dfd475d0326bd1d408ec3e575a7dd --- arch/arm64/include/asm/spinlock.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index c85e96d174a5..fc9682bfe002 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -26,9 +26,28 @@ * The memory barriers are implicit with the load-acquire and store-release * instructions. */ +static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) +{ + unsigned int tmp; + arch_spinlock_t lockval; -#define arch_spin_unlock_wait(lock) \ - do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0) + asm volatile( +" sevl\n" +"1: wfe\n" +"2: ldaxr %w0, %2\n" +" eor %w1, %w0, %w0, ror #16\n" +" cbnz %w1, 1b\n" + ARM64_LSE_ATOMIC_INSN( + /* LL/SC */ +" stxr %w1, %w0, %2\n" +" cbnz %w1, 2b\n", /* Serialise against any concurrent lockers */ + /* LSE atomics */ +" nop\n" +" nop\n") + : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock) + : + : "memory"); +} #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) From b42c14a531b4b887ffca279ad12eb844a99408ae Mon Sep 17 00:00:00 2001 From: Li Bin Date: Fri, 4 Dec 2015 11:38:39 +0800 Subject: [PATCH 0328/3220] UPSTREAM: arm64: ftrace: stop using kstop_machine to enable/disable tracing For ftrace on arm64, kstop_machine which is hugely disruptive to a running system is not needed to convert nops to ftrace calls or back, because that to be modified instrucions, that NOP, B or BL, are all safe instructions which called "concurrent modification and execution of instructions", that can be executed by one thread of execution as they are being modified by another thread of execution without requiring explicit synchronization. 
Signed-off-by: Li Bin Reviewed-by: Steven Rostedt Signed-off-by: Will Deacon Bug: 30369029 Patchset: arm64-ftrace (cherry picked from commit 81a6a146e88eca5d6726569779778d61489d85aa) Signed-off-by: Jeff Vander Stoep Change-Id: I54e2c0d49bd68f9547bd9f0da8b7520e02bb0714 --- arch/arm64/kernel/ftrace.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index c851be795080..9669b331a23b 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -93,6 +93,11 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, return ftrace_modify_code(pc, old, new, true); } +void arch_ftrace_update_code(int command) +{ + ftrace_modify_all_code(command); +} + int __init ftrace_dyn_arch_init(void) { return 0; From 2c21fce80b6fab7304d2081f2b1157033e30ad50 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Fri, 4 Dec 2015 11:38:40 +0800 Subject: [PATCH 0329/3220] UPSTREAM: arm64: ftrace: fix the comments for ftrace_modify_code There is no need to worry about module or __init text disappearing: ftrace has a module notifier that is called when a module is being unloaded, before its text goes away. That notifier grabs the ftrace_lock mutex and removes the module's functions from the ftrace list, so ftrace will no longer make any modifications to that module's text; the updates that make functions traced or not are done under the ftrace_lock mutex as well. And by now, __init section code is not modified by ftrace at all, because it is blacklisted in recordmcount.c and ignored by ftrace. Suggested-by: Steven Rostedt Signed-off-by: Li Bin Signed-off-by: Will Deacon Bug: 30369029 Patchset: arm64-ftrace (cherry picked from commit 004ab584e028093996cf5b8e220b8bc50c5111cf) Signed-off-by: Jeff Vander Stoep Change-Id: I69df7eddbf9e17031920b950312399dc4d36c09e --- arch/arm64/kernel/ftrace.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 9669b331a23b..8f7005bc35bd 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -29,12 +29,11 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new, /* * Note: - * Due to modules and __init, code can disappear and change, - * we need to protect against faulting as well as code changing. - * We do this by aarch64_insn_*() which use the probe_kernel_*(). - * - * No lock is held here because all the modifications are run - * through stop_machine(). + * We are paranoid about modifying text, as if a bug were to happen, it + * could cause us to read or write to someplace that could cause harm. + * Carefully read and modify the code with aarch64_insn_*() which uses + * probe_kernel_*(), and make sure what we read is what we expected it + * to be before modifying it.
*/ if (validate) { if (aarch64_insn_read((void *)pc, &replaced)) From 611a16d0e5fe5f9f77c62551d94770e2cc825eb5 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 4 Dec 2015 12:42:29 +0000 Subject: [PATCH 0330/3220] UPSTREAM: arm64: Add trace_hardirqs_off annotation in ret_to_user When a kernel is built with CONFIG_TRACE_IRQFLAGS the following warning is produced when entering userspace for the first time: WARNING: at /work/Linux/linux-2.6-aarch64/kernel/locking/lockdep.c:3519 Modules linked in: CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc3+ #639 Hardware name: Juno (DT) task: ffffffc9768a0000 ti: ffffffc9768a8000 task.ti: ffffffc9768a8000 PC is at check_flags.part.22+0x19c/0x1a8 LR is at check_flags.part.22+0x19c/0x1a8 pc : [] lr : [] pstate: 600001c5 sp : ffffffc9768abe10 x29: ffffffc9768abe10 x28: ffffffc9768a8000 x27: 0000000000000000 x26: 0000000000000001 x25: 00000000000000a6 x24: ffffffc00064be6c x23: ffffffc0009f249e x22: ffffffc9768a0000 x21: ffffffc97fea5480 x20: 00000000000001c0 x19: ffffffc00169a000 x18: 0000005558cc7b58 x17: 0000007fb78e3180 x16: 0000005558d2e238 x15: ffffffffffffffff x14: 0ffffffffffffffd x13: 0000000000000008 x12: 0101010101010101 x11: 7f7f7f7f7f7f7f7f x10: fefefefefefeff63 x9 : 7f7f7f7f7f7f7f7f x8 : 6e655f7371726964 x7 : 0000000000000001 x6 : ffffffc0001079c4 x5 : 0000000000000000 x4 : 0000000000000001 x3 : ffffffc001698438 x2 : 0000000000000000 x1 : ffffffc9768a0000 x0 : 000000000000002e Call trace: [] check_flags.part.22+0x19c/0x1a8 [] lock_is_held+0x80/0x98 [] __schedule+0x404/0x730 [] schedule+0x44/0xb8 [] ret_to_user+0x0/0x24 possible reason: unannotated irqs-off. irq event stamp: 502169 hardirqs last enabled at (502169): [] el0_irq_naked+0x1c/0x24 hardirqs last disabled at (502167): [] __do_softirq+0x17c/0x298 softirqs last enabled at (502168): [] __do_softirq+0x1fc/0x298 softirqs last disabled at (502143): [] irq_exit+0xa0/0xf0 This happens because we disable interrupts in ret_to_user before calling schedule() in work_resched. This patch adds the necessary trace_hardirqs_off annotation. Signed-off-by: Catalin Marinas Reported-by: Mark Rutland Cc: Will Deacon Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit db3899a6477a4dccd26cbfb7f408b6be2cc068e0) Signed-off-by: Jeff Vander Stoep Change-Id: I868c6c64c548f12156467959e2b8df09caae282f --- arch/arm64/kernel/entry.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 7ed3d75f6304..e5b25389c48f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -634,6 +634,9 @@ work_pending: bl do_notify_resume b ret_to_user work_resched: +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_off // the IRQs are off here, inform the tracing code +#endif bl schedule /* From b7547d55d7a578f62053c86f475eabd68282e6aa Mon Sep 17 00:00:00 2001 From: Jungseok Lee Date: Fri, 4 Dec 2015 11:02:25 +0000 Subject: [PATCH 0331/3220] UPSTREAM: arm64: Store struct thread_info in sp_el0 There is need for figuring out how to manage struct thread_info data when IRQ stack is introduced. struct thread_info information should be copied to IRQ stack under the current thread_info calculation logic whenever context switching is invoked. This is too expensive to keep supporting the approach. Instead, this patch pays attention to sp_el0 which is an unused scratch register in EL1 context. 
Using sp_el0 not only simplifies that management, it also avoids the large text-section growth a statically allocated IRQ stack would otherwise cause, since the masking operations using THREAD_SIZE can be removed in many places. Reviewed-by: Catalin Marinas Signed-off-by: Jungseok Lee Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 6cdf9c7ca687e01840d0215437620a20263012fc) Signed-off-by: Jeff Vander Stoep Change-Id: I53c9f44a0772b8649f302a65a7a6519d8eebcb91 --- arch/arm64/include/asm/thread_info.h | 10 ++++++++-- arch/arm64/kernel/entry.S | 15 ++++++++++++--- arch/arm64/kernel/head.S | 5 +++++ arch/arm64/kernel/sleep.S | 3 +++ 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 90c7ff233735..abd64bd1f6d9 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -73,10 +73,16 @@ register unsigned long current_stack_pointer asm ("sp"); */ static inline struct thread_info *current_thread_info(void) __attribute_const__; +/* + * struct thread_info can be accessed directly via sp_el0. + */ static inline struct thread_info *current_thread_info(void) { - return (struct thread_info *) - (current_stack_pointer & ~(THREAD_SIZE - 1)); + unsigned long sp_el0; + + asm ("mrs %0, sp_el0" : "=r" (sp_el0)); + + return (struct thread_info *)sp_el0; } #define thread_saved_pc(tsk) \ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index e5b25389c48f..245fa6837880 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -88,7 +88,8 @@ .if \el == 0 mrs x21, sp_el0 - get_thread_info tsk // Ensure MDSCR_EL1.SS is clear, + mov tsk, sp + and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear, ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug disable_step_tsk x19, x20 // exceptions when scheduling. .else @@ -107,6 +108,13 @@ str x21, [sp, #S_SYSCALLNO] .endif + /* + * Set sp_el0 to current thread_info.
+ */ + .if \el == 0 + msr sp_el0, tsk + .endif + /* * Registers that may be useful after this macro is invoked: * @@ -164,8 +172,7 @@ alternative_endif .endm .macro get_thread_info, rd - mov \rd, sp - and \rd, \rd, #~(THREAD_SIZE - 1) // top of stack + mrs \rd, sp_el0 .endm /* @@ -599,6 +606,8 @@ ENTRY(cpu_switch_to) ldp x29, x9, [x8], #16 ldr lr, [x8] mov sp, x9 + and x9, x9, #~(THREAD_SIZE - 1) + msr sp_el0, x9 ret ENDPROC(cpu_switch_to) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 23cfc08fc8ba..b363f340f2c7 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -424,6 +424,9 @@ __mmap_switched: b 1b 2: adr_l sp, initial_sp, x4 + mov x4, sp + and x4, x4, #~(THREAD_SIZE - 1) + msr sp_el0, x4 // Save thread_info str_l x21, __fdt_pointer, x5 // Save FDT pointer str_l x24, memstart_addr, x6 // Save PHYS_OFFSET mov x29, #0 @@ -606,6 +609,8 @@ ENDPROC(secondary_startup) ENTRY(__secondary_switched) ldr x0, [x21] // get secondary_data.stack mov sp, x0 + and x0, x0, #~(THREAD_SIZE - 1) + msr sp_el0, x0 // save thread_info mov x29, #0 b secondary_start_kernel ENDPROC(__secondary_switched) diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index f586f7c875e2..e33fe33876ab 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -173,6 +173,9 @@ ENTRY(cpu_resume) /* load physical address of identity map page table in x1 */ adrp x1, idmap_pg_dir mov sp, x2 + /* save thread_info */ + and x2, x2, #~(THREAD_SIZE - 1) + msr sp_el0, x2 /* * cpu_do_resume expects x0 to contain context physical address * pointer and x1 to contain physical address of 1:1 page tables From 52494a8bb8a274cb31bb241bb92bd2617855f9f8 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Fri, 4 Dec 2015 11:02:26 +0000 Subject: [PATCH 0332/3220] UPSTREAM: arm64: Modify stack trace and dump for use with irq_stack This patch allows unwind_frame() to traverse from interrupt stack to task stack correctly. It requires data from a dummy stack frame, created during irq_stack_entry(), added by a later patch. A similar approach is taken to modify dump_backtrace(), which expects to find struct pt_regs underneath any call to functions marked __exception. When on an irq_stack, the struct pt_regs is stored on the old task stack, the location of which is stored in the dummy stack frame. 
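[Editorial aside, not part of the patch: for readers new to AArch64 stack walking, a frame record is simply two words at *fp, the caller's fp and the return address. The toy program below models the walk that unwind_frame() performs, including the bounds-and-alignment check; the IRQ-to-task hand-off described above amounts to reloading the saved task stack pointer from the dummy frame when the walk reaches the top of the IRQ stack. Names, constants and the fake stack are illustrative, not the kernel's.]

    #include <stdint.h>
    #include <stdio.h>

    struct stackframe { uint64_t fp, sp, pc; };

    /* One unwind step: fail if fp is off-stack or unaligned, else load
     * the previous {fp, pc} pair from the frame record at *fp. */
    static int unwind_frame(struct stackframe *frame, uint64_t low, uint64_t high)
    {
        uint64_t fp = frame->fp;

        if (fp < low || fp > high - 0x18 || (fp & 0xf))
            return -1;

        frame->sp = fp + 0x10;
        frame->fp = ((uint64_t *)fp)[0];
        frame->pc = ((uint64_t *)fp)[1];
        return 0;
    }

    int main(void)
    {
        _Alignas(16) uint64_t stack[8] = { 0 };   /* fake task stack */

        stack[4] = (uint64_t)&stack[0];  /* frame B links to frame A */
        stack[5] = 0x1111;               /* B's return address */
        stack[0] = 0;                    /* frame A: end of the chain */
        stack[1] = 0x2222;

        struct stackframe f = { .fp = (uint64_t)&stack[4] };
        while (unwind_frame(&f, (uint64_t)stack, (uint64_t)(stack + 8)) == 0)
            printf("pc: %#llx\n", (unsigned long long)f.pc);
        return 0;
    }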
Reviewed-by: Catalin Marinas Signed-off-by: AKASHI Takahiro [james.morse: merged two patches, reworked for per_cpu irq_stacks, and no alignment guarantees, added irq_stack definitions] Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 132cd887b5c54758d04bf25c52fa48f45e843a30) Signed-off-by: Jeff Vander Stoep Change-Id: I60b29291620a71ab7b6564730299d29f41ceb199 --- arch/arm64/include/asm/irq.h | 32 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/irq.c | 3 +++ arch/arm64/kernel/stacktrace.c | 29 +++++++++++++++++++++++++++-- arch/arm64/kernel/traps.c | 14 +++++++++++++- 4 files changed, 75 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 8e8d30684392..e2f3f135a3bc 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -1,10 +1,32 @@ #ifndef __ASM_IRQ_H #define __ASM_IRQ_H +#define IRQ_STACK_SIZE THREAD_SIZE +#define IRQ_STACK_START_SP THREAD_START_SP + +#ifndef __ASSEMBLER__ + +#include + #include +#include struct pt_regs; +DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); + +/* + * The highest address on the stack, and the first to be used. Used to + * find the dummy-stack frame put down by el?_irq() in entry.S. + */ +#define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) + +/* + * The offset from irq_stack_ptr where entry.S will store the original + * stack pointer. Used by unwind_frame() and dump_backtrace(). + */ +#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x10)); + extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); static inline int nr_legacy_irqs(void) @@ -12,4 +34,14 @@ static inline int nr_legacy_irqs(void) return 0; } +static inline bool on_irq_stack(unsigned long sp, int cpu) +{ + /* variable names the same as kernel/stacktrace.c */ + unsigned long low = (unsigned long)per_cpu(irq_stack, cpu); + unsigned long high = low + IRQ_STACK_START_SP; + + return (low <= sp && sp <= high); +} + +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 9f17ec071ee0..1e3cef578e21 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -30,6 +30,9 @@ unsigned long irq_err_count; +/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned */ +DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); + int arch_show_interrupts(struct seq_file *p, int prec) { show_ipi_list(p, prec); diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index ccb6078ed9f2..b947eeffa5b2 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -20,6 +20,7 @@ #include #include +#include #include /* @@ -39,17 +40,41 @@ int notrace unwind_frame(struct stackframe *frame) { unsigned long high, low; unsigned long fp = frame->fp; + unsigned long irq_stack_ptr; + + /* + * Use raw_smp_processor_id() to avoid false-positives from + * CONFIG_DEBUG_PREEMPT. get_wchan() calls unwind_frame() on sleeping + * task stacks, we can be pre-empted in this case, so + * {raw_,}smp_processor_id() may give us the wrong value. Sleeping + * tasks can't ever be on an interrupt stack, so regardless of cpu, + * the checks will always fail. 
+ */ + irq_stack_ptr = IRQ_STACK_PTR(raw_smp_processor_id()); low = frame->sp; - high = ALIGN(low, THREAD_SIZE); + /* irq stacks are not THREAD_SIZE aligned */ + if (on_irq_stack(frame->sp, raw_smp_processor_id())) + high = irq_stack_ptr; + else + high = ALIGN(low, THREAD_SIZE) - 0x20; - if (fp < low || fp > high - 0x18 || fp & 0xf) + if (fp < low || fp > high || fp & 0xf) return -EINVAL; frame->sp = fp + 0x10; frame->fp = *(unsigned long *)(fp); frame->pc = *(unsigned long *)(fp + 8); + /* + * Check whether we are going to walk through from interrupt stack + * to task stack. + * If we reach the end of the stack - and its an interrupt stack, + * read the original task stack pointer from the dummy frame. + */ + if (frame->sp == irq_stack_ptr) + frame->sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + return 0; } diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index e9b9b5364393..8a0084541f84 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -146,6 +146,7 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; + unsigned long irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); @@ -180,9 +181,20 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) if (ret < 0) break; stack = frame.sp; - if (in_exception_text(where)) + if (in_exception_text(where)) { + /* + * If we switched to the irq_stack before calling this + * exception handler, then the pt_regs will be on the + * task stack. The easiest way to tell is if the large + * pt_regs would overlap with the end of the irq_stack. + */ + if (stack < irq_stack_ptr && + (stack + sizeof(struct pt_regs)) > irq_stack_ptr) + stack = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + dump_mem("", "Exception stack", stack, stack + sizeof(struct pt_regs), false); + } } } From d8c228e9513c852cd1c6388a8e011eec9120435c Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 4 Dec 2015 11:02:27 +0000 Subject: [PATCH 0333/3220] UPSTREAM: arm64: Add do_softirq_own_stack() and enable irq_stacks entry.S is modified to switch to the per_cpu irq_stack during el{0,1}_irq. irq_count is used to detect recursive interrupts on the irq_stack, it is updated late by do_softirq_own_stack(), when called on the irq_stack, before __do_softirq() re-enables interrupts to process softirqs. do_softirq_own_stack() is added by this patch, but does not yet switch stack. This patch adds the dummy stack frame and data needed by the previous stack tracing patches. 
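[Editorial aside, not part of the patch: the irq_count scheme is small enough to model in full. In the sketch below, word 0 of the IRQ stack doubles as irq_count, exactly as described: el1_irq() only switches stacks when it is zero, and do_softirq_own_stack() bumps it before __do_softirq() re-enables interrupts. This is an illustrative single-cpu userspace model, not the kernel code.]

    #include <stdio.h>

    #define IRQ_STACK_WORDS 1024
    static unsigned long irq_stack[IRQ_STACK_WORDS]; /* [0] is irq_count */
    static int on_irq_stack;                         /* models the sp test */

    static void __do_softirq(void) { puts("softirqs run with IRQs enabled"); }

    static void do_softirq_own_stack(void)
    {
        if (on_irq_stack) {
            irq_stack[0]++;          /* a nested el1_irq must not switch */
            __do_softirq();
            irq_stack[0]--;
        } else {
            __do_softirq();          /* already on the task stack */
        }
    }

    static void el1_irq(void)
    {
        if (irq_stack[0]) {          /* recursive use? stay on this stack */
            puts("nested irq: no stack switch");
            return;
        }
        on_irq_stack = 1;            /* "switch" to the irq stack */
        do_softirq_own_stack();      /* reached via irq_exit() in reality */
        on_irq_stack = 0;            /* "switch" back */
    }

    int main(void) { el1_irq(); return 0; }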
Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 8e23dacd12a48e58125b84c817da50850b73280a) Signed-off-by: Jeff Vander Stoep Change-Id: I9f79437af3da0398cb12e7afd1aa9f473f59b2e6 --- arch/arm64/include/asm/irq.h | 2 ++ arch/arm64/kernel/entry.S | 42 ++++++++++++++++++++++++++++++++++-- arch/arm64/kernel/irq.c | 38 +++++++++++++++++++++++++++++++- 3 files changed, 79 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index e2f3f135a3bc..fa2a8d0e4792 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -11,6 +11,8 @@ #include #include +#define __ARCH_HAS_DO_SOFTIRQ + struct pt_regs; DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 245fa6837880..8f7e737949fe 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -175,6 +176,42 @@ alternative_endif mrs \rd, sp_el0 .endm + .macro irq_stack_entry, dummy_lr + mov x19, sp // preserve the original sp + + adr_l x25, irq_stack + mrs x26, tpidr_el1 + add x25, x25, x26 + + /* + * Check the lowest address on irq_stack for the irq_count value, + * incremented by do_softirq_own_stack if we have re-enabled irqs + * while on the irq_stack. + */ + ldr x26, [x25] + cbnz x26, 9998f // recursive use? + + /* switch to the irq stack */ + mov x26, #IRQ_STACK_START_SP + add x26, x25, x26 + mov sp, x26 + + /* Add a dummy stack frame */ + stp x29, \dummy_lr, [sp, #-16]! // dummy stack frame + mov x29, sp + stp xzr, x19, [sp, #-16]! + +9998: + .endm + + /* + * x19 should be preserved between irq_stack_entry and + * irq_stack_exit. + */ + .macro irq_stack_exit + mov sp, x19 + .endm + /* * These are the registers used in the syscall handler, and allow us to * have in theory up to 7 arguments to a function - x0 to x6. @@ -190,10 +227,11 @@ tsk .req x28 // current thread_info * Interrupt handling. */ .macro irq_handler - adrp x1, handle_arch_irq - ldr x1, [x1, #:lo12:handle_arch_irq] + ldr_l x1, handle_arch_irq mov x0, sp + irq_stack_entry x22 blr x1 + irq_stack_exit .endm .text diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 1e3cef578e21..ff7ebb710e51 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -25,14 +25,24 @@ #include #include #include +#include #include #include unsigned long irq_err_count; -/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned */ +/* + * irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. + * irq_stack[0] is used as irq_count, a non-zero value indicates the stack + * is in use, and el?_irq() shouldn't switch to it. This is used to detect + * recursive use of the irq_stack, it is lazily updated by + * do_softirq_own_stack(), which is called on the irq_stack, before + * re-enabling interrupts to process softirqs. + */ DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); +#define IRQ_COUNT() (*per_cpu(irq_stack, smp_processor_id())) + int arch_show_interrupts(struct seq_file *p, int prec) { show_ipi_list(p, prec); @@ -56,3 +66,29 @@ void __init init_IRQ(void) if (!handle_arch_irq) panic("No interrupt controller found."); } + +/* + * do_softirq_own_stack() is called from irq_exit() before __do_softirq() + * re-enables interrupts, at which point we may re-enter el?_irq(). 
We + * increase irq_count here so that el1_irq() knows that it is already on the + * irq stack. + * + * Called with interrupts disabled, so we don't worry about moving cpu, or + * being interrupted while modifying irq_count. + * + * This function doesn't actually switch stack. + */ +void do_softirq_own_stack(void) +{ + int cpu = smp_processor_id(); + + WARN_ON_ONCE(!irqs_disabled()); + + if (on_irq_stack(current_stack_pointer, cpu)) { + IRQ_COUNT()++; + __do_softirq(); + IRQ_COUNT()--; + } else { + __do_softirq(); + } +} From b485dccd9eee43b0739198f53459c6a0088e93a5 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Dec 2015 13:58:42 +0000 Subject: [PATCH 0334/3220] UPSTREAM: arm64: irq: fix walking from irq stack to task stack Running with CONFIG_DEBUG_SPINLOCK=y can trigger a BUG with the new IRQ stack code: BUG: spinlock lockup suspected on CPU#1 This is due to the IRQ_STACK_TO_TASK_STACK macro incorrectly retrieving the task stack pointer stashed at the top of the IRQ stack. Sayeth James: | Yup, this is what is happening. Its an off-by-one due to broken | thinking about how the stack works. My broken thinking was: | | > top ------------ | > | dummy_lr | <- irq_stack_ptr | > ------------ | > | x29 | | > ------------ | > | x19 | <- irq_stack_ptr - 0x10 | > ------------ | > | xzr | | > ------------ | | But the stack-pointer is decreased before use. So it actually looks | like this: | | > ------------ | > | | <- irq_stack_ptr | > top ------------ | > | dummy_lr | | > ------------ | > | x29 | <- irq_stack_ptr - 0x10 | > ------------ | > | x19 | | > ------------ | > | xzr | <- irq_stack_ptr - 0x20 | > ------------ | | The value being used as the original stack is x29, which in all the | tests is sp but without the current frames data, hence there are no | missing frames in the output. | | Jungseok Lee picked it up with a 32bit user space because aarch32 | can't use x29, so it remains 0 forever. The fix he posted is correct. This patch fixes the macro and adds some of this wisdom to a comment, so that the layout of the IRQ stack is well understood. Cc: James Morse Reported-by: Jungseok Lee Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 7596abf2e5661d52c4f414f37addeed54e098880) Signed-off-by: Jeff Vander Stoep Change-Id: Ic65c0d0d90a30a5caf8b3791d1e856400bd2b5f5 --- arch/arm64/include/asm/irq.h | 20 ++++++++++++++++++-- arch/arm64/kernel/entry.S | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index fa2a8d0e4792..877c7e358384 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -19,7 +19,23 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); /* * The highest address on the stack, and the first to be used. Used to - * find the dummy-stack frame put down by el?_irq() in entry.S. + * find the dummy-stack frame put down by el?_irq() in entry.S, which + * is structured as follows: + * + * ------------ + * | | <- irq_stack_ptr + * top ------------ + * | elr_el1 | + * ------------ + * | x29 | <- irq_stack_ptr - 0x10 + * ------------ + * | xzr | + * ------------ + * | x19 | <- irq_stack_ptr - 0x20 + * ------------ + * + * where x19 holds a copy of the task stack pointer. 
+ * */ #define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) @@ -27,7 +43,7 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); * The offset from irq_stack_ptr where entry.S will store the original * stack pointer. Used by unwind_frame() and dump_backtrace(). */ -#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x10)); +#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x20)); extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8f7e737949fe..be7ec544b540 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -199,7 +199,7 @@ alternative_endif /* Add a dummy stack frame */ stp x29, \dummy_lr, [sp, #-16]! // dummy stack frame mov x29, sp - stp xzr, x19, [sp, #-16]! + stp x19, xzr, [sp, #-16]! 9998: .endm From 722e6114950d4dfdf890c045461932c91a4209d7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 10 Dec 2015 10:22:39 +0000 Subject: [PATCH 0335/3220] UPSTREAM: arm64: Add this_cpu_ptr() assembler macro for use in entry.S irq_stack is a per_cpu variable that needs to be accessed from entry.S. Use an assembler macro instead of the unreadable details. Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit aa4d5d3cbc258c355151a3903211b27359390ec5) Signed-off-by: Jeff Vander Stoep Change-Id: I2ccc11f442c0303c62e1c3e0a05a088958c922b8 --- arch/arm64/include/asm/assembler.h | 11 +++++++++++ arch/arm64/kernel/entry.S | 4 +--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 12eff928ef8b..bb7b72734c24 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -193,6 +193,17 @@ lr .req x30 // link register str \src, [\tmp, :lo12:\sym] .endm + /* + * @sym: The name of the per-cpu variable + * @reg: Result of per_cpu(sym, smp_processor_id()) + * @tmp: scratch register + */ + .macro this_cpu_ptr, sym, reg, tmp + adr_l \reg, \sym + mrs \tmp, tpidr_el1 + add \reg, \reg, \tmp + .endm + /* * Annotate a function as position independent, i.e., safe to be called before * the kernel virtual mapping is activated. diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index be7ec544b540..e394f8c9595a 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -179,9 +179,7 @@ alternative_endif .macro irq_stack_entry, dummy_lr mov x19, sp // preserve the original sp - adr_l x25, irq_stack - mrs x26, tpidr_el1 - add x25, x25, x26 + this_cpu_ptr irq_stack, x25, x26 /* * Check the lowest address on irq_stack for the irq_count value, From 4ba051d5a5476288cb27912c90de10aaf7c7f151 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 10 Dec 2015 10:22:40 +0000 Subject: [PATCH 0336/3220] UPSTREAM: arm64: when walking onto the task stack, check sp & fp are in current->stack When unwind_frame() reaches the bottom of the irq_stack, the last fp points to the original task stack. unwind_frame() uses IRQ_STACK_TO_TASK_STACK() to find the sp value. If either value is wrong, we may end up walking a corrupt stack. Check these values are sane by testing if they are both on the stack pointed to by current->stack.
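[Editorial aside, not part of the patch: object_is_on_stack(), which the hunk below relies on, boils down to a range test against the task's stack allocation; the in-kernel helper tests against task_stack_page(current). A simplified model, assuming the 16K arm64 THREAD_SIZE that applied at the time:]

    #include <stdbool.h>

    #define THREAD_SIZE 16384  /* assumption: arm64 default of the era */

    /* true if p lies within the THREAD_SIZE region at stack_base */
    static bool object_is_on_stack_model(const void *stack_base, const void *p)
    {
        return p >= stack_base &&
               (const char *)p < (const char *)stack_base + THREAD_SIZE;
    }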
Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 1ffe199b1c9b72a8e752a9ae2a7af10128ab2ca1) Signed-off-by: Jeff Vander Stoep Change-Id: I2e5bf1ce899a1018f1c5b8ccb4f7c816d61bba21 --- arch/arm64/kernel/stacktrace.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index b947eeffa5b2..d916d5b6aef6 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -71,9 +71,17 @@ int notrace unwind_frame(struct stackframe *frame) * to task stack. * If we reach the end of the stack - and its an interrupt stack, * read the original task stack pointer from the dummy frame. + * + * Check the frame->fp we read from the bottom of the irq_stack, + * and the original task stack pointer are both in current->stack. */ - if (frame->sp == irq_stack_ptr) - frame->sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + if (frame->sp == irq_stack_ptr) { + unsigned long orig_sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + + if(object_is_on_stack((void *)orig_sp) && + object_is_on_stack((void *)frame->fp)) + frame->sp = orig_sp; + } return 0; } From 6674ba575b07febb6f025811e6afb64c15834b30 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 10 Dec 2015 10:22:41 +0000 Subject: [PATCH 0337/3220] UPSTREAM: arm64: don't call C code with el0's fp register On entry from el0, we save all the registers on the kernel stack, and restore them before returning. x29 remains unchanged when we call out to C code, which will store x29 as the frame-pointer on the stack. Instead, write 0 into x29 after entry from el0, to avoid any risk of tracing into user space. Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 49003a8d6b35e128ef5e51433e60e783a46fbe5f) Signed-off-by: Jeff Vander Stoep Change-Id: Ifae7003018e4088d5de038cef25fa210211a75b6 --- arch/arm64/kernel/entry.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index e394f8c9595a..2284c296e3f7 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -93,6 +93,8 @@ and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear, ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug disable_step_tsk x19, x20 // exceptions when scheduling. + + mov x29, xzr // fp pointed to user-space .else add x21, sp, #S_FRAME_SIZE .endif From eb87372aa93bc9849fe05a86eb0994e2bc755c45 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 23 Jun 2016 18:42:51 -0700 Subject: [PATCH 0338/3220] net: diag: Add support to filter on device index Add support to inet_diag facility to filter sockets based on device index. If an interface index is in the filter only sockets bound to that index (sk_bound_dev_if) are returned. [cherry-pick of net-next 637c841dd7a5f9bd97b75cbe90b526fa1a52e530] Change-Id: I6b6bcdcf15d3142003f1ee53b4d82f2fabbb8250 Signed-off-by: David Ahern Signed-off-by: David S. 
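[Editorial aside, not part of the patch: tying this to the toy walker sketched earlier, a zero frame pointer is below any plausible stack bound, so the very first unwind step fails and the backtrace ends cleanly instead of dereferencing whatever a user-controlled x29 pointed at. An illustrative fragment, reusing struct stackframe and unwind_frame() from that sketch with made-up bounds:]

    /* After the equivalent of "mov x29, xzr" on entry from EL0: */
    struct stackframe f = { .fp = 0 };
    int stopped = (unwind_frame(&f, 0x1000, 0x2000) != 0);  /* always true */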
Miller --- include/uapi/linux/inet_diag.h | 1 + net/ipv4/inet_diag.c | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 68a1f71fde9f..b49bfb6ea22a 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -72,6 +72,7 @@ enum { INET_DIAG_BC_AUTO, INET_DIAG_BC_S_COND, INET_DIAG_BC_D_COND, + INET_DIAG_BC_DEV_COND, /* u32 ifindex */ }; struct inet_diag_hostcond { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index a403a676d452..e532671c3f4f 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -44,6 +44,7 @@ struct inet_diag_entry { u16 dport; u16 family; u16 userlocks; + u32 ifindex; }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -552,6 +553,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc, yes = 0; break; } + case INET_DIAG_BC_DEV_COND: { + u32 ifindex; + + ifindex = *((const u32 *)(op + 1)); + if (ifindex != entry->ifindex) + yes = 0; + break; + } } if (yes) { @@ -594,6 +603,7 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry_fill_addrs(&entry, sk); entry.sport = inet->inet_num; entry.dport = ntohs(inet->inet_dport); + entry.ifindex = sk->sk_bound_dev_if; entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; return inet_diag_bc_run(bc, &entry); @@ -617,6 +627,17 @@ static int valid_cc(const void *bc, int len, int cc) return 0; } +/* data is u32 ifindex */ +static bool valid_devcond(const struct inet_diag_bc_op *op, int len, + int *min_len) +{ + /* Check ifindex space. */ + *min_len += sizeof(u32); + if (len < *min_len) + return false; + + return true; +} /* Validate an inet_diag_hostcond. */ static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, int *min_len) @@ -681,6 +702,10 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) if (!valid_hostcond(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_DEV_COND: + if (!valid_devcond(bc, len, &min_len)) + return -EINVAL; + break; case INET_DIAG_BC_S_GE: case INET_DIAG_BC_S_LE: case INET_DIAG_BC_D_GE: From 5bd0c83ba48ce73eb89bd4308b07a263a07dd21e Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 24 Aug 2016 15:46:25 +0900 Subject: [PATCH 0339/3220] net: diag: slightly refactor the inet_diag_bc_audit error checks. This simplifies the code a bit and also allows inet_diag_bc_audit to send to userspace an error that isn't EINVAL. [cherry-pick of net-next 627cc4add53c0470bfd118002669205d222d3a54] Change-Id: Iee3d2bbb19f3110d71f0698ffb293f9bdffc8ef1 Signed-off-by: Lorenzo Colitti Acked-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv4/inet_diag.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index e532671c3f4f..75f6d4bde273 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -687,10 +687,16 @@ static bool valid_port_comparison(const struct inet_diag_bc_op *op, return true; } -static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) +static int inet_diag_bc_audit(const struct nlattr *attr) { - const void *bc = bytecode; - int len = bytecode_len; + const void *bytecode, *bc; + int bytecode_len, len; + + if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op)) + return -EINVAL; + + bytecode = bc = nla_data(attr); + len = bytecode_len = nla_len(attr); while (len > 0) { int min_len = sizeof(struct inet_diag_bc_op); @@ -1001,13 +1007,13 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlh->nlmsg_flags & NLM_F_DUMP) { if (nlmsg_attrlen(nlh, hdrlen)) { struct nlattr *attr; + int err; attr = nlmsg_find_attr(nlh, hdrlen, INET_DIAG_REQ_BYTECODE); - if (!attr || - nla_len(attr) < sizeof(struct inet_diag_bc_op) || - inet_diag_bc_audit(nla_data(attr), nla_len(attr))) - return -EINVAL; + err = inet_diag_bc_audit(attr); + if (err) + return err; } { struct netlink_dump_control c = { @@ -1032,13 +1038,13 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) h->nlmsg_flags & NLM_F_DUMP) { if (nlmsg_attrlen(h, hdrlen)) { struct nlattr *attr; + int err; attr = nlmsg_find_attr(h, hdrlen, INET_DIAG_REQ_BYTECODE); - if (!attr || - nla_len(attr) < sizeof(struct inet_diag_bc_op) || - inet_diag_bc_audit(nla_data(attr), nla_len(attr))) - return -EINVAL; + err = inet_diag_bc_audit(attr); + if (err) + return err; } { struct netlink_dump_control c = { From ecd02bbc43641454b940fcf9c16b39ab8179af67 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 24 Aug 2016 15:46:26 +0900 Subject: [PATCH 0340/3220] net: diag: allow socket bytecode filters to match socket marks This allows a privileged process to filter by socket mark when dumping sockets via INET_DIAG_BY_FAMILY. This is useful on systems that use mark-based routing such as Android. The ability to filter socket marks requires CAP_NET_ADMIN, which is consistent with other privileged operations allowed by the SOCK_DIAG interface such as the ability to destroy sockets and the ability to inspect BPF filters attached to packet sockets. [cherry-pick of a52e95abf772b43c9226e9a72d3c1353903ba96f] Change-Id: I8b90b814264d9808bda050cdba8f104943bdb9a8 Tested: https://android-review.googlesource.com/261350 Signed-off-by: Lorenzo Colitti Acked-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 6 ++++++ net/ipv4/inet_diag.c | 36 +++++++++++++++++++++++++++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index b49bfb6ea22a..35435c348c60 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -73,6 +73,7 @@ enum { INET_DIAG_BC_S_COND, INET_DIAG_BC_D_COND, INET_DIAG_BC_DEV_COND, /* u32 ifindex */ + INET_DIAG_BC_MARK_COND, }; struct inet_diag_hostcond { @@ -82,6 +83,11 @@ struct inet_diag_hostcond { __be32 addr[0]; }; +struct inet_diag_markcond { + __u32 mark; + __u32 mask; +}; + /* Base info structure. It contains socket identity (addrs/ports/cookie) * and, alas, the information shown by netstat. 
*/ struct inet_diag_msg { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 75f6d4bde273..5202149cbce0 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -45,6 +45,7 @@ struct inet_diag_entry { u16 family; u16 userlocks; u32 ifindex; + u32 mark; }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -561,6 +562,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc, yes = 0; break; } + case INET_DIAG_BC_MARK_COND: { + struct inet_diag_markcond *cond; + + cond = (struct inet_diag_markcond *)(op + 1); + if ((entry->mark & cond->mask) != cond->mark) + yes = 0; + break; + } } if (yes) { @@ -605,6 +614,12 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.dport = ntohs(inet->inet_dport); entry.ifindex = sk->sk_bound_dev_if; entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; + if (sk_fullsock(sk)) + entry.mark = sk->sk_mark; + else if (sk->sk_state == TCP_NEW_SYN_RECV) + entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; + else + entry.mark = 0; return inet_diag_bc_run(bc, &entry); } @@ -687,8 +702,17 @@ static bool valid_port_comparison(const struct inet_diag_bc_op *op, return true; } -static int inet_diag_bc_audit(const struct nlattr *attr) +static bool valid_markcond(const struct inet_diag_bc_op *op, int len, + int *min_len) { + *min_len += sizeof(struct inet_diag_markcond); + return len >= *min_len; +} + +static int inet_diag_bc_audit(const struct nlattr *attr, + const struct sk_buff *skb) +{ + bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN); const void *bytecode, *bc; int bytecode_len, len; @@ -719,6 +743,12 @@ static int inet_diag_bc_audit(const struct nlattr *attr) if (!valid_port_comparison(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_MARK_COND: + if (!net_admin) + return -EPERM; + if (!valid_markcond(bc, len, &min_len)) + return -EINVAL; + break; case INET_DIAG_BC_AUTO: case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: @@ -1011,7 +1041,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) attr = nlmsg_find_attr(nlh, hdrlen, INET_DIAG_REQ_BYTECODE); - err = inet_diag_bc_audit(attr); + err = inet_diag_bc_audit(attr, skb); if (err) return err; } @@ -1042,7 +1072,7 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) attr = nlmsg_find_attr(h, hdrlen, INET_DIAG_REQ_BYTECODE); - err = inet_diag_bc_audit(attr); + err = inet_diag_bc_audit(attr, skb); if (err) return err; } From 2f500d61a4df5764d94abfab7cbc43e4e9b92881 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 23 Aug 2016 21:06:33 -0700 Subject: [PATCH 0341/3220] net: diag: support SOCK_DESTROY for UDP sockets This implements SOCK_DESTROY for UDP sockets similar to what was done for TCP with commit c1e64e298b8ca ("net: diag: Support destroying TCP sockets.") A process with a UDP socket targeted for destroy is awakened and recvmsg fails with ECONNABORTED. [cherry-pick of 5d77dca82839ef016a93ad7acd7058b14d967752] Change-Id: I4b4862548e6e3c05dde27781e7daa0b18b93bd81 Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/udp.h | 1 + net/ipv4/udp.c | 15 +++++++++ net/ipv4/udp_diag.c | 79 +++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/udp.c | 1 + 4 files changed, 96 insertions(+) diff --git a/include/net/udp.h b/include/net/udp.h index 6d4ed18e1427..e57f50258cda 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -238,6 +238,7 @@ int udp_get_port(struct sock *sk, unsigned short snum, int (*saddr_cmp)(const struct sock *, const struct sock *)); void udp_err(struct sk_buff *, u32); +int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int udp_push_pending_frames(struct sock *sk); void udp_flush_pending_frames(struct sock *sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d8946929813e..660933edd2d2 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2257,6 +2257,20 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) } EXPORT_SYMBOL(udp_poll); +int udp_abort(struct sock *sk, int err) +{ + lock_sock(sk); + + sk->sk_err = err; + sk->sk_error_report(sk); + udp_disconnect(sk, 0); + + release_sock(sk); + + return 0; +} +EXPORT_SYMBOL_GPL(udp_abort); + struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, @@ -2288,6 +2302,7 @@ struct proto udp_prot = { .compat_getsockopt = compat_udp_getsockopt, #endif .clear_sk = sk_prot_clear_portaddr_nulls, + .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 6116604bf6e8..8b9570c84fd5 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -165,12 +165,88 @@ static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, r->idiag_wqueue = sk_wmem_alloc_get(sk); } +#ifdef CONFIG_INET_DIAG_DESTROY +static int __udp_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req, + struct udp_table *tbl) +{ + struct net *net = sock_net(in_skb->sk); + struct sock *sk; + int err; + + rcu_read_lock(); + + if (req->sdiag_family == AF_INET) + sk = __udp4_lib_lookup(net, + req->id.idiag_dst[0], req->id.idiag_dport, + req->id.idiag_src[0], req->id.idiag_sport, + req->id.idiag_if, tbl); +#if IS_ENABLED(CONFIG_IPV6) + else if (req->sdiag_family == AF_INET6) { + if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && + ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) + sk = __udp4_lib_lookup(net, + req->id.idiag_dst[0], req->id.idiag_dport, + req->id.idiag_src[0], req->id.idiag_sport, + req->id.idiag_if, tbl); + + else + sk = __udp6_lib_lookup(net, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if, tbl); + } +#endif + else { + rcu_read_unlock(); + return -EINVAL; + } + + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + + rcu_read_unlock(); + + if (!sk) + return -ENOENT; + + if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { + sock_put(sk); + return -ENOENT; + } + + err = sock_diag_destroy(sk, ECONNABORTED); + + sock_put(sk); + + return err; +} + +static int udp_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req) +{ + return __udp_diag_destroy(in_skb, req, &udp_table); +} + +static int udplite_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req) +{ + return __udp_diag_destroy(in_skb, req, &udplite_table); +} + +#endif + static const struct inet_diag_handler udp_diag_handler = { .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, 
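/*
 * [Editorial aside, not part of the patch: from userspace, the new
 * destroy path is driven over a NETLINK_SOCK_DIAG socket using
 * nlmsg_type SOCK_DESTROY with an inet_diag_req_v2 payload that
 * identifies the target socket. A hedged sketch, error handling and
 * address fields omitted:
 *
 *   struct { struct nlmsghdr nlh; struct inet_diag_req_v2 req; } m = {0};
 *   m.nlh.nlmsg_len      = sizeof(m);
 *   m.nlh.nlmsg_type     = SOCK_DESTROY;
 *   m.nlh.nlmsg_flags    = NLM_F_REQUEST | NLM_F_ACK;
 *   m.req.sdiag_family   = AF_INET;
 *   m.req.sdiag_protocol = IPPROTO_UDP;
 *   // fill m.req.id with the socket's addresses and ports
 *   send(nl_fd, &m, sizeof(m), 0);
 *
 * The victim's blocked recvmsg() then fails with ECONNABORTED, as the
 * commit message above describes.]
 */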
.idiag_info_size = 0, +#ifdef CONFIG_INET_DIAG_DESTROY + .destroy = udp_diag_destroy, +#endif }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, @@ -192,6 +268,9 @@ static const struct inet_diag_handler udplite_diag_handler = { .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, .idiag_info_size = 0, +#ifdef CONFIG_INET_DIAG_DESTROY + .destroy = udplite_diag_destroy, +#endif }; static int __init udp_diag_init(void) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index a1b6adc20e1e..54235ef177bb 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1552,6 +1552,7 @@ struct proto udpv6_prot = { .compat_getsockopt = compat_udpv6_getsockopt, #endif .clear_sk = udp_v6_clear_sk, + .diag_destroy = udp_abort, }; static struct inet_protosw udpv6_protosw = { From 513e28e1983a31c52f30b4ca1b1e69e4c25b71fe Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 7 Sep 2016 13:38:35 +0900 Subject: [PATCH 0342/3220] net: diag: make udp_diag_destroy work for mapped addresses. udp_diag_destroy does look up the IPv4 UDP hashtable for mapped addresses, but it gets the IPv4 address to look up from the beginning of the IPv6 address instead of the end. [cherry-pick of net-next f95bf346226b9b79352e05508beececc807cc37a] Change-Id: Ia369482c4645bcade320b2c33a763f1ce4378ff1 Tested: https://android-review.googlesource.com/269874 Fixes: 5d77dca82839 ("net: diag: support SOCK_DESTROY for UDP sockets") Signed-off-by: Lorenzo Colitti Acked-by: Eric Dumazet Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/udp_diag.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 8b9570c84fd5..39e0b8347bd2 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -186,8 +186,8 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) sk = __udp4_lib_lookup(net, - req->id.idiag_dst[0], req->id.idiag_dport, - req->id.idiag_src[0], req->id.idiag_sport, + req->id.idiag_dst[3], req->id.idiag_dport, + req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if, tbl); else From 662afd95b354bcf78a301838f345c59d9e164ad4 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Thu, 8 Sep 2016 00:42:25 +0900 Subject: [PATCH 0343/3220] net: inet: diag: expose the socket mark to privileged processes. This adds the capability for a process that has CAP_NET_ADMIN on a socket to see the socket mark in socket dumps. Commit a52e95abf772 ("net: diag: allow socket bytecode filters to match socket marks") recently gave privileged processes the ability to filter socket dumps based on mark. This patch is complementary: it ensures that the mark is also passed to userspace in the socket's netlink attributes. It is useful for tools like ss which display information about sockets. [backport of net-next d545caca827b65aab557a9e9dcdcf1e5a3823c2d] Change-Id: I33336ed9c3ee3fb78fe05c4c47b7fd18c6e33ef1 Tested: https://android-review.googlesource.com/270210 Signed-off-by: Lorenzo Colitti Signed-off-by: David S. 
Miller --- include/linux/inet_diag.h | 2 +- include/uapi/linux/inet_diag.h | 6 ++++- net/ipv4/inet_diag.c | 44 +++++++++++++++++++++++----------- net/ipv4/udp_diag.c | 10 ++++---- 4 files changed, 42 insertions(+), 20 deletions(-) diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 7c27fa1030e8..795852dc3434 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -37,7 +37,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 pid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh); + const struct nlmsghdr *unlh, bool net_admin); void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 35435c348c60..c7f189bd5979 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -120,9 +120,13 @@ enum { INET_DIAG_DCTCPINFO, INET_DIAG_PROTOCOL, /* response attribute only */ INET_DIAG_SKV6ONLY, + INET_DIAG_LOCALS, + INET_DIAG_PEERS, + INET_DIAG_PAD, + INET_DIAG_MARK, }; -#define INET_DIAG_MAX INET_DIAG_SKV6ONLY +#define INET_DIAG_MAX INET_DIAG_MARK /* INET_DIAG_MEM */ diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5202149cbce0..fcb83b2a61f0 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -98,6 +98,7 @@ static size_t inet_sk_attr_size(void) + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + nla_total_size(1) /* INET_DIAG_TOS */ + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(4) /* INET_DIAG_MARK */ + nla_total_size(sizeof(struct inet_diag_meminfo)) + nla_total_size(sizeof(struct inet_diag_msg)) + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) @@ -110,7 +111,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { const struct inet_sock *inet = inet_sk(sk); const struct tcp_congestion_ops *ca_ops; @@ -160,6 +162,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, } #endif + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark)) + goto errout; + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); @@ -258,10 +263,11 @@ static int inet_csk_diag_fill(struct sock *sk, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { - return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, - user_ns, portid, seq, nlmsg_flags, unlh); + return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns, + portid, seq, nlmsg_flags, unlh, net_admin); } static int inet_twsk_diag_fill(struct sock *sk, @@ -303,8 +309,9 @@ static int inet_twsk_diag_fill(struct sock *sk, static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, bool net_admin) { + struct request_sock *reqsk = inet_reqsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; @@ -318,7 +325,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, inet_diag_msg_common_fill(r, sk); r->idiag_state = TCP_SYN_RECV; r->idiag_timer = 1; 
- r->idiag_retrans = inet_reqsk(sk)->num_retrans; + r->idiag_retrans = reqsk->num_retrans; BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != offsetof(struct sock, sk_cookie)); @@ -330,6 +337,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, r->idiag_uid = 0; r->idiag_inode = 0; + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + inet_rsk(reqsk)->ir_mark)) + return -EMSGSIZE; + nlmsg_end(skb, nlh); return 0; } @@ -338,7 +349,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, const struct inet_diag_req_v2 *r, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, bool net_admin) { if (sk->sk_state == TCP_TIME_WAIT) return inet_twsk_diag_fill(sk, skb, portid, seq, @@ -346,10 +357,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, if (sk->sk_state == TCP_NEW_SYN_RECV) return inet_req_diag_fill(sk, skb, portid, seq, - nlmsg_flags, unlh); + nlmsg_flags, unlh, net_admin); return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, - nlmsg_flags, unlh); + nlmsg_flags, unlh, net_admin); } struct sock *inet_diag_find_one_icsk(struct net *net, @@ -416,7 +427,8 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, err = sk_diag_fill(sk, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); nlmsg_free(rep); @@ -777,7 +789,8 @@ static int inet_csk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, - const struct nlattr *bc) + const struct nlattr *bc, + bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; @@ -785,7 +798,8 @@ static int inet_csk_diag_dump(struct sock *sk, return inet_csk_diag_fill(sk, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, + net_admin); } static void twsk_build_assert(void) @@ -821,6 +835,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct net *net = sock_net(skb->sk); int i, num, s_i, s_num; u32 idiag_states = r->idiag_states; + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; @@ -862,7 +877,8 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, cb->args[3] > 0) goto next_listen; - if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { + if (inet_csk_diag_dump(sk, skb, cb, r, + bc, net_admin) < 0) { spin_unlock_bh(&ilb->lock); goto done; } @@ -930,7 +946,7 @@ skip_listen_ht: sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh); + cb->nlh, net_admin); if (res < 0) { spin_unlock_bh(lock); goto done; diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 39e0b8347bd2..092aa60e8b92 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -20,7 +20,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, - struct nlattr *bc) + struct nlattr *bc, bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; @@ -28,7 +28,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, return inet_sk_diag_fill(sk, NULL, skb, req, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 
NLM_F_MULTI, cb->nlh); + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin); } static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, @@ -75,7 +75,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, err = inet_sk_diag_fill(sk, NULL, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); @@ -98,6 +99,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, { int num, s_num, slot, s_slot; struct net *net = sock_net(skb->sk); + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -132,7 +134,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, r->id.idiag_dport) goto next; - if (sk_diag_dump(sk, skb, cb, r, bc) < 0) { + if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } From c47e4a12f511721dab3d2407d18e0e1a6b7b0e1c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Dec 2015 12:44:36 +0000 Subject: [PATCH 0344/3220] UPSTREAM: arm64: mm: remove pointless PAGE_MASKing As pgd_offset{,_k} shift the input address by PGDIR_SHIFT, the sub-page bits will always be shifted out. There is no need to apply PAGE_MASK before this. Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit e2c30ee320eb96304896c7ab84499e5bc5e5fb6e) Signed-off-by: Jeff Vander Stoep Change-Id: I86d3438aecf7295d5895e1430c1e19fbc82c9719 --- arch/arm64/mm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 4c381f0e77a7..7b6e58a58bf3 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -288,7 +288,7 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - __create_mapping(&init_mm, pgd_offset_k(virt & PAGE_MASK), phys, virt, + __create_mapping(&init_mm, pgd_offset_k(virt), phys, virt, size, prot, early_alloc); } @@ -309,7 +309,7 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, return; } - return __create_mapping(&init_mm, pgd_offset_k(virt & PAGE_MASK), + return __create_mapping(&init_mm, pgd_offset_k(virt), phys, virt, size, prot, late_alloc); } From 31400ef82e7ae74663e9ca8e041516ede5e617e4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Dec 2015 12:44:37 +0000 Subject: [PATCH 0345/3220] BACKPORT: arm64: Remove redundant padding from linker script Currently we place an ALIGN_DEBUG_RO between text and data for the .text and .init sections, and depending on configuration each of these may result in up to SECTION_SIZE bytes worth of padding (for DEBUG_RODATA_ALIGN). We make no distinction between the text and data in each of these sections at any point when creating the initial page tables in head.S. We also make no distinction when modifying the tables; __map_memblock, fixup_executable, mark_rodata_ro, and fixup_init only work at section granularity. Thus this padding is unnecessary. For the split between init text and data we impose a minimum alignment of 16 bytes, but this is also unnecessary.
The init data is output immediately after the padding before any symbols are defined, so this is not required to keep a symbol for a linker section array correctly associated with the data. Any objects within the section will be given at least their usual alignment regardless. This patch removes the redundant padding. Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 5b28cd9d084eca8ddc46270d2720305bfd40e348) Signed-off-by: Jeff Vander Stoep Change-Id: I5540ba1f4d90e2ae8fafa22e98c389bc4e975ac7 --- arch/arm64/kernel/vmlinux.lds.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index f472809a7b45..1cc195872a1f 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -113,7 +113,6 @@ SECTIONS *(.got) /* Global offset table */ } - ALIGN_DEBUG_RO _etext = .; /* End of text section */ RO_DATA(PAGE_SIZE) @@ -129,7 +128,6 @@ SECTIONS ARM_EXIT_KEEP(EXIT_TEXT) } - ALIGN_DEBUG_RO_MIN(16) .init.data : { INIT_DATA INIT_SETUP(16) From 8ace694d891ede66ac585fa12aa76cb9c5530427 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Dec 2015 12:44:38 +0000 Subject: [PATCH 0346/3220] UPSTREAM: arm64: mm: fold alternatives into .init Currently we treat the alternatives separately from other data that's only used during initialisation, using separate .altinstructions and .altinstr_replacement linker sections. These are freed for general allocation separately from .init*. This is problematic as: * We do not remove execute permissions, as we do for .init, leaving the memory executable. * We pad between them, making the kernel Image binary up to PAGE_SIZE bytes larger than necessary. This patch moves the two sections into the contiguous region used for .init*. This saves some memory, ensures that we remove execute permissions, and allows us to remove some code made redundant by this reorganisation.
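[Editorial aside, not part of the patch: for context on why no explicit free is needed once these sections live in .init, recall what an alternatives record is. Below is a simplified model of the one-shot boot-time walk; the struct layout is abridged from the arm64 header, the offsets are relative to the field that stores them, and cpus_have_cap() is stubbed out. After this loop runs once at boot, the records are dead data, so freeing them along with the rest of .init is sufficient.]

    #include <stdint.h>
    #include <string.h>

    struct alt_instr {
        int32_t  orig_offset;  /* offset to original instruction(s) */
        int32_t  alt_offset;   /* offset to replacement instruction(s) */
        uint16_t cpufeature;   /* feature bit that enables this entry */
        uint8_t  orig_len;
        uint8_t  alt_len;
    };

    static int cpus_have_cap(uint16_t feature) { (void)feature; return 1; }

    /* Walk every record once; patch in the replacement if the cpu has
     * the feature. (The kernel also fixes up relative branches.) */
    static void apply_alternatives_model(struct alt_instr *begin, struct alt_instr *end)
    {
        struct alt_instr *a;

        for (a = begin; a < end; a++) {
            uint8_t *orig = (uint8_t *)&a->orig_offset + a->orig_offset;
            uint8_t *alt  = (uint8_t *)&a->alt_offset  + a->alt_offset;

            if (!cpus_have_cap(a->cpufeature))
                continue;
            memcpy(orig, alt, a->alt_len);
        }
    }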
Signed-off-by: Mark Rutland Cc: Andre Przywara Cc: Catalin Marinas Cc: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 9aa4ec1571da62366cfddc20f3b923609604fe63) Signed-off-by: Jeff Vander Stoep Change-Id: I3fee118ead5c73ade50ea10d436881c9424a549c --- arch/arm64/include/asm/alternative.h | 1 - arch/arm64/kernel/alternative.c | 6 ------ arch/arm64/kernel/vmlinux.lds.S | 5 ++--- arch/arm64/mm/init.c | 1 - 4 files changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index d56ec0715157..e4962f04201e 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -19,7 +19,6 @@ struct alt_instr { void __init apply_alternatives_all(void); void apply_alternatives(void *start, size_t length); -void free_alternatives_memory(void); #define ALTINSTR_ENTRY(feature) \ " .word 661b - .\n" /* label */ \ diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index ab9db0e9818c..d2ee1b21a10d 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -158,9 +158,3 @@ void apply_alternatives(void *start, size_t length) __apply_alternatives(®ion); } - -void free_alternatives_memory(void) -{ - free_reserved_area(__alt_instructions, __alt_instructions_end, - 0, "alternatives"); -} diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 1cc195872a1f..1a056bb26633 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -142,9 +142,6 @@ SECTIONS PERCPU_SECTION(L1_CACHE_BYTES) - . = ALIGN(PAGE_SIZE); - __init_end = .; - . = ALIGN(4); .altinstructions : { __alt_instructions = .; @@ -156,6 +153,8 @@ SECTIONS } . = ALIGN(PAGE_SIZE); + __init_end = .; + _data = .; _sdata = .; RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index d43715a14383..f13078bbf66e 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -362,7 +362,6 @@ void free_initmem(void) { fixup_init(); free_initmem_default(0); - free_alternatives_memory(); } #ifdef CONFIG_BLK_DEV_INITRD From c5ca0f872092c8c7fbcc903be244b63a33afd585 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 10 Dec 2015 16:54:32 +0000 Subject: [PATCH 0347/3220] UPSTREAM: arm64: cmpxchg: Don't incldue linux/mmdebug.h The arm64 asm/cmpxchg.h includes linux/mmdebug.h but doesn't so far as I can tell actually use anything from it. Removing the inclusion reduces spurious header dependency rebuilds and also avoids issues with recursive inclusions of headers causing build breaks due to attempts to use things before they are defined if linux/mmdebug.h starts pulling in more low level headers. 
Such errors have happened in -next recently, for example: In file included from include/linux/completion.h:11:0, from include/linux/rcupdate.h:43, from include/linux/tracepoint.h:19, from include/linux/mmdebug.h:6, from ./arch/arm64/include/asm/cmpxchg.h:22, from ./arch/arm64/include/asm/atomic.h:41, from include/linux/atomic.h:4, from include/linux/spinlock.h:406, from include/linux/seqlock.h:35, from include/linux/time.h:5, from include/uapi/linux/timex.h:56, from include/linux/timex.h:56, from include/linux/sched.h:19, from arch/arm64/kernel/asm-offsets.c:21: include/linux/wait.h: In function 'wait_on_atomic_t': include/linux/wait.h:1218:2: error: implicit declaration of function 'atomic_read' [-Werror=implicit-function-declaration] if (atomic_read(val) == 0) Signed-off-by: Mark Brown Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 4a6ccf30263f4e265c0f171561bf4c40bed5f273) Signed-off-by: Jeff Vander Stoep Change-Id: Idf44176dad0abc11e67b4e416b02a3fba14f3f1b --- arch/arm64/include/asm/cmpxchg.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h index 9ea611ea69df..510c7b404454 100644 --- a/arch/arm64/include/asm/cmpxchg.h +++ b/arch/arm64/include/asm/cmpxchg.h @@ -19,7 +19,6 @@ #define __ASM_CMPXCHG_H #include -#include #include #include From 2393897c7c7e10c2e8fe7d1c32466cbb885c9047 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 11 Dec 2015 11:04:31 +0000 Subject: [PATCH 0348/3220] UPSTREAM: arm64: mm: place __cpu_setup in .text We drop __cpu_setup in .text.init, which ends up being part of .text. The .text.init section was a legacy section name which has been unused elsewhere for a long time. The ".text.init" name is misleading if read as a synonym for ".init.text". Any CPU may execute __cpu_setup before turning the MMU on, so it should simply live in .text. Remove the pointless section assignment. This will leave __cpu_setup in the .text section. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit f00083cae331e5d3eecade6b4fdc35d0825e73ef) Signed-off-by: Jeff Vander Stoep Change-Id: I2e9b154cd6de92662c70c2b957479448252c661a --- arch/arm64/mm/proc.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index cacecc4ad3e5..b6f9053ab184 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -139,8 +139,6 @@ ENTRY(cpu_do_switch_mm) ret ENDPROC(cpu_do_switch_mm) - .section ".text.init", #alloc, #execinstr - /* * __cpu_setup * From 9e4d16282cb949c1ef3d9736de79c8155cb24d7b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 17 Nov 2015 14:45:47 +0000 Subject: [PATCH 0349/3220] UPSTREAM: arm64: Documentation: add list of software workarounds for errata It's not immediately obvious which hardware errata are worked around in the Linux kernel for an arbitrary kernel tree, so add a file to keep track of what we're working around. 
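For readers unfamiliar with how a registry entry like the ones in this file translates into kernel behaviour, the sketch below models the detection step in standalone C. It is a toy: the mask, part number and sample MIDR value are simplified stand-ins rather than the kernel's real MIDR helpers, which also match variant and revision ranges.

    #include <stdint.h>
    #include <stdio.h>

    #define MIDR_PART_MASK  0x0000fff0u  /* bits [15:4]: part number */
    #define PART_CORTEX_A57 0x0000d070u

    struct erratum { const char *id; uint32_t part; };

    static int cpu_affected(uint32_t midr, const struct erratum *e)
    {
        return (midr & MIDR_PART_MASK) == e->part;
    }

    int main(void)
    {
        const struct erratum e832075 = { "#832075", PART_CORTEX_A57 };
        uint32_t midr = 0x411fd070; /* hypothetical Cortex-A57 MIDR_EL1 value */

        if (cpu_affected(midr, &e832075))
            printf("patch in workaround for erratum %s\n", e832075.id);
        return 0;
    }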
Acked-by: Catalin Marinas Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 9cb9c9e5ba8453537e8e645318edf231fe54eaf9) Signed-off-by: Jeff Vander Stoep Change-Id: I139972ff6e10e12c4b4f27cd047f55b3dd8b4118 --- Documentation/arm64/silicon-errata.txt | 58 ++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 Documentation/arm64/silicon-errata.txt diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt new file mode 100644 index 000000000000..58b71ddf9b60 --- /dev/null +++ b/Documentation/arm64/silicon-errata.txt @@ -0,0 +1,58 @@ + Silicon Errata and Software Workarounds + ======================================= + +Author: Will Deacon +Date : 27 November 2015 + +It is an unfortunate fact of life that hardware is often produced with +so-called "errata", which can cause it to deviate from the architecture +under specific circumstances. For hardware produced by ARM, these +errata are broadly classified into the following categories: + + Category A: A critical error without a viable workaround. + Category B: A significant or critical error with an acceptable + workaround. + Category C: A minor error that is not expected to occur under normal + operation. + +For more information, consult one of the "Software Developers Errata +Notice" documents available on infocenter.arm.com (registration +required). + +As far as Linux is concerned, Category B errata may require some special +treatment in the operating system. For example, avoiding a particular +sequence of code, or configuring the processor in a particular way. A +less common situation may require similar actions in order to declassify +a Category A erratum into a Category C erratum. These are collectively +known as "software workarounds" and are only required in the minority of +cases (e.g. those cases that both require a non-secure workaround *and* +can be triggered by Linux). + +For software workarounds that may adversely impact systems unaffected by +the erratum in question, a Kconfig entry is added under "Kernel +Features" -> "ARM errata workarounds via the alternatives framework". +These are enabled by default and patched in at runtime when an affected +CPU is detected. For less-intrusive workarounds, a Kconfig option is not +available and the code is structured (preferably with a comment) in such +a way that the erratum will not be hit. + +This approach can make it slightly onerous to determine exactly which +errata are worked around in an arbitrary kernel source tree, so this +file acts as a registry of software workarounds in the Linux Kernel and +will be updated when new workarounds are committed and backported to +stable kernels. 
+ +| Implementor | Component | Erratum ID | Kconfig | ++----------------+-----------------+-----------------+-------------------------+ +| ARM | Cortex-A53 | #826319 | ARM64_ERRATUM_826319 | +| ARM | Cortex-A53 | #827319 | ARM64_ERRATUM_827319 | +| ARM | Cortex-A53 | #824069 | ARM64_ERRATUM_824069 | +| ARM | Cortex-A53 | #819472 | ARM64_ERRATUM_819472 | +| ARM | Cortex-A53 | #845719 | ARM64_ERRATUM_845719 | +| ARM | Cortex-A53 | #843419 | ARM64_ERRATUM_843419 | +| ARM | Cortex-A57 | #832075 | ARM64_ERRATUM_832075 | +| ARM | Cortex-A57 | #852523 | N/A | +| ARM | Cortex-A57 | #834220 | ARM64_ERRATUM_834220 | +| | | | | +| Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 | +| Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 | From 41b7e19e9b27266024dc0565426a91d922c3d96b Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 15 Dec 2015 11:21:25 +0000 Subject: [PATCH 0350/3220] UPSTREAM: arm64: reduce stack use in irq_handler The code for switching to irq_stack stores three pieces of information on the stack, fp+lr, as a fake stack frame (that lets us walk back onto the interrupted tasks stack frame), and the address of the struct pt_regs that contains the register values from kernel entry. (which dump_backtrace() will print in any stack trace). To reduce this, we store fp, and the pointer to the struct pt_regs. unwind_frame() can recognise this as the irq_stack dummy frame, (as it only appears at the top of the irq_stack), and use the struct pt_regs values to find the missing interrupted link-register. Suggested-by: Will Deacon Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 971c67ce37cfeeaf560e792a2c3bc21d8b67163a) Signed-off-by: Jeff Vander Stoep Change-Id: I84cbb04857a441083d331e875c3e228d24ec2276 --- arch/arm64/include/asm/irq.h | 11 ++++------- arch/arm64/kernel/entry.S | 12 +++++++----- arch/arm64/kernel/stacktrace.c | 19 ++++++++++++++++--- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 877c7e358384..3bece4379bd9 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -25,16 +25,13 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); * ------------ * | | <- irq_stack_ptr * top ------------ - * | elr_el1 | + * | x19 | <- irq_stack_ptr - 0x08 * ------------ * | x29 | <- irq_stack_ptr - 0x10 * ------------ - * | xzr | - * ------------ - * | x19 | <- irq_stack_ptr - 0x20 - * ------------ * - * where x19 holds a copy of the task stack pointer. + * where x19 holds a copy of the task stack pointer where the struct pt_regs + * from kernel_entry can be found. * */ #define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) @@ -43,7 +40,7 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); * The offset from irq_stack_ptr where entry.S will store the original * stack pointer. Used by unwind_frame() and dump_backtrace(). 
*/ -#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x20)); +#define IRQ_STACK_TO_TASK_STACK(ptr) (*((unsigned long *)((ptr) - 0x08))) extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2284c296e3f7..0667fb7d8bb1 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -178,7 +178,7 @@ alternative_endif mrs \rd, sp_el0 .endm - .macro irq_stack_entry, dummy_lr + .macro irq_stack_entry mov x19, sp // preserve the original sp this_cpu_ptr irq_stack, x25, x26 @@ -196,10 +196,12 @@ alternative_endif add x26, x25, x26 mov sp, x26 - /* Add a dummy stack frame */ - stp x29, \dummy_lr, [sp, #-16]! // dummy stack frame + /* + * Add a dummy stack frame, this non-standard format is fixed up + * by unwind_frame() + */ + stp x29, x19, [sp, #-16]! mov x29, sp - stp x19, xzr, [sp, #-16]! 9998: .endm @@ -229,7 +231,7 @@ tsk .req x28 // current thread_info .macro irq_handler ldr_l x1, handle_arch_irq mov x0, sp - irq_stack_entry x22 + irq_stack_entry blr x1 irq_stack_exit .endm diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index d916d5b6aef6..b9fd3a8abfc1 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -70,17 +70,30 @@ int notrace unwind_frame(struct stackframe *frame) * Check whether we are going to walk through from interrupt stack * to task stack. * If we reach the end of the stack - and its an interrupt stack, - * read the original task stack pointer from the dummy frame. + * unpack the dummy frame to find the original elr. * * Check the frame->fp we read from the bottom of the irq_stack, * and the original task stack pointer are both in current->stack. */ if (frame->sp == irq_stack_ptr) { + struct pt_regs *irq_args; unsigned long orig_sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); - if(object_is_on_stack((void *)orig_sp) && - object_is_on_stack((void *)frame->fp)) + if (object_is_on_stack((void *)orig_sp) && + object_is_on_stack((void *)frame->fp)) { frame->sp = orig_sp; + + /* orig_sp is the saved pt_regs, find the elr */ + irq_args = (struct pt_regs *)orig_sp; + frame->pc = irq_args->pc; + } else { + /* + * This frame has a non-standard format, and we + * didn't fix it, because the data looked wrong. + * Refuse to output this frame. + */ + return -EINVAL; + } } return 0; From b10551a35399a277afe9b2e28266e2a00a5f6253 Mon Sep 17 00:00:00 2001 From: Ashok Kumar Date: Thu, 17 Dec 2015 01:38:31 -0800 Subject: [PATCH 0351/3220] UPSTREAM: arm64: Defer dcache flush in __cpu_copy_user_page Defer dcache flushing to __sync_icache_dcache by calling flush_dcache_page which clears PG_dcache_clean flag. 
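The deferral pattern is worth spelling out. In the sketch below (standalone C with invented types; the two functions only mimic the roles of flush_dcache_page() and __sync_icache_dcache() named above), the copy path merely marks the page as needing maintenance, and the flush happens later, only for pages that actually need it:

    #include <stdbool.h>
    #include <stdio.h>

    struct page_state { bool dcache_clean; };

    static void copy_user_page(struct page_state *pg)
    {
        /* the actual copy_page(kto, kfrom) would happen here */
        pg->dcache_clean = false; /* flush_dcache_page() analogue */
    }

    static void sync_icache_dcache(struct page_state *pg)
    {
        if (!pg->dcache_clean) { /* test_and_set_bit(PG_dcache_clean) analogue */
            puts("clean D-cache and invalidate I-cache for this page");
            pg->dcache_clean = true;
        }
    }

    int main(void)
    {
        struct page_state pg = { .dcache_clean = true };

        copy_user_page(&pg);     /* no cache maintenance on the copy path */
        sync_icache_dcache(&pg); /* deferred maintenance happens here */
        return 0;
    }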
Acked-by: Catalin Marinas Signed-off-by: Ashok Kumar Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit e6b1185f77351aa154e63bd54b05d07ff99d4ffa) Signed-off-by: Jeff Vander Stoep Change-Id: I2e02ce2f43f68287337bed30e3c3455c0eee4164 --- arch/arm64/mm/copypage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index 13bbc3be6f5a..22e4cb4d6f53 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -24,8 +24,9 @@ void __cpu_copy_user_page(void *kto, const void *kfrom, unsigned long vaddr) { + struct page *page = virt_to_page(kto); copy_page(kto, kfrom); - __flush_dcache_area(kto, PAGE_SIZE); + flush_dcache_page(page); } EXPORT_SYMBOL_GPL(__cpu_copy_user_page); From 350a6a0fe0fe70d15a59084c8f4863d10271887f Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Wed, 13 Jan 2016 14:50:03 +0000 Subject: [PATCH 0352/3220] BACKPORT: arm64: kernel: fix architected PMU registers unconditional access The Performance Monitors extension is an optional feature of the AArch64 architecture, therefore, in order to access Performance Monitors registers safely, the kernel should detect the architected PMU unit presence through the ID_AA64DFR0_EL1 register PMUVer field before accessing them. This patch implements a guard by reading the ID_AA64DFR0_EL1 register PMUVer field to detect the architected PMU presence and prevent accessing PMU system registers if the Performance Monitors extension is not implemented in the core. Cc: Peter Maydell Cc: Mark Rutland Cc: Fixes: 60792ad349f3 ("arm64: kernel: enforce pmuserenr_el0 initialization and restore") Signed-off-by: Lorenzo Pieralisi Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit f436b2ac90a095746beb6729b8ee8ed87c9eaede) Signed-off-by: Jeff Vander Stoep Change-Id: Ie442b9feba5d143bd6b6c82d70190fc8bc95298d --- arch/arm64/kernel/head.S | 5 +++++ arch/arm64/mm/proc-macros.S | 12 ++++++++++++ arch/arm64/mm/proc.S | 2 ++ 3 files changed, 19 insertions(+) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index b363f340f2c7..17ce7285bb12 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -515,9 +515,14 @@ CPU_LE( movk x0, #0x30d0, lsl #16 ) // Clear EE and E0E on LE systems #endif /* EL2 debug */ + mrs x0, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer + sbfx x0, x0, #8, #4 + cmp x0, #1 + b.lt 4f // Skip if no PMU present mrs x0, pmcr_el0 // Disable debug access traps ubfx x0, x0, #11, #5 // to EL2 and allow access to msr mdcr_el2, x0 // all PMU counters from EL1 +4: /* Stage-2 translation */ msr vttbr_el2, xzr diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S index 4c4d93c4bf65..d69dffffaa89 100644 --- a/arch/arm64/mm/proc-macros.S +++ b/arch/arm64/mm/proc-macros.S @@ -62,3 +62,15 @@ bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH #endif .endm + +/* + * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present + */ + .macro reset_pmuserenr_el0, tmpreg + mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer + sbfx \tmpreg, \tmpreg, #8, #4 + cmp \tmpreg, #1 // Skip if no PMU present + b.lt 9000f + msr pmuserenr_el0, xzr // Disable PMU access from EL0 +9000: + .endm diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index b6f9053ab184..c164d2cb35c0 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -117,6 +117,7 @@ ENTRY(cpu_do_resume) */ ubfx x11, x11, 
#1, #1 msr oslar_el1, x11 + reset_pmuserenr_el0 x0 // Disable PMU access from EL0 mov x0, x12 dsb nsh // Make sure local tlb invalidation completed isb @@ -153,6 +154,7 @@ ENTRY(__cpu_setup) msr cpacr_el1, x0 // Enable FP/ASIMD mov x0, #1 << 12 // Reset mdscr_el1 and disable msr mdscr_el1, x0 // access to the DCC from EL0 + reset_pmuserenr_el0 x0 // Disable PMU access from EL0 /* * Memory region attributes for LPAE: * From d378add20954990566e91f914105e2638564d8ba Mon Sep 17 00:00:00 2001 From: Ashok Kumar Date: Thu, 17 Dec 2015 01:38:32 -0800 Subject: [PATCH 0353/3220] BACKPORT: arm64: Use PoU cache instr for I/D coherency In systems with three levels of cache (PoU at L1 and PoC at L3), PoC cache flush instructions flush L2 and L3 caches, which could affect performance. For cache flushes for I and D coherency, PoU should suffice. So changing all I and D coherency related cache flushes to PoU. Introduced a new __clean_dcache_area_pou API for dcache flush till PoU and provided a common macro for __flush_dcache_area and __clean_dcache_area_pou. Also, now in __sync_icache_dcache, icache invalidation for non-aliasing VIPT icache is done only for that particular page instead of the earlier __flush_icache_all. Reviewed-by: Catalin Marinas Reviewed-by: Mark Rutland Signed-off-by: Ashok Kumar Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 0a28714c53fd4f7aea709be7577dfbe0095c8c3e) Signed-off-by: Jeff Vander Stoep Change-Id: I64f065140d5e8783e91ed53ae9c7a2e33a3e515a --- arch/arm64/include/asm/cacheflush.h | 1 + arch/arm64/mm/cache.S | 28 ++++++++++++++---------- arch/arm64/mm/flush.c | 33 ++++++++++++++++------------- arch/arm64/mm/proc-macros.S | 22 +++++++++++++++++++ 4 files changed, 58 insertions(+), 26 deletions(-) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 54efedaf331f..7fc294c3bc5b 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -68,6 +68,7 @@ extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void flush_icache_range(unsigned long start, unsigned long end); extern void __flush_dcache_area(void *addr, size_t len); +extern void __clean_dcache_area_pou(void *addr, size_t len); extern long __flush_cache_user_range(unsigned long start, unsigned long end); static inline void flush_cache_mm(struct mm_struct *mm) diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index cfa44a6adc0a..6df07069a025 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -81,25 +81,31 @@ ENDPROC(__flush_cache_user_range) /* * __flush_dcache_area(kaddr, size) * - * Ensure that the data held in the page kaddr is written back to the - * page in question. + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are cleaned and invalidated to the PoC. * * - kaddr - kernel address * - size - size in question */ ENTRY(__flush_dcache_area) - dcache_line_size x2, x3 - add x1, x0, x1 - sub x3, x2, #1 - bic x0, x0, x3 -1: dc civac, x0 // clean & invalidate D line / unified line - add x0, x0, x2 - cmp x0, x1 - b.lo 1b - dsb sy + dcache_by_line_op civac, sy, x0, x1, x2, x3 ret ENDPIPROC(__flush_dcache_area) +/* + * __clean_dcache_area_pou(kaddr, size) + * + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are cleaned to the PoU.
+ * + * - kaddr - kernel address + * - size - size in question + */ +ENTRY(__clean_dcache_area_pou) + dcache_by_line_op cvau, ish, x0, x1, x2, x3 + ret +ENDPROC(__clean_dcache_area_pou) + /* * __inval_cache_range(start, end) * - start - start address of region diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index c26b804015e8..46649d6e6c5a 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -34,19 +34,24 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, __flush_icache_all(); } +static void sync_icache_aliases(void *kaddr, unsigned long len) +{ + unsigned long addr = (unsigned long)kaddr; + + if (icache_is_aliasing()) { + __clean_dcache_area_pou(kaddr, len); + __flush_icache_all(); + } else { + flush_icache_range(addr, addr + len); + } +} + static void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, unsigned long uaddr, void *kaddr, unsigned long len) { - if (vma->vm_flags & VM_EXEC) { - unsigned long addr = (unsigned long)kaddr; - if (icache_is_aliasing()) { - __flush_dcache_area(kaddr, len); - __flush_icache_all(); - } else { - flush_icache_range(addr, addr + len); - } - } + if (vma->vm_flags & VM_EXEC) + sync_icache_aliases(kaddr, len); } /* @@ -74,13 +79,11 @@ void __sync_icache_dcache(pte_t pte, unsigned long addr) if (!page_mapping(page)) return; - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) { - __flush_dcache_area(page_address(page), - PAGE_SIZE << compound_order(page)); + if (!test_and_set_bit(PG_dcache_clean, &page->flags)) + sync_icache_aliases(page_address(page), + PAGE_SIZE << compound_order(page)); + else if (icache_is_aivivt()) __flush_icache_all(); - } else if (icache_is_aivivt()) { - __flush_icache_all(); - } } /* diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S index d69dffffaa89..984edcda1850 100644 --- a/arch/arm64/mm/proc-macros.S +++ b/arch/arm64/mm/proc-macros.S @@ -74,3 +74,25 @@ msr pmuserenr_el0, xzr // Disable PMU access from EL0 9000: .endm + +/* + * Macro to perform a data cache maintenance for the interval + * [kaddr, kaddr + size) + * + * op: operation passed to dc instruction + * domain: domain used in dsb instruction + * kaddr: starting virtual address of the region + * size: size of the region + * Corrupts: kaddr, size, tmp1, tmp2 + */ + .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 + dcache_line_size \tmp1, \tmp2 + add \size, \kaddr, \size + sub \tmp2, \tmp1, #1 + bic \kaddr, \kaddr, \tmp2 +9998: dc \op, \kaddr + add \kaddr, \kaddr, \tmp1 + cmp \kaddr, \size + b.lo 9998b + dsb \domain + .endm From 357838314d69e67ff89db49a7170fa1968638b88 Mon Sep 17 00:00:00 2001 From: David Woods Date: Thu, 17 Dec 2015 14:31:26 -0500 Subject: [PATCH 0354/3220] UPSTREAM: arm64: hugetlb: add support for PTE contiguous bit The arm64 MMU supports a Contiguous bit which is a hint that the TTE is one of a set of contiguous entries which can be cached in a single TLB entry. Supporting this bit adds new intermediate huge page sizes. The set of huge page sizes available depends on the base page size. Without using contiguous pages the huge page sizes are as follows. 4KB: 2MB 1GB 64KB: 512MB With a 4KB granule, the contiguous bit groups together sets of 16 pages and with a 64KB granule it groups sets of 32 pages. This enables two new huge page sizes in each case, so that the full set of available sizes is as follows.
4KB: 64KB 2MB 32MB 1GB 64KB: 2MB 512MB 16GB If a 16KB granule is used then the contiguous bit groups 128 pages at the PTE level and 32 pages at the PMD level. If the base page size is set to 64KB then 2MB pages are enabled by default. It is possible in the future to make 2MB the default huge page size for both 4KB and 64KB granules. Reviewed-by: Chris Metcalf Reviewed-by: Steve Capper Signed-off-by: David Woods Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 66b3923a1a0f77a563b43f43f6ad091354abbfe9) Signed-off-by: Jeff Vander Stoep Change-Id: I5e99c5165bc5eb966adf4d4523632fd9eedd9602 --- arch/arm64/Kconfig | 3 - arch/arm64/include/asm/hugetlb.h | 44 ++-- arch/arm64/include/asm/pgtable-hwdef.h | 18 +- arch/arm64/include/asm/pgtable.h | 10 +- arch/arm64/mm/hugetlbpage.c | 274 ++++++++++++++++++++++++- include/linux/hugetlb.h | 2 - 6 files changed, 313 insertions(+), 38 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 92f234a5579b..ac3dbd933064 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -564,9 +564,6 @@ config HW_PERF_EVENTS config SYS_SUPPORTS_HUGETLBFS def_bool y -config ARCH_WANT_GENERAL_HUGETLB - def_bool y - config ARCH_WANT_HUGE_PMD_SHARE def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index bb4052e85dba..bbc1e35aa601 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -26,36 +26,7 @@ static inline pte_t huge_ptep_get(pte_t *ptep) return *ptep; } -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - ptep_clear_flush(vma, addr, ptep); -} - -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - -static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - return ptep_get_and_clear(mm, addr, ptep); -} - -static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t pte, int dirty) -{ - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); -} static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, @@ -97,4 +68,19 @@ static inline void arch_clear_hugepage_flags(struct page *page) clear_bit(PG_dcache_clean, &page->flags); } +extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, + struct page *page, int writable); +#define arch_make_huge_pte arch_make_huge_pte +extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty); +extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); +extern void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); +extern void huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); + #endif /* __ASM_HUGETLB_H */ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index d6739e836f7b..5c25b831273d 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h 
@@ -90,7 +90,23 @@ /* * Contiguous page definitions. */ -#define CONT_PTES (_AC(1, UL) << CONT_SHIFT) +#ifdef CONFIG_ARM64_64K_PAGES +#define CONT_PTE_SHIFT 5 +#define CONT_PMD_SHIFT 5 +#elif defined(CONFIG_ARM64_16K_PAGES) +#define CONT_PTE_SHIFT 7 +#define CONT_PMD_SHIFT 5 +#else +#define CONT_PTE_SHIFT 4 +#define CONT_PMD_SHIFT 4 +#endif + +#define CONT_PTES (1 << CONT_PTE_SHIFT) +#define CONT_PTE_SIZE (CONT_PTES * PAGE_SIZE) +#define CONT_PTE_MASK (~(CONT_PTE_SIZE - 1)) +#define CONT_PMDS (1 << CONT_PMD_SHIFT) +#define CONT_PMD_SIZE (CONT_PMDS * PMD_SIZE) +#define CONT_PMD_MASK (~(CONT_PMD_SIZE - 1)) /* the the numerical offset of the PTE within a range of CONT_PTES */ #define CONT_RANGE_OFFSET(addr) (((addr)>>PAGE_SHIFT)&(CONT_PTES-1)) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 824416a219e7..51f382b7e5c7 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -227,7 +227,8 @@ static inline pte_t pte_mkspecial(pte_t pte) static inline pte_t pte_mkcont(pte_t pte) { - return set_pte_bit(pte, __pgprot(PTE_CONT)); + pte = set_pte_bit(pte, __pgprot(PTE_CONT)); + return set_pte_bit(pte, __pgprot(PTE_TYPE_PAGE)); } static inline pte_t pte_mknoncont(pte_t pte) @@ -235,6 +236,11 @@ static inline pte_t pte_mknoncont(pte_t pte) return clear_pte_bit(pte, __pgprot(PTE_CONT)); } +static inline pmd_t pmd_mkcont(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) | PMD_SECT_CONT); +} + static inline void set_pte(pte_t *ptep, pte_t pte) { *ptep = pte; @@ -308,7 +314,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, /* * Hugetlb definitions. */ -#define HUGE_MAX_HSTATE 2 +#define HUGE_MAX_HSTATE 4 #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 383b03ff38f8..82d607c3614e 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -41,17 +41,289 @@ int pud_huge(pud_t pud) #endif } +static int find_num_contig(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, size_t *pgsize) +{ + pgd_t *pgd = pgd_offset(mm, addr); + pud_t *pud; + pmd_t *pmd; + + *pgsize = PAGE_SIZE; + if (!pte_cont(pte)) + return 1; + if (!pgd_present(*pgd)) { + VM_BUG_ON(!pgd_present(*pgd)); + return 1; + } + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) { + VM_BUG_ON(!pud_present(*pud)); + return 1; + } + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + VM_BUG_ON(!pmd_present(*pmd)); + return 1; + } + if ((pte_t *)pmd == ptep) { + *pgsize = PMD_SIZE; + return CONT_PMDS; + } + return CONT_PTES; +} + +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + size_t pgsize; + int i; + int ncontig = find_num_contig(mm, addr, ptep, pte, &pgsize); + unsigned long pfn; + pgprot_t hugeprot; + + if (ncontig == 1) { + set_pte_at(mm, addr, ptep, pte); + return; + } + + pfn = pte_pfn(pte); + hugeprot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); + for (i = 0; i < ncontig; i++) { + pr_debug("%s: set pte %p to 0x%llx\n", __func__, ptep, + pte_val(pfn_pte(pfn, hugeprot))); + set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + ptep++; + pfn += pgsize >> PAGE_SHIFT; + addr += pgsize; + } +} + +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) +{ + pgd_t *pgd; + pud_t *pud; + pte_t *pte = NULL; + + pr_debug("%s: addr:0x%lx sz:0x%lx\n", __func__, addr, sz); + pgd = pgd_offset(mm, 
addr); + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return NULL; + + if (sz == PUD_SIZE) { + pte = (pte_t *)pud; + } else if (sz == (PAGE_SIZE * CONT_PTES)) { + pmd_t *pmd = pmd_alloc(mm, pud, addr); + + WARN_ON(addr & (sz - 1)); + /* + * Note that if this code were ever ported to the + * 32-bit arm platform then it will cause trouble in + * the case where CONFIG_HIGHPTE is set, since there + * will be no pte_unmap() to correspond with this + * pte_alloc_map(). + */ + pte = pte_alloc_map(mm, NULL, pmd, addr); + } else if (sz == PMD_SIZE) { + if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && + pud_none(*pud)) + pte = huge_pmd_share(mm, addr, pud); + else + pte = (pte_t *)pmd_alloc(mm, pud, addr); + } else if (sz == (PMD_SIZE * CONT_PMDS)) { + pmd_t *pmd; + + pmd = pmd_alloc(mm, pud, addr); + WARN_ON(addr & (sz - 1)); + return (pte_t *)pmd; + } + + pr_debug("%s: addr:0x%lx sz:0x%lx ret pte=%p/0x%llx\n", __func__, addr, + sz, pte, pte_val(*pte)); + return pte; +} + +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + pte_t *pte = NULL; + + pgd = pgd_offset(mm, addr); + pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd); + if (!pgd_present(*pgd)) + return NULL; + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + return NULL; + + if (pud_huge(*pud)) + return (pte_t *)pud; + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return NULL; + + if (pte_cont(pmd_pte(*pmd))) { + pmd = pmd_offset( + pud, (addr & CONT_PMD_MASK)); + return (pte_t *)pmd; + } + if (pmd_huge(*pmd)) + return (pte_t *)pmd; + pte = pte_offset_kernel(pmd, addr); + if (pte_present(*pte) && pte_cont(*pte)) { + pte = pte_offset_kernel( + pmd, (addr & CONT_PTE_MASK)); + return pte; + } + return NULL; +} + +pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, + struct page *page, int writable) +{ + size_t pagesize = huge_page_size(hstate_vma(vma)); + + if (pagesize == CONT_PTE_SIZE) { + entry = pte_mkcont(entry); + } else if (pagesize == CONT_PMD_SIZE) { + entry = pmd_pte(pmd_mkcont(pte_pmd(entry))); + } else if (pagesize != PUD_SIZE && pagesize != PMD_SIZE) { + pr_warn("%s: unrecognized huge page size 0x%lx\n", + __func__, pagesize); + } + return entry; +} + +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte; + + if (pte_cont(*ptep)) { + int ncontig, i; + size_t pgsize; + pte_t *cpte; + bool is_dirty = false; + + cpte = huge_pte_offset(mm, addr); + ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); + /* save the 1st pte to return */ + pte = ptep_get_and_clear(mm, addr, cpte); + for (i = 1; i < ncontig; ++i) { + /* + * If HW_AFDBM is enabled, then the HW could + * turn on the dirty bit for any of the page + * in the set, so check them all. 
+ */ + ++cpte; + if (pte_dirty(ptep_get_and_clear(mm, addr, cpte))) + is_dirty = true; + } + if (is_dirty) + return pte_mkdirty(pte); + else + return pte; + } else { + return ptep_get_and_clear(mm, addr, ptep); + } +} + +int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty) +{ + pte_t *cpte; + + if (pte_cont(pte)) { + int ncontig, i, changed = 0; + size_t pgsize = 0; + unsigned long pfn = pte_pfn(pte); + /* Select all bits except the pfn */ + pgprot_t hugeprot = + __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ + pte_val(pte)); + + cpte = huge_pte_offset(vma->vm_mm, addr); + pfn = pte_pfn(*cpte); + ncontig = find_num_contig(vma->vm_mm, addr, cpte, + *cpte, &pgsize); + for (i = 0; i < ncontig; ++i, ++cpte) { + changed = ptep_set_access_flags(vma, addr, cpte, + pfn_pte(pfn, + hugeprot), + dirty); + pfn += pgsize >> PAGE_SHIFT; + } + return changed; + } else { + return ptep_set_access_flags(vma, addr, ptep, pte, dirty); + } +} + +void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + if (pte_cont(*ptep)) { + int ncontig, i; + pte_t *cpte; + size_t pgsize = 0; + + cpte = huge_pte_offset(mm, addr); + ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); + for (i = 0; i < ncontig; ++i, ++cpte) + ptep_set_wrprotect(mm, addr, cpte); + } else { + ptep_set_wrprotect(mm, addr, ptep); + } +} + +void huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + if (pte_cont(*ptep)) { + int ncontig, i; + pte_t *cpte; + size_t pgsize = 0; + + cpte = huge_pte_offset(vma->vm_mm, addr); + ncontig = find_num_contig(vma->vm_mm, addr, cpte, + *cpte, &pgsize); + for (i = 0; i < ncontig; ++i, ++cpte) + ptep_clear_flush(vma, addr, cpte); + } else { + ptep_clear_flush(vma, addr, ptep); + } +} + static __init int setup_hugepagesz(char *opt) { unsigned long ps = memparse(opt, &opt); + if (ps == PMD_SIZE) { hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); } else if (ps == PUD_SIZE) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + } else if (ps == (PAGE_SIZE * CONT_PTES)) { + hugetlb_add_hstate(CONT_PTE_SHIFT); + } else if (ps == (PMD_SIZE * CONT_PMDS)) { + hugetlb_add_hstate((PMD_SHIFT + CONT_PMD_SHIFT) - PAGE_SHIFT); } else { - pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20); + pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); return 0; } return 1; } __setup("hugepagesz=", setup_hugepagesz); + +#ifdef CONFIG_ARM64_64K_PAGES +static __init int add_default_hugepagesz(void) +{ + if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL) + hugetlb_add_hstate(CONT_PMD_SHIFT); + return 0; +} +arch_initcall(add_default_hugepagesz); +#endif diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 685c262e0be8..b0eb06423d5e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -96,9 +96,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, struct address_space *mapping, pgoff_t idx, unsigned long address); -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); -#endif extern int hugepages_treat_as_movable; extern int sysctl_hugetlb_shm_group; From 0c6cdfec1241b011aeecb1cc0b4d69f122a8d075 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 18 Dec 2015 16:01:47 +0000 Subject: [PATCH 0355/3220] UPSTREAM: arm64: remove irq_count and do_softirq_own_stack() sysrq_handle_reboot() re-enables interrupts while on the irq stack. 
The irq_stack implementation wrongly assumed this would only ever happen via the softirq path, allowing it to update irq_count late, in do_softirq_own_stack(). This means if an irq occurs in sysrq_handle_reboot(), during emergency_restart() the stack will be corrupted, as irq_count wasn't updated. Lose the optimisation, and instead of moving the adding/subtracting of irq_count into irq_stack_entry/irq_stack_exit, remove it, and compare sp_el0 (struct thread_info) with sp & ~(THREAD_SIZE - 1). This tells us if we are on a task stack, if so, we can safely switch to the irq stack. Finally, remove do_softirq_own_stack(), we don't need it anymore. Reported-by: Will Deacon Signed-off-by: James Morse [will: use get_thread_info macro] Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit d224a69e3d80fe08f285d1f41d21b590bae4fa9f) Signed-off-by: Jeff Vander Stoep Change-Id: I1f613279bf875443b10d65b1cd1ed4a6abfcb605 --- arch/arm64/include/asm/irq.h | 2 -- arch/arm64/kernel/entry.S | 19 +++++++++--------- arch/arm64/kernel/irq.c | 38 +----------------------------------- 3 files changed, 11 insertions(+), 48 deletions(-) diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 3bece4379bd9..b77197d941fc 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -11,8 +11,6 @@ #include #include -#define __ARCH_HAS_DO_SOFTIRQ - struct pt_regs; DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 0667fb7d8bb1..c0db321db7e1 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -181,19 +181,20 @@ alternative_endif .macro irq_stack_entry mov x19, sp // preserve the original sp - this_cpu_ptr irq_stack, x25, x26 - /* - * Check the lowest address on irq_stack for the irq_count value, - * incremented by do_softirq_own_stack if we have re-enabled irqs - * while on the irq_stack. + * Compare sp with the current thread_info, if the top + * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and + * should switch to the irq stack. */ - ldr x26, [x25] - cbnz x26, 9998f // recursive use? + and x25, x19, #~(THREAD_SIZE - 1) + cmp x25, tsk + b.ne 9998f - /* switch to the irq stack */ + this_cpu_ptr irq_stack, x25, x26 mov x26, #IRQ_STACK_START_SP add x26, x25, x26 + + /* switch to the irq stack */ mov sp, x26 /* @@ -405,10 +406,10 @@ el1_irq: bl trace_hardirqs_off #endif + get_thread_info tsk irq_handler #ifdef CONFIG_PREEMPT - get_thread_info tsk ldr w24, [tsk, #TI_PREEMPT] // get preempt count cbnz w24, 1f // preempt count != 0 ldr x0, [tsk, #TI_FLAGS] // get flags diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index ff7ebb710e51..2386b26c0712 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -25,24 +25,14 @@ #include #include #include -#include #include #include unsigned long irq_err_count; -/* - * irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. - * irq_stack[0] is used as irq_count, a non-zero value indicates the stack - * is in use, and el?_irq() shouldn't switch to it. This is used to detect - * recursive use of the irq_stack, it is lazily updated by - * do_softirq_own_stack(), which is called on the irq_stack, before - * re-enabling interrupts to process softirqs. - */ +/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. 
*/ DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); -#define IRQ_COUNT() (*per_cpu(irq_stack, smp_processor_id())) - int arch_show_interrupts(struct seq_file *p, int prec) { show_ipi_list(p, prec); @@ -66,29 +56,3 @@ void __init init_IRQ(void) if (!handle_arch_irq) panic("No interrupt controller found."); } - -/* - * do_softirq_own_stack() is called from irq_exit() before __do_softirq() - * re-enables interrupts, at which point we may re-enter el?_irq(). We - * increase irq_count here so that el1_irq() knows that it is already on the - * irq stack. - * - * Called with interrupts disabled, so we don't worry about moving cpu, or - * being interrupted while modifying irq_count. - * - * This function doesn't actually switch stack. - */ -void do_softirq_own_stack(void) -{ - int cpu = smp_processor_id(); - - WARN_ON_ONCE(!irqs_disabled()); - - if (on_irq_stack(current_stack_pointer, cpu)) { - IRQ_COUNT()++; - __do_softirq(); - IRQ_COUNT()--; - } else { - __do_softirq(); - } -} From cc6027395e152f2a7aee43623bac1d3bdd8f5da0 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Tue, 15 Dec 2015 17:33:39 +0900 Subject: [PATCH 0356/3220] UPSTREAM: arm64: ftrace: modify a stack frame in a safe way Function graph tracer modifies a return address (LR) in a stack frame by calling ftrace_prepare_return() in a traced function's function prologue. The current code does this modification before preserving an original address at ftrace_push_return_trace() and there is always a small window of inconsistency when an interrupt occurs. This doesn't matter, as far as an interrupt stack is introduced, because stack tracer won't be invoked in an interrupt context. But it would be better to proactively minimize such a window by moving the LR modification after ftrace_push_return_trace(). Signed-off-by: AKASHI Takahiro Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 79fdee9b6355c9720f14717e1ad66af51bb331b5) Signed-off-by: Jeff Vander Stoep Change-Id: Ief0a136aa01348b4d0d3f447544f21fd77835c67 --- arch/arm64/kernel/ftrace.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 8f7005bc35bd..ebecf9aa33d1 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -129,23 +129,20 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, * on other archs. It's unlikely on AArch64. */ old = *parent; - *parent = return_hooker; trace.func = self_addr; trace.depth = current->curr_ret_stack + 1; /* Only trace if the calling function expects to */ - if (!ftrace_graph_entry(&trace)) { - *parent = old; + if (!ftrace_graph_entry(&trace)) return; - } err = ftrace_push_return_trace(old, self_addr, &trace.depth, frame_pointer); - if (err == -EBUSY) { - *parent = old; + if (err == -EBUSY) return; - } + else + *parent = return_hooker; } #ifdef CONFIG_DYNAMIC_FTRACE From 6adbc95c63d8b5a2873865ee8b6a4092ac95a0e9 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Tue, 15 Dec 2015 17:33:40 +0900 Subject: [PATCH 0357/3220] UPSTREAM: arm64: pass a task parameter to unwind_frame() Function graph tracer modifies a return address (LR) in a stack frame to hook a function's return. This will result in many useless entries (return_to_handler) showing up in a call stack list. We will fix this problem in a later patch ("arm64: ftrace: fix a stack tracer's output under function graph tracer"). 
But since real return addresses are saved in ret_stack[] array in struct task_struct, unwind functions need to be notified of, in addition to a stack pointer address, which task is being traced in order to find out real return addresses. This patch extends unwind functions' interfaces by adding an extra argument of a pointer to task_struct. Signed-off-by: AKASHI Takahiro Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit fe13f95b720075327a761fe6ddb45b0c90cab504) Signed-off-by: Jeff Vander Stoep Change-Id: I92a9a07468c182d5abbacaa73a90984ab11ad535 --- arch/arm64/include/asm/stacktrace.h | 6 ++++-- arch/arm64/kernel/perf_callchain.c | 2 +- arch/arm64/kernel/process.c | 2 +- arch/arm64/kernel/return_address.c | 2 +- arch/arm64/kernel/stacktrace.c | 8 ++++---- arch/arm64/kernel/time.c | 2 +- arch/arm64/kernel/traps.c | 2 +- 7 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 7318f6d54aa9..6fb61c5090b4 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -16,14 +16,16 @@ #ifndef __ASM_STACKTRACE_H #define __ASM_STACKTRACE_H +struct task_struct; + struct stackframe { unsigned long fp; unsigned long sp; unsigned long pc; }; -extern int unwind_frame(struct stackframe *frame); -extern void walk_stackframe(struct stackframe *frame, +extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); +extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data); #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 3aa74830cc69..797220da912b 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -165,7 +165,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, frame.sp = regs->sp; frame.pc = regs->pc; - walk_stackframe(&frame, callchain_trace, entry); + walk_stackframe(current, &frame, callchain_trace, entry); } unsigned long perf_instruction_pointer(struct pt_regs *regs) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 62068ded1e90..51be6356fb03 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -414,7 +414,7 @@ unsigned long get_wchan(struct task_struct *p) do { if (frame.sp < stack_page || frame.sp >= stack_page + THREAD_SIZE || - unwind_frame(&frame)) + unwind_frame(p, &frame)) return 0; if (!in_sched_functions(frame.pc)) return frame.pc; diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 6c4fd2810ecb..07b37ac05be4 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -44,7 +44,7 @@ void *return_address(unsigned int level) frame.sp = current_stack_pointer; frame.pc = (unsigned long)return_address; /* dummy */ - walk_stackframe(&frame, save_return_addr, &data); + walk_stackframe(current, &frame, save_return_addr, &data); if (!data.level) return data.addr; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index b9fd3a8abfc1..f7ee597ec883 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -36,7 +36,7 @@ * ldp x29, x30, [sp] * add sp, sp, #0x10 */ -int notrace unwind_frame(struct stackframe *frame) +int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) { unsigned long high, low; unsigned long fp = frame->fp; @@ -99,7 
+99,7 @@ int notrace unwind_frame(struct stackframe *frame) return 0; } -void notrace walk_stackframe(struct stackframe *frame, +void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data) { while (1) { @@ -107,7 +107,7 @@ void notrace walk_stackframe(struct stackframe *frame, if (fn(frame, data)) break; - ret = unwind_frame(frame); + ret = unwind_frame(tsk, frame); if (ret < 0) break; } @@ -159,7 +159,7 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) frame.pc = (unsigned long)save_stack_trace_tsk; } - walk_stackframe(&frame, save_trace, &data); + walk_stackframe(tsk, &frame, save_trace, &data); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 13339b6ffc1a..6e5c521f123a 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -53,7 +53,7 @@ unsigned long profile_pc(struct pt_regs *regs) frame.sp = regs->sp; frame.pc = regs->pc; do { - int ret = unwind_frame(&frame); + int ret = unwind_frame(NULL, &frame); if (ret < 0) return 0; } while (in_lock_functions(frame.pc)); diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 8a0084541f84..937008523fa5 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -177,7 +177,7 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) int ret; dump_backtrace_entry(where); - ret = unwind_frame(&frame); + ret = unwind_frame(tsk, &frame); if (ret < 0) break; stack = frame.sp; From 0c078f6a47bb3de392e16788a119d575441c1fb2 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Tue, 15 Dec 2015 17:33:41 +0900 Subject: [PATCH 0358/3220] UPSTREAM: arm64: ftrace: fix a stack tracer's output under function graph tracer Function graph tracer modifies a return address (LR) in a stack frame to hook a function return. This will result in many useless entries (return_to_handler) showing up in a) a stack tracer's output b) perf call graph (with perf record -g) c) dump_backtrace (at panic et al.) For example, in case of a), $ echo function_graph > /sys/kernel/debug/tracing/current_tracer $ echo 1 > /proc/sys/kernel/stack_trace_enabled $ cat /sys/kernel/debug/tracing/stack_trace Depth Size Location (54 entries) ----- ---- -------- 0) 4504 16 gic_raise_softirq+0x28/0x150 1) 4488 80 smp_cross_call+0x38/0xb8 2) 4408 48 return_to_handler+0x0/0x40 3) 4360 32 return_to_handler+0x0/0x40 ... In case of b), $ echo function_graph > /sys/kernel/debug/tracing/current_tracer $ perf record -e mem:XXX:x -ag -- sleep 10 $ perf report ... | | |--0.22%-- 0x550f8 | | | 0x10888 | | | el0_svc_naked | | | sys_openat | | | return_to_handler | | | return_to_handler ... In case of c), $ echo function_graph > /sys/kernel/debug/tracing/current_tracer $ echo c > /proc/sysrq-trigger ... Call trace: [] sysrq_handle_crash+0x24/0x30 [] return_to_handler+0x0/0x40 [] return_to_handler+0x0/0x40 ... This patch replaces such entries with real addresses preserved in current->ret_stack[] at unwind_frame(). This way, we can cover all the cases. 
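The core of the fix is a small substitution in the unwinder, modelled below in self-contained C. All names here (TRAMPOLINE, struct ret_entry, real_pc) are invented for illustration; the real code compares against return_to_handler and indexes current->ret_stack[] via frame->graph:

    #include <stdio.h>

    #define TRAMPOLINE 0xdeadbeefUL /* stands in for return_to_handler */

    struct ret_entry { unsigned long ret; };

    static unsigned long real_pc(unsigned long pc,
                                 const struct ret_entry *ret_stack, int *graph)
    {
        /* a trampoline PC is replaced by the saved real return address */
        if (pc == TRAMPOLINE && ret_stack && *graph >= 0)
            return ret_stack[(*graph)--].ret;
        return pc;
    }

    int main(void)
    {
        const struct ret_entry ret_stack[] = { { 0x1000 }, { 0x2000 } };
        int graph = 1; /* index of the most recently hooked return */

        /* consecutive trampoline frames resolve to distinct real PCs */
        printf("%#lx\n", real_pc(TRAMPOLINE, ret_stack, &graph)); /* 0x2000 */
        printf("%#lx\n", real_pc(TRAMPOLINE, ret_stack, &graph)); /* 0x1000 */
        return 0;
    }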
Reviewed-by: Jungseok Lee Signed-off-by: AKASHI Takahiro [will: fixed minor context changes conflicting with irq stack bits] Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 20380bb390a443b2c5c8800cec59743faf8151b4) Signed-off-by: Jeff Vander Stoep Change-Id: I6360182f8d04fdd2e31c0cb6054aefa2adb216e7 --- arch/arm64/include/asm/ftrace.h | 2 ++ arch/arm64/include/asm/stacktrace.h | 3 +++ arch/arm64/kernel/perf_callchain.c | 3 +++ arch/arm64/kernel/process.c | 3 +++ arch/arm64/kernel/return_address.c | 3 +++ arch/arm64/kernel/stacktrace.c | 17 +++++++++++++++++ arch/arm64/kernel/time.c | 3 +++ arch/arm64/kernel/traps.c | 26 ++++++++++++++++++++------ 8 files changed, 54 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h index c5534facf941..3c60f37e48ab 100644 --- a/arch/arm64/include/asm/ftrace.h +++ b/arch/arm64/include/asm/ftrace.h @@ -28,6 +28,8 @@ struct dyn_arch_ftrace { extern unsigned long ftrace_graph_call; +extern void return_to_handler(void); + static inline unsigned long ftrace_call_adjust(unsigned long addr) { /* diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 6fb61c5090b4..801a16dbbdf6 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -22,6 +22,9 @@ struct stackframe { unsigned long fp; unsigned long sp; unsigned long pc; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + unsigned int graph; +#endif }; extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 797220da912b..ff4665462a02 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -164,6 +164,9 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, frame.fp = regs->regs[29]; frame.sp = regs->sp; frame.pc = regs->pc; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = current->curr_ret_stack; +#endif walk_stackframe(current, &frame, callchain_trace, entry); } diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 51be6356fb03..5f6244526e31 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -410,6 +410,9 @@ unsigned long get_wchan(struct task_struct *p) frame.fp = thread_saved_fp(p); frame.sp = thread_saved_sp(p); frame.pc = thread_saved_pc(p); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = p->curr_ret_stack; +#endif stack_page = (unsigned long)task_stack_page(p); do { if (frame.sp < stack_page || diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 07b37ac05be4..1718706fde83 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -43,6 +43,9 @@ void *return_address(unsigned int level) frame.fp = (unsigned long)__builtin_frame_address(0); frame.sp = current_stack_pointer; frame.pc = (unsigned long)return_address; /* dummy */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = current->curr_ret_stack; +#endif walk_stackframe(current, &frame, save_return_addr, &data); diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index f7ee597ec883..4fad9787ab46 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -17,6 +17,7 @@ */ #include #include +#include #include #include @@ -66,6 +67,19 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) frame->fp = *(unsigned long *)(fp); frame->pc = 
*(unsigned long *)(fp + 8); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (tsk && tsk->ret_stack && + (frame->pc == (unsigned long)return_to_handler)) { + /* + * This is a case where function graph tracer has + * modified a return address (LR) in a stack frame + * to hook a function return. + * So replace it to an original value. + */ + frame->pc = tsk->ret_stack[frame->graph--].ret; + } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* * Check whether we are going to walk through from interrupt stack * to task stack. @@ -158,6 +172,9 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) frame.sp = current_stack_pointer; frame.pc = (unsigned long)save_stack_trace_tsk; } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = tsk->curr_ret_stack; +#endif walk_stackframe(tsk, &frame, save_trace, &data); if (trace->nr_entries < trace->max_entries) diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 6e5c521f123a..59779699a1a4 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -52,6 +52,9 @@ unsigned long profile_pc(struct pt_regs *regs) frame.fp = regs->regs[29]; frame.sp = regs->sp; frame.pc = regs->pc; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = -1; /* no task info */ +#endif do { int ret = unwind_frame(NULL, &frame); if (ret < 0) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 937008523fa5..bdc293f6adc4 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -147,17 +147,14 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; unsigned long irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); + int skip; pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); if (!tsk) tsk = current; - if (regs) { - frame.fp = regs->regs[29]; - frame.sp = regs->sp; - frame.pc = regs->pc; - } else if (tsk == current) { + if (tsk == current) { frame.fp = (unsigned long)__builtin_frame_address(0); frame.sp = current_stack_pointer; frame.pc = (unsigned long)dump_backtrace; @@ -169,14 +166,31 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) frame.sp = thread_saved_sp(tsk); frame.pc = thread_saved_pc(tsk); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = tsk->curr_ret_stack; +#endif + skip = !!regs; pr_emerg("Call trace:\n"); while (1) { unsigned long where = frame.pc; unsigned long stack; int ret; - dump_backtrace_entry(where); + /* skip until specified stack frame */ + if (!skip) { + dump_backtrace_entry(where); + } else if (frame.fp == regs->regs[29]) { + skip = 0; + /* + * Mostly, this is the case where this function is + * called in panic/abort. As exception handler's + * stack frame does not contain the corresponding pc + * at which an exception has taken place, use regs->pc + * instead. + */ + dump_backtrace_entry(regs->pc); + } ret = unwind_frame(tsk, &frame); if (ret < 0) break; From cc68289965af2e593fdf5161f7e6407d48d79191 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 21 Dec 2015 16:44:27 +0000 Subject: [PATCH 0359/3220] UPSTREAM: arm64: traps: address fallout from printk -> pr_* conversion Commit ac7b406c1a9d ("arm64: Use pr_* instead of printk") was a fairly mindless s/printk/pr_*/ change driven by a complaint from checkpatch. 
As is usual with such changes, this has led to some odd behaviour on arm64: * syslog now picks up the "pr_emerg" line from dump_backtrace, but not the actual trace, which leads to a bunch of "kernel:Call trace:" lines in the log * __{pte,pmd,pgd}_error print at KERN_CRIT, as opposed to KERN_ERR which is used by other architectures. This patch restores the original printk behaviour for dump_backtrace and downgrades the pgtable error macros to KERN_ERR. Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit c9cd0ed925c0b927283d4739bfe689eb9d1e9dfd) Signed-off-by: Jeff Vander Stoep Change-Id: Iaf028e5368df4623f7257ef432a7f9da86261609 --- arch/arm64/kernel/traps.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index bdc293f6adc4..cbedd724f48e 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -171,7 +171,7 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) #endif skip = !!regs; - pr_emerg("Call trace:\n"); + printk("Call trace:\n"); while (1) { unsigned long where = frame.pc; unsigned long stack; @@ -482,22 +482,22 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr) void __pte_error(const char *file, int line, unsigned long val) { - pr_crit("%s:%d: bad pte %016lx.\n", file, line, val); + pr_err("%s:%d: bad pte %016lx.\n", file, line, val); } void __pmd_error(const char *file, int line, unsigned long val) { - pr_crit("%s:%d: bad pmd %016lx.\n", file, line, val); + pr_err("%s:%d: bad pmd %016lx.\n", file, line, val); } void __pud_error(const char *file, int line, unsigned long val) { - pr_crit("%s:%d: bad pud %016lx.\n", file, line, val); + pr_err("%s:%d: bad pud %016lx.\n", file, line, val); } void __pgd_error(const char *file, int line, unsigned long val) { - pr_crit("%s:%d: bad pgd %016lx.\n", file, line, val); + pr_err("%s:%d: bad pgd %016lx.\n", file, line, val); } /* GENERIC_BUG traps */ From 606d0cab2091ee86fa8dc7c77a53a9e59e9fac05 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 5 Jan 2016 15:36:59 +0000 Subject: [PATCH 0360/3220] UPSTREAM: arm64: mm: move pgd_cache initialisation to pgtable_cache_init Initialising the support for EFI runtime services requires us to allocate a pgd off the back of an early_initcall. On systems where the PGD_SIZE is smaller than PAGE_SIZE (e.g. 64k pages and 48-bit VA), the pgd_cache isn't initialised at this stage, and we panic with a NULL dereference during boot: Unable to handle kernel NULL pointer dereference at virtual address 00000000 __create_mapping.isra.5+0x84/0x350 create_pgd_mapping+0x20/0x28 efi_create_mapping+0x5c/0x6c arm_enable_runtime_services+0x154/0x1e4 do_one_initcall+0x8c/0x190 kernel_init_freeable+0x84/0x1ec kernel_init+0x10/0xe0 ret_from_fork+0x10/0x50 This patch fixes the problem by initialising the pgd_cache earlier, in the pgtable_cache_init callback, which sounds suspiciously like what it was intended for.
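For context (an editorial sketch, not part of the patch): early_initcalls run before core_initcalls, so the old core_initcall(pgd_cache_init) had not yet run when the EFI code above allocated its pgd. Simplified boot ordering:

/*
 * start_kernel()
 *   mm_init() -> pgtable_init() -> pgtable_cache_init()
 *                                  (pgd_cache is now initialised here)
 *   ...
 *   kernel_init() -> kernel_init_freeable()
 *     do_pre_smp_initcalls()  // early_initcall: arm_enable_runtime_services()
 *                             // allocates a pgd
 *     do_basic_setup()        // core_initcall level: pgd_cache_init used to
 *                             // run only here, after the allocation above
 */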
Reported-by: Dennis Chen Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 39b5be9b4233a9f212b98242bddf008f379b5122) Signed-off-by: Jeff Vander Stoep Change-Id: I124f03d19299be93124af35641294bf73c13bb22 --- arch/arm64/include/asm/pgtable.h | 3 ++- arch/arm64/mm/pgd.c | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 51f382b7e5c7..69d2e2f86bce 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -680,7 +680,8 @@ extern int kern_addr_valid(unsigned long addr); #include -#define pgtable_cache_init() do { } while (0) +void pgd_cache_init(void); +#define pgtable_cache_init pgd_cache_init /* * On AArch64, the cache coherency is handled via the set_pte_at() function. diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index cb3ba1b812e7..ae11d4e03d0e 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -46,14 +46,14 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) kmem_cache_free(pgd_cache, pgd); } -static int __init pgd_cache_init(void) +void __init pgd_cache_init(void) { + if (PGD_SIZE == PAGE_SIZE) + return; + /* * Naturally aligned pgds required by the architecture. */ - if (PGD_SIZE != PAGE_SIZE) - pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_SIZE, - SLAB_PANIC, NULL); - return 0; + pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_SIZE, + SLAB_PANIC, NULL); } -core_initcall(pgd_cache_init); From d686cf4829c01ca92e8b10603b9250221b3bdfcb Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 5 Jan 2016 17:33:34 +0000 Subject: [PATCH 0361/3220] UPSTREAM: arm64: entry: remove pointless SPSR mode check In work_pending, we may skip work if the stacked SPSR value represents anything other than an EL0 context. We then immediately invoke the kernel_exit 0 macro as part of ret_to_user, assuming a return to EL0. This is somewhat confusing. We use work_pending as part of the ret_to_user/ret_fast_syscall state machine. We only use ret_fast_syscall in the return from an SVC issued from EL0. We use ret_to_user for return from EL0 exception handlers and also for return from ret_from_fork in the case the task was not a kernel thread (i.e. it is a user task). Thus in all cases the stacked SPSR value must represent an EL0 context, and the check is redundant. This patch removes it, along with the now unused no_work_pending label. Cc: Chris Metcalf Acked-by: Catalin Marinas Signed-off-by: Mark Rutland Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit ee03353bc04f8e460cc4e3da80d9721d9ecb89f1) Signed-off-by: Jeff Vander Stoep Change-Id: I2738149342ef469e8cde7c5f8f7c65daab93fb2b --- arch/arm64/kernel/entry.S | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index c0db321db7e1..1f7f5a2b61bf 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -676,10 +676,7 @@ ret_fast_syscall_trace: work_pending: tbnz x1, #TIF_NEED_RESCHED, work_resched /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */ - ldr x2, [sp, #S_PSTATE] mov x0, sp // 'regs' - tst x2, #PSR_MODE_MASK // user mode regs? 
- b.ne no_work_pending // returning to kernel enable_irq // enable interrupts for do_notify_resume() bl do_notify_resume b ret_to_user @@ -698,7 +695,6 @@ ret_to_user: and x2, x1, #_TIF_WORK_MASK cbnz x2, work_pending enable_step_tsk x1, x2 -no_work_pending: kernel_exit 0 ENDPROC(ret_to_user) From 3c31c6d7098894f0bf886ebe711fd930855626e5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 23 Dec 2015 10:29:28 +0100 Subject: [PATCH 0362/3220] UPSTREAM: efi: stub: define DISABLE_BRANCH_PROFILING for all architectures This moves the DISABLE_BRANCH_PROFILING define from the x86-specific to the general CFLAGS definition for the stub. This fixes build errors when building for arm64 with CONFIG_PROFILE_ALL_BRANCHES_ENABLED. Reviewed-by: Matt Fleming Reported-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit b523e185bba36164ca48a190f5468c140d815414) Signed-off-by: Jeff Vander Stoep Change-Id: Id5a2a3986adc60ddf3d247c038667ce9719f3ee0 --- drivers/firmware/efi/libstub/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 3c0467d3688c..c0ddd1b8dca3 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -8,7 +8,7 @@ cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small cflags-$(CONFIG_X86) += -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 \ -fPIC -fno-strict-aliasing -mno-red-zone \ - -mno-mmx -mno-sse -DDISABLE_BRANCH_PROFILING + -mno-mmx -mno-sse cflags-$(CONFIG_ARM64) := $(subst -pg,,$(KBUILD_CFLAGS)) cflags-$(CONFIG_ARM) := $(subst -pg,,$(KBUILD_CFLAGS)) \ @@ -16,7 +16,7 @@ cflags-$(CONFIG_ARM) := $(subst -pg,,$(KBUILD_CFLAGS)) \ cflags-$(CONFIG_EFI_ARMSTUB) += -I$(srctree)/scripts/dtc/libfdt -KBUILD_CFLAGS := $(cflags-y) \ +KBUILD_CFLAGS := $(cflags-y) -DDISABLE_BRANCH_PROFILING \ $(call cc-option,-ffreestanding) \ $(call cc-option,-fno-stack-protector) From a725004a74ab543510a8158b206c334756e9d8cf Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 6 Jan 2016 11:05:27 +0000 Subject: [PATCH 0363/3220] UPSTREAM: arm64: head.S: use memset to clear BSS Currently we use an open-coded memzero to clear the BSS. As it is a trivial implementation, it is sub-optimal. Our optimised memset doesn't use the stack, is position-independent, and for the memzero case can make use of DC ZVA to clear large blocks efficiently. In __mmap_switched the MMU is on and there are no live caller-saved registers, so we can safely call an uninstrumented memset. This patch changes __mmap_switched to use memset when clearing the BSS. We use the __pi_memset alias so as to avoid any instrumentation in all kernel configurations.
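The new assembly is equivalent to the following C sketch (illustrative only; the real code stays in assembly and calls the position-independent, uninstrumented __pi_memset alias, and 'clear_bss' is a made-up name for this note):

#include <linux/string.h>

extern char __bss_start[], __bss_stop[];	/* linker-provided bounds */

static void __init clear_bss(void)
{
	/* a single memset replaces the old compare/store-xzr loop */
	memset(__bss_start, 0, __bss_stop - __bss_start);
}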
Cc: Catalin Marinas Cc: Marc Zyngier Reviewed-by: Ard Biesheuvel Signed-off-by: Mark Rutland Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 2a803c4db615d85126c5c7afd5849a3cfde71422) Signed-off-by: Jeff Vander Stoep Change-Id: I3dc7050fe5566f2126cbea9abfa6063c8e6b029a --- arch/arm64/kernel/head.S | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 17ce7285bb12..917d98108b3f 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -415,14 +415,13 @@ ENDPROC(__create_page_tables) */ .set initial_sp, init_thread_union + THREAD_START_SP __mmap_switched: - adr_l x6, __bss_start - adr_l x7, __bss_stop + // Clear BSS + adr_l x0, __bss_start + mov x1, xzr + adr_l x2, __bss_stop + sub x2, x2, x0 + bl __pi_memset -1: cmp x6, x7 - b.hs 2f - str xzr, [x6], #8 // Clear BSS - b 1b -2: adr_l sp, initial_sp, x4 mov x4, sp and x4, x4, #~(THREAD_SIZE - 1) From 66a58725b83cdc0754f0329465f7169aa8417e3d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 15 Jan 2016 13:28:57 +0100 Subject: [PATCH 0364/3220] UPSTREAM: arm64: hide __efistub_ aliases from kallsyms Commit e8f3010f7326 ("arm64/efi: isolate EFI stub from the kernel proper") isolated the EFI stub code from the kernel proper by prefixing all of its symbols with __efistub_, and selectively allowing access to core kernel symbols from the stub by emitting __efistub_ aliases for functions and variables that the stub can access legally. As an unintended side effect, these aliases are emitted into the kallsyms symbol table, which means they may turn up in backtraces, e.g., ... PC is at __efistub_memset+0x108/0x200 LR is at fixup_init+0x3c/0x48 ... [] __efistub_memset+0x108/0x200 [] free_initmem+0x2c/0x40 [] kernel_init+0x20/0xe0 [] ret_from_fork+0x10/0x40 The backtrace in question has nothing to do with the EFI stub, but simply returns one of the several aliases of memset() that have been recorded in the kallsyms table. This is undesirable, since it may suggest to people who are not aware of this that the issue they are seeing is somehow EFI related. So hide the __efistub_ aliases from kallsyms, by emitting them as absolute linker symbols explicitly. The distinction between those and section relative symbols is completely irrelevant to these definitions, and to the final link we are performing when these definitions are being taken into account (the distinction is only relevant to symbols defined inside a section definition when performing a partial link), and so the resulting values are identical to the original ones. Since absolute symbols are ignored by kallsyms, this will result in these values to be omitted from its symbol table. After this patch, the backtrace generated from the same address looks like this: ... PC is at __memset+0x108/0x200 LR is at fixup_init+0x3c/0x48 ... 
[] __memset+0x108/0x200 [] free_initmem+0x2c/0x40 [] kernel_init+0x20/0xe0 [] ret_from_fork+0x10/0x40 Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 75feee3d9d51775072d3a04f47d4a439a4c4590e) Signed-off-by: Jeff Vander Stoep Change-Id: Ieeb8c576e31f0dd0b7f82982ffa6362864eed311 --- arch/arm64/kernel/image.h | 40 ++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index bc2abb8b1599..999633bd7294 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -64,6 +64,16 @@ #ifdef CONFIG_EFI +/* + * Prevent the symbol aliases below from being emitted into the kallsyms + * table, by forcing them to be absolute symbols (which are conveniently + * ignored by scripts/kallsyms) rather than section relative symbols. + * The distinction is only relevant for partial linking, and only for symbols + * that are defined within a section declaration (which is not the case for + * the definitions below) so the resulting values will be identical. + */ +#define KALLSYMS_HIDE(sym) ABSOLUTE(sym) + /* * The EFI stub has its own symbol namespace prefixed by __efistub_, to * isolate it from the kernel proper. The following symbols are legally @@ -73,25 +83,25 @@ * linked at. The routines below are all implemented in assembler in a * position independent manner */ -__efistub_memcmp = __pi_memcmp; -__efistub_memchr = __pi_memchr; -__efistub_memcpy = __pi_memcpy; -__efistub_memmove = __pi_memmove; -__efistub_memset = __pi_memset; -__efistub_strlen = __pi_strlen; -__efistub_strcmp = __pi_strcmp; -__efistub_strncmp = __pi_strncmp; -__efistub___flush_dcache_area = __pi___flush_dcache_area; +__efistub_memcmp = KALLSYMS_HIDE(__pi_memcmp); +__efistub_memchr = KALLSYMS_HIDE(__pi_memchr); +__efistub_memcpy = KALLSYMS_HIDE(__pi_memcpy); +__efistub_memmove = KALLSYMS_HIDE(__pi_memmove); +__efistub_memset = KALLSYMS_HIDE(__pi_memset); +__efistub_strlen = KALLSYMS_HIDE(__pi_strlen); +__efistub_strcmp = KALLSYMS_HIDE(__pi_strcmp); +__efistub_strncmp = KALLSYMS_HIDE(__pi_strncmp); +__efistub___flush_dcache_area = KALLSYMS_HIDE(__pi___flush_dcache_area); #ifdef CONFIG_KASAN -__efistub___memcpy = __pi_memcpy; -__efistub___memmove = __pi_memmove; -__efistub___memset = __pi_memset; +__efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy); +__efistub___memmove = KALLSYMS_HIDE(__pi_memmove); +__efistub___memset = KALLSYMS_HIDE(__pi_memset); #endif -__efistub__text = _text; -__efistub__end = _end; -__efistub__edata = _edata; +__efistub__text = KALLSYMS_HIDE(_text); +__efistub__end = KALLSYMS_HIDE(_end); +__efistub__edata = KALLSYMS_HIDE(_edata); #endif From a84dda4e8627f7b6a91a5c2cd17e4cd059ac9cbf Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 15 Jan 2016 16:55:33 -0800 Subject: [PATCH 0365/3220] UPSTREAM: arch/arm/include/asm/pgtable-3level.h: add pmd_mkclean for THP MADV_FREE needs pmd_dirty and pmd_mkclean for detecting recent overwrite of the contents since MADV_FREE syscall is called for THP page. This patch adds pmd_mkclean for THP page MADV_FREE support. Signed-off-by: Minchan Kim Cc: Catalin Marinas Cc: Will Deacon Cc: Russell King Cc: "James E.J. Bottomley" Cc: "Kirill A. Shutemov" Cc: Shaohua Li Cc: Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Chen Gang Cc: Chris Zankel Cc: Daniel Micay Cc: Darrick J. Wong Cc: David S. 
Miller Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: Jason Evans Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: Kirill A. Shutemov Cc: Matt Turner Cc: Max Filippov Cc: Mel Gorman Cc: Michael Kerrisk Cc: Michal Hocko Cc: Mika Penttil Cc: Ralf Baechle Cc: Richard Henderson Cc: Rik van Riel Cc: Roland Dreier Cc: Shaohua Li Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 44842045e4baaf406db2954dd2e07152fa61528d) Signed-off-by: Jeff Vander Stoep Change-Id: I59d53667aa8c40dea4f18fc58acc7d27f4a85a04 --- arch/arm/include/asm/pgtable-3level.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index a745a2a53853..6d6012a320b2 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -249,6 +249,7 @@ PMD_BIT_FUNC(mkold, &= ~PMD_SECT_AF); PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING); PMD_BIT_FUNC(mkwrite, &= ~L_PMD_SECT_RDONLY); PMD_BIT_FUNC(mkdirty, |= L_PMD_SECT_DIRTY); +PMD_BIT_FUNC(mkclean, &= ~L_PMD_SECT_DIRTY); PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF); #define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT)) From fd89b55b8391791a0cd49258a8922b4982d19f47 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jan 2016 14:50:21 +0100 Subject: [PATCH 0366/3220] UPSTREAM: arm64: kasan: ensure that the KASAN zero page is mapped read-only When switching from the early KASAN shadow region, which maps the entire shadow space read-write, to the permanent KASAN shadow region, which uses a zero page to shadow regions that are not subject to instrumentation, the lowest level table kasan_zero_pte[] may be reused unmodified, which means that the mappings of the zero page that it contains will still be read-write. So update it explicitly to map the zero page read only when we activate the permanent mapping. Acked-by: Andrey Ryabinin Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 7b1af9795773d745c2a8c7d4ca5f2936e8b6adfb) Signed-off-by: Jeff Vander Stoep Change-Id: I4745dc91236c2612ad3e13be1cc176ffd923f7da --- arch/arm64/mm/kasan_init.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index cf038c7d9fa9..cab7a5be40aa 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -120,6 +120,7 @@ static void __init cpu_set_ttbr1(unsigned long ttbr1) void __init kasan_init(void) { struct memblock_region *reg; + int i; /* * We are going to perform proper setup of shadow memory. @@ -155,6 +156,14 @@ void __init kasan_init(void) pfn_to_nid(virt_to_pfn(start))); } + /* + * KAsan may reuse the contents of kasan_zero_pte directly, so we + * should make sure that it maps the zero page read-only. + */ + for (i = 0; i < PTRS_PER_PTE; i++) + set_pte(&kasan_zero_pte[i], + pfn_pte(virt_to_pfn(kasan_zero_page), PAGE_KERNEL_RO)); + memset(kasan_zero_page, 0, PAGE_SIZE); cpu_set_ttbr1(__pa(swapper_pg_dir)); flush_tlb_all(); From da4712caec8748615aa17eeb9d6c3cd3b0f4f910 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sun, 24 Jan 2016 15:24:12 +0900 Subject: [PATCH 0367/3220] UPSTREAM: arm64: Fix an enum typo in mm/dump.c This patch fixes a typo in mm/dump.c: "MODUELS_END_NR" should be "MODULES_END_NR". 
Signed-off-by: Masanari Iida Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit b3122023df935cf14bf951da98ca598d71b9f826) Signed-off-by: Jeff Vander Stoep Change-Id: I5e73d18fd70f20f200a974f5f2ba22cb7ae64952 --- arch/arm64/mm/dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 5a22a119a74c..0adbebbc2803 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -46,7 +46,7 @@ enum address_markers_idx { PCI_START_NR, PCI_END_NR, MODULES_START_NR, - MODUELS_END_NR, + MODULES_END_NR, KERNEL_SPACE_NR, }; From 720345213019ec25bc43f6b78592f4c39e2c1731 Mon Sep 17 00:00:00 2001 From: William Cohen Date: Thu, 21 Jan 2016 22:56:26 -0500 Subject: [PATCH 0368/3220] BACKPORT: Eliminate the .eh_frame sections from the aarch64 vmlinux and kernel modules By default the aarch64 gcc generates .eh_frame sections. Unlike .debug_frame sections, the .eh_frame sections are loaded into memory when the associated code is loaded. On an example kernel being built with this default the .eh_frame section in vmlinux used an extra 1.7MB of memory. The x86 disables the creation of the .eh_frame section. The aarch64 should probably do the same to save some memory. Signed-off-by: William Cohen Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 728dabd6d1751cf5e0f8e0535891393da62396e9) Signed-off-by: Jeff Vander Stoep Change-Id: I697baae2209d6d11f2cc447459d935f7200eb7b1 --- arch/arm64/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 4e77fd165f38..693a4dc6220c 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -28,6 +28,7 @@ endif KBUILD_CFLAGS += -mgeneral-regs-only $(lseinstr) KBUILD_CFLAGS += -fno-pic +KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_AFLAGS += $(lseinstr) ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) From 81d07628d337df8070830a82fbca2f52015e0ebd Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:55 +0000 Subject: [PATCH 0369/3220] UPSTREAM: asm-generic: Fix local variable shadow in __set_fixmap_offset Currently __set_fixmap_offset is a macro function which has a local variable called 'addr'. If a caller passes a 'phys' parameter which is derived from a variable also called 'addr', the local variable will shadow this, and the compiler will complain about the use of an uninitialized variable. To avoid the issue with namespace clashes, 'addr' is prefixed with a liberal sprinkling of underscores. Turning __set_fixmap_offset into a static inline breaks the build for several architectures. Fixing this properly requires updates to a number of architectures to make them agree on the prototype of __set_fixmap (it could be done as a subsequent patch series). 
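The bug class being avoided, reduced to a self-contained example (illustrative only; BAD_DOUBLE is made up for this note). A statement-expression local that shares its name with a caller's variable captures the macro argument itself:

#define BAD_DOUBLE(x)	({ int tmp = (x); tmp + tmp; })

int tmp = 3;
int r = BAD_DOUBLE(tmp);	/* expands to 'int tmp = (tmp);', so the inner
				 * declaration shadows the caller's 'tmp', the
				 * local is initialised from itself, and 'r'
				 * reads garbage */

In __set_fixmap_offset the clash was on 'addr', producing the uninitialised-variable warning described above; the underscore prefix makes such a collision implausible.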
Signed-off-by: Mark Rutland Cc: Arnd Bergmann [catalin.marinas@arm.com: squashed the original function patch and macro fixup] Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 3694bd76781b76c4f8d2ecd85018feeb1609f0e5) Signed-off-by: Jeff Vander Stoep Change-Id: Iec27cb36dfca39e333de9f7318e76da0670d0156 --- include/asm-generic/fixmap.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/asm-generic/fixmap.h b/include/asm-generic/fixmap.h index 1cbb8338edf3..827e4d3bbc7a 100644 --- a/include/asm-generic/fixmap.h +++ b/include/asm-generic/fixmap.h @@ -70,12 +70,12 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr) #endif /* Return a pointer with offset calculated */ -#define __set_fixmap_offset(idx, phys, flags) \ -({ \ - unsigned long addr; \ - __set_fixmap(idx, phys, flags); \ - addr = fix_to_virt(idx) + ((phys) & (PAGE_SIZE - 1)); \ - addr; \ +#define __set_fixmap_offset(idx, phys, flags) \ +({ \ + unsigned long ________addr; \ + __set_fixmap(idx, phys, flags); \ + ________addr = fix_to_virt(idx) + ((phys) & (PAGE_SIZE - 1)); \ + ________addr; \ }) #define set_fixmap_offset(idx, phys) \ From 0c8db8d0d4c615dd53b703fd632d4481e2bb18d6 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:56 +0000 Subject: [PATCH 0370/3220] UPSTREAM: arm64: mm: specialise pagetable allocators We pass a size parameter to early_alloc and late_alloc, but these are only ever used to allocate single pages. In late_alloc we always allocate a single page. Both allocators provide us with zeroed pages (such that all entries are invalid), but we have no barriers between allocating a page and adding that page to existing (live) tables. A concurrent page table walk may see stale data, leading to a number of issues. This patch specialises the two allocators for page tables. The size parameter is removed and the necessary dsb(ishst) is folded into each. To make it clear that the functions are intended for use for page table allocation, they are renamed to {early,late}_pgtable_alloc, with the related function pointer renamed to pgtable_alloc. As the dsb(ishst) is now in the allocator, the existing barrier for the zero page is redundant and thus is removed. The previously missing include of barrier.h is added.
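The hazard that the folded-in dsb(ishst) closes, sketched in C (illustrative fragment, not part of the patch):

pte_t *pte = early_pgtable_alloc();	/* page zeroed by CPU stores */
/*
 * Without the dsb(ishst) now inside the allocator, the store below could
 * become visible to the hardware table walker before the zeroing stores,
 * letting a concurrent walk read stale "entries" from the new table.
 */
__pmd_populate(pmd, __pa(pte), PMD_TYPE_TABLE);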
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 21ab99c289d350f4ae454bc069870009db6df20e) Signed-off-by: Jeff Vander Stoep Change-Id: Ib88fafc146506943122aa1ca6ee5a6c331ddb26c --- arch/arm64/mm/mmu.c | 49 +++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 7b6e58a58bf3..ad8c076d0aef 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -62,15 +63,18 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, } EXPORT_SYMBOL(phys_mem_access_prot); -static void __init *early_alloc(unsigned long sz) +static void __init *early_pgtable_alloc(void) { phys_addr_t phys; void *ptr; - phys = memblock_alloc(sz, sz); + phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE); BUG_ON(!phys); ptr = __va(phys); - memset(ptr, 0, sz); + memset(ptr, 0, PAGE_SIZE); + + /* Ensure the zeroed page is visible to the page table walker */ + dsb(ishst); return ptr; } @@ -95,12 +99,12 @@ static void split_pmd(pmd_t *pmd, pte_t *pte) static void alloc_init_pte(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot, - void *(*alloc)(unsigned long size)) + void *(*pgtable_alloc)(void)) { pte_t *pte; if (pmd_none(*pmd) || pmd_sect(*pmd)) { - pte = alloc(PTRS_PER_PTE * sizeof(pte_t)); + pte = pgtable_alloc(); if (pmd_sect(*pmd)) split_pmd(pmd, pte); __pmd_populate(pmd, __pa(pte), PMD_TYPE_TABLE); @@ -130,7 +134,7 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd) static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - void *(*alloc)(unsigned long size)) + void *(*pgtable_alloc)(void)) { pmd_t *pmd; unsigned long next; @@ -139,7 +143,7 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, * Check for initial section mappings in the pgd/pud and remove them. 
*/ if (pud_none(*pud) || pud_sect(*pud)) { - pmd = alloc(PTRS_PER_PMD * sizeof(pmd_t)); + pmd = pgtable_alloc(); if (pud_sect(*pud)) { /* * need to have the 1G of mappings continue to be @@ -174,7 +178,7 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, } } else { alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys), - prot, alloc); + prot, pgtable_alloc); } phys += next - addr; } while (pmd++, addr = next, addr != end); @@ -195,13 +199,13 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - void *(*alloc)(unsigned long size)) + void *(*pgtable_alloc)(void)) { pud_t *pud; unsigned long next; if (pgd_none(*pgd)) { - pud = alloc(PTRS_PER_PUD * sizeof(pud_t)); + pud = pgtable_alloc(); pgd_populate(mm, pgd, pud); } BUG_ON(pgd_bad(*pgd)); @@ -234,7 +238,8 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, } } } else { - alloc_init_pmd(mm, pud, addr, next, phys, prot, alloc); + alloc_init_pmd(mm, pud, addr, next, phys, prot, + pgtable_alloc); } phys += next - addr; } while (pud++, addr = next, addr != end); @@ -247,7 +252,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - void *(*alloc)(unsigned long size)) + void *(*pgtable_alloc)(void)) { unsigned long addr, length, end, next; @@ -265,18 +270,18 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, end = addr + length; do { next = pgd_addr_end(addr, end); - alloc_init_pud(mm, pgd, addr, next, phys, prot, alloc); + alloc_init_pud(mm, pgd, addr, next, phys, prot, pgtable_alloc); phys += next - addr; } while (pgd++, addr = next, addr != end); } -static void *late_alloc(unsigned long size) +static void *late_pgtable_alloc(void) { - void *ptr; - - BUG_ON(size > PAGE_SIZE); - ptr = (void *)__get_free_page(PGALLOC_GFP); + void *ptr = (void *)__get_free_page(PGALLOC_GFP); BUG_ON(!ptr); + + /* Ensure the zeroed page is visible to the page table walker */ + dsb(ishst); return ptr; } @@ -289,7 +294,7 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, return; } __create_mapping(&init_mm, pgd_offset_k(virt), phys, virt, - size, prot, early_alloc); + size, prot, early_pgtable_alloc); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, @@ -297,7 +302,7 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, pgprot_t prot) { __create_mapping(mm, pgd_offset(mm, virt), phys, virt, size, prot, - late_alloc); + late_pgtable_alloc); } static void create_mapping_late(phys_addr_t phys, unsigned long virt, @@ -310,7 +315,7 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, } return __create_mapping(&init_mm, pgd_offset_k(virt), - phys, virt, size, prot, late_alloc); + phys, virt, size, prot, late_pgtable_alloc); } #ifdef CONFIG_DEBUG_RODATA @@ -457,7 +462,7 @@ void __init paging_init(void) fixup_executable(); /* allocate the zero page. */ - zero_page = early_alloc(PAGE_SIZE); + zero_page = early_pgtable_alloc(); bootmem_init(); From 2563a08010f2a945a39c19853297ed980c8d469f Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:57 +0000 Subject: [PATCH 0371/3220] UPSTREAM: arm64: mm: place empty_zero_page in bss Currently the zero page is set up in paging_init, and thus we cannot use the zero page earlier. 
We use the zero page as a reserved TTBR value from which no TLB entries may be allocated (e.g. when uninstalling the idmap). To enable such usage earlier (as may be required for invasive changes to the kernel page tables), and to minimise the time that the idmap is active, we need to be able to use the zero page before paging_init. This patch follows the example set by x86, by allocating the zero page at compile time, in .bss. This means that the zero page itself is available immediately upon entry to start_kernel (as we zero .bss before this), and also means that the zero page takes up no space in the raw Image binary. The associated struct page is allocated in bootmem_init, and remains unavailable until this time. Outside of arch code, the only users of empty_zero_page assume that the empty_zero_page symbol refers to the zeroed memory itself, and that ZERO_PAGE(x) must be used to acquire the associated struct page, following the example of x86. This patch also brings arm64 inline with these assumptions. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 5227cfa71f9e8574373f4d0e9e754942d76cdf67) Signed-off-by: Jeff Vander Stoep Change-Id: I43005dd8533d9968d2cef48bd2821d4507cae5ad --- arch/arm64/include/asm/mmu_context.h | 2 +- arch/arm64/include/asm/pgtable.h | 4 ++-- arch/arm64/kernel/head.S | 1 + arch/arm64/mm/mmu.c | 9 +-------- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 24165784b803..600eacb9f7d5 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -48,7 +48,7 @@ static inline void contextidr_thread_switch(struct task_struct *next) */ static inline void cpu_set_reserved_ttbr0(void) { - unsigned long ttbr = page_to_phys(empty_zero_page); + unsigned long ttbr = virt_to_phys(empty_zero_page); asm( " msr ttbr0_el1, %0 // set TTBR0\n" diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 69d2e2f86bce..e11dc90fbf56 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -121,8 +121,8 @@ extern void __pgd_error(const char *file, int line, unsigned long val); * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ -extern struct page *empty_zero_page; -#define ZERO_PAGE(vaddr) (empty_zero_page) +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; +#define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page) #define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte)) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 917d98108b3f..53b9f9f128c2 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -421,6 +421,7 @@ __mmap_switched: adr_l x2, __bss_stop sub x2, x2, x0 bl __pi_memset + dsb ishst // Make zero page visible to PTW adr_l sp, initial_sp, x4 mov x4, sp diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ad8c076d0aef..acf0442b9759 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -49,7 +49,7 @@ u64 idmap_t0sz = TCR_T0SZ(VA_BITS); * Empty_zero_page is a special page that is used for zero-initialized data * and COW. 
*/ -struct page *empty_zero_page; +unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, @@ -456,18 +456,11 @@ void fixup_init(void) */ void __init paging_init(void) { - void *zero_page; - map_mem(); fixup_executable(); - /* allocate the zero page. */ - zero_page = early_pgtable_alloc(); - bootmem_init(); - empty_zero_page = virt_to_page(zero_page); - /* * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. From c66ef5f947144ca420c7483226e3ac64ad8fd813 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:58 +0000 Subject: [PATCH 0372/3220] UPSTREAM: arm64: unify idmap removal We currently open-code the removal of the idmap and restoration of the current task's MMU state in a few places. Before introducing yet more copies of this sequence, unify these to call a new helper, cpu_uninstall_idmap. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Lorenzo Pieralisi Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 9e8e865bbe294a69666a1996bda3e87825b258c0) Signed-off-by: Jeff Vander Stoep Change-Id: I6e9cb0253a1d2d63232f8fa0b3f39f8f6987b239 --- arch/arm64/include/asm/mmu_context.h | 25 +++++++++++++++++++++++++ arch/arm64/kernel/setup.c | 1 + arch/arm64/kernel/smp.c | 4 +--- arch/arm64/kernel/suspend.c | 20 ++++---------------- arch/arm64/mm/mmu.c | 4 +--- 5 files changed, 32 insertions(+), 22 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 600eacb9f7d5..b1b2514d8883 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -27,6 +27,7 @@ #include #include #include +#include #ifdef CONFIG_PID_IN_CONTEXTIDR static inline void contextidr_thread_switch(struct task_struct *next) @@ -89,6 +90,30 @@ static inline void cpu_set_default_tcr_t0sz(void) : "r"(TCR_T0SZ(VA_BITS)), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH)); } +/* + * Remove the idmap from TTBR0_EL1 and install the pgd of the active mm. + * + * The idmap lives in the same VA range as userspace, but uses global entries + * and may use a different TCR_EL1.T0SZ. To avoid issues resulting from + * speculative TLB fetches, we must temporarily install the reserved page + * tables while we invalidate the TLBs and set up the correct TCR_EL1.T0SZ. + * + * If current is a not a user task, the mm covers the TTBR1_EL1 page tables, + * which should not be installed in TTBR0_EL1. In this case we can leave the + * reserved page tables in place. 
+ */ +static inline void cpu_uninstall_idmap(void) +{ + struct mm_struct *mm = current->active_mm; + + cpu_set_reserved_ttbr0(); + local_flush_tlb_all(); + cpu_set_default_tcr_t0sz(); + + if (mm != &init_mm) + cpu_switch_mm(mm->pgd, mm); +} + /* * It would be nice to return ASIDs back to the allocator, but unfortunately * that introduces a race with a generation rollover where we could erroneously diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 7f0e7226795e..de8e187ec8c6 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -62,6 +62,7 @@ #include #include #include +#include phys_addr_t __fdt_pointer __initdata; diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index b1adc51b2c2e..68e7f79630d4 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -149,9 +149,7 @@ asmlinkage void secondary_start_kernel(void) * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. */ - cpu_set_reserved_ttbr0(); - local_flush_tlb_all(); - cpu_set_default_tcr_t0sz(); + cpu_uninstall_idmap(); preempt_disable(); trace_hardirqs_off(); diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 1095aa483a1c..66055392f445 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -60,7 +60,6 @@ void __init cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *)) */ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) { - struct mm_struct *mm = current->active_mm; int ret; unsigned long flags; @@ -87,22 +86,11 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) ret = __cpu_suspend_enter(arg, fn); if (ret == 0) { /* - * We are resuming from reset with TTBR0_EL1 set to the - * idmap to enable the MMU; set the TTBR0 to the reserved - * page tables to prevent speculative TLB allocations, flush - * the local tlb and set the default tcr_el1.t0sz so that - * the TTBR0 address space set-up is properly restored. - * If the current active_mm != &init_mm we entered cpu_suspend - * with mappings in TTBR0 that must be restored, so we switch - * them back to complete the address space configuration - * restoration before returning. + * We are resuming from reset with the idmap active in TTBR0_EL1. + * We must uninstall the idmap and restore the expected MMU + * state before we can possibly return to userspace. */ - cpu_set_reserved_ttbr0(); - local_flush_tlb_all(); - cpu_set_default_tcr_t0sz(); - - if (mm != &init_mm) - cpu_switch_mm(mm->pgd, mm); + cpu_uninstall_idmap(); /* * Restore per-cpu offset before any kernel diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index acf0442b9759..b00462172705 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -465,9 +465,7 @@ void __init paging_init(void) * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. */ - cpu_set_reserved_ttbr0(); - local_flush_tlb_all(); - cpu_set_default_tcr_t0sz(); + cpu_uninstall_idmap(); } /* From 8ac5abfecf99f0c1c065c83a0ff7481d29173e03 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:59 +0000 Subject: [PATCH 0373/3220] UPSTREAM: arm64: unmap idmap earlier During boot we leave the idmap in place until paging_init, as we previously had to wait for the zero page to become allocated and accessible. 
Now that we have a statically-allocated zero page, we can uninstall the idmap much earlier in the boot process, making it far easier to spot accidental use of physical addresses. This also brings the cold boot path in line with the secondary boot path. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 86ccce896cb0aa800a7a6dcd29b41ffc4eeb1a75) Signed-off-by: Jeff Vander Stoep Change-Id: I6375bd9855e45727790697875b7cd19f84a4dd7f --- arch/arm64/kernel/setup.c | 6 ++++++ arch/arm64/mm/mmu.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index de8e187ec8c6..079db3ba21c4 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -314,6 +314,12 @@ void __init setup_arch(char **cmdline_p) */ local_async_enable(); + /* + * TTBR0 is only used for the identity mapping at this stage. Make it + * point to zero page to avoid speculatively fetching new entries. + */ + cpu_uninstall_idmap(); + efi_init(); arm64_memblock_init(); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index b00462172705..c0a4160331f8 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -460,12 +460,6 @@ void __init paging_init(void) fixup_executable(); bootmem_init(); - - /* - * TTBR0 is only used for the identity mapping at this stage. Make it - * point to zero page to avoid speculatively fetching new entries. - */ - cpu_uninstall_idmap(); } /* From 4ba7b986cf3bbc2e3e66f2a7805356a30f3b203c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:00 +0000 Subject: [PATCH 0374/3220] UPSTREAM: arm64: add function to install the idmap In some cases (e.g. when making invasive changes to the kernel page tables) we will need to execute code from the idmap. Add a new helper which may be used to install the idmap, complementing the existing cpu_uninstall_idmap. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 609116d202a8c5fd3fe393eb85373cbee906df68) Signed-off-by: Jeff Vander Stoep Change-Id: I1aac9e85e3d49add72c8aab7d80777a39f5fdd8e --- arch/arm64/include/asm/mmu_context.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index b1b2514d8883..944f2730a940 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -74,7 +74,7 @@ static inline bool __cpu_uses_extended_idmap(void) /* * Set TCR.T0SZ to its default value (based on VA_BITS) */ -static inline void cpu_set_default_tcr_t0sz(void) +static inline void __cpu_set_tcr_t0sz(unsigned long t0sz) { unsigned long tcr; @@ -87,9 +87,12 @@ static inline void cpu_set_default_tcr_t0sz(void) " msr tcr_el1, %0 ;" " isb" : "=&r" (tcr) - : "r"(TCR_T0SZ(VA_BITS)), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH)); + : "r"(t0sz), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH)); } +#define cpu_set_default_tcr_t0sz() __cpu_set_tcr_t0sz(TCR_T0SZ(VA_BITS)) +#define cpu_set_idmap_tcr_t0sz() __cpu_set_tcr_t0sz(idmap_t0sz) + /* * Remove the idmap from TTBR0_EL1 and install the pgd of the active mm. 
* @@ -114,6 +117,15 @@ static inline void cpu_uninstall_idmap(void) cpu_switch_mm(mm->pgd, mm); } +static inline void cpu_install_idmap(void) +{ + cpu_set_reserved_ttbr0(); + local_flush_tlb_all(); + cpu_set_idmap_tcr_t0sz(); + + cpu_switch_mm(idmap_pg_dir, &init_mm); +} + /* * It would be nice to return ASIDs back to the allocator, but unfortunately * that introduces a race with a generation rollover where we could erroneously From 214ae8984d113452991df67aa2820a28755219c3 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:01 +0000 Subject: [PATCH 0375/3220] UPSTREAM: arm64: mm: add code to safely replace TTBR1_EL1 If page tables are modified without suitable TLB maintenance, the ARM architecture permits multiple TLB entries to be allocated for the same VA. When this occurs, it is permitted that TLB conflict aborts are raised in response to synchronous data/instruction accesses, and/or an amalgamation of the TLB entries may be used as a result of a TLB lookup. The presence of conflicting TLB entries may result in a variety of behaviours detrimental to the system (e.g. erroneous physical addresses may be used by I-cache fetches and/or page table walks). Some of these cases may result in unexpected changes of hardware state, and/or result in the (asynchronous) delivery of SError. To avoid these issues, we must avoid situations where conflicting entries may be allocated into TLBs. For user and module mappings we can follow a strict break-before-make approach, but this cannot work for modifications to the swapper page tables that cover the kernel text and data. Instead, this patch adds code which is intended to be executed from the idmap, which can safely unmap the swapper page tables as it only requires the idmap to be active. This enables us to uninstall the active TTBR1_EL1 entry, invalidate TLBs, then install a new TTBR1_EL1 entry without potentially unmapping code or data required for the sequence. This avoids the risk of conflict, but requires that updates are staged in a copy of the swapper page tables prior to being installed. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 50e1881ddde2a986c7d0d2150985239e5e3d7d96) Signed-off-by: Jeff Vander Stoep Change-Id: Ibd0a7c802a1abe0ba08a99819fd62ac646186005 --- arch/arm64/include/asm/mmu_context.h | 19 +++++++++++++++++++ arch/arm64/mm/proc.S | 28 ++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 944f2730a940..a00f7cf35bbd 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -126,6 +126,25 @@ static inline void cpu_install_idmap(void) cpu_switch_mm(idmap_pg_dir, &init_mm); } +/* + * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD, + * avoiding the possibility of conflicting TLB entries being allocated.
+ */ +static inline void cpu_replace_ttbr1(pgd_t *pgd) +{ + typedef void (ttbr_replace_func)(phys_addr_t); + extern ttbr_replace_func idmap_cpu_replace_ttbr1; + ttbr_replace_func *replace_phys; + + phys_addr_t pgd_phys = virt_to_phys(pgd); + + replace_phys = (void *)virt_to_phys(idmap_cpu_replace_ttbr1); + + cpu_install_idmap(); + replace_phys(pgd_phys); + cpu_uninstall_idmap(); +} + /* * It would be nice to return ASIDs back to the allocator, but unfortunately * that introduces a race with a generation rollover where we could erroneously diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index c164d2cb35c0..0c19534a901e 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -140,6 +140,34 @@ ENTRY(cpu_do_switch_mm) ret ENDPROC(cpu_do_switch_mm) + .pushsection ".idmap.text", "ax" +/* + * void idmap_cpu_replace_ttbr1(phys_addr_t new_pgd) + * + * This is the low-level counterpart to cpu_replace_ttbr1, and should not be + * called by anything else. It can only be executed from a TTBR0 mapping. + */ +ENTRY(idmap_cpu_replace_ttbr1) + mrs x2, daif + msr daifset, #0xf + + adrp x1, empty_zero_page + msr ttbr1_el1, x1 + isb + + tlbi vmalle1 + dsb nsh + isb + + msr ttbr1_el1, x0 + isb + + msr daif, x2 + + ret +ENDPROC(idmap_cpu_replace_ttbr1) + .popsection + /* * __cpu_setup * From 0991212c3218e24c047ce32fa462931a5a53024e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:02 +0000 Subject: [PATCH 0376/3220] UPSTREAM: arm64: kasan: avoid TLB conflicts The page table modification performed during the KASAN init risks the allocation of conflicting TLB entries, as it swaps a set of valid global entries for another without suitable TLB maintenance. The presence of conflicting TLB entries can result in the delivery of synchronous TLB conflict aborts, or may result in the use of erroneous data being returned in response to a TLB lookup. This can affect explicit data accesses from software as well as translations performed asynchronously (e.g. as part of page table walks or speculative I-cache fetches), and can therefore result in a wide variety of problems. To avoid this, use cpu_replace_ttbr1 to swap the page tables. This ensures that when the new tables are installed there are no stale entries from the old tables which may conflict. As all updates are made to the tables while they are not active, the updates themselves are safe. At the same time, add the missing barrier to ensure that the tmp_pg_dir entries updated via memcpy are visible to the page table walkers at the point the tmp_pg_dir is installed. All other page table updates made as part of KASAN initialisation have the requisite barriers due to the use of the standard page table accessors. 
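Condensed, the kasan_init() sequence after this patch is (a sketch of the diff below):

memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir));
dsb(ishst);				/* publish the copied entries */
cpu_replace_ttbr1(tmp_pg_dir);		/* conflict-free switch via the idmap */

/* ... tear down and rebuild the shadow in swapper_pg_dir ... */

cpu_replace_ttbr1(swapper_pg_dir);	/* switch back the same safe way */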
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Cc: Andrey Ryabinin Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit c1a88e9124a499939ebd8069d5e4d3937f019157) Signed-off-by: Jeff Vander Stoep Change-Id: I0934b236e25e015d6946dcedcca14ba9512418f7 --- arch/arm64/mm/kasan_init.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index cab7a5be40aa..263b59020500 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -108,15 +109,6 @@ static void __init clear_pgds(unsigned long start, set_pgd(pgd_offset_k(start), __pgd(0)); } -static void __init cpu_set_ttbr1(unsigned long ttbr1) -{ - asm( - " msr ttbr1_el1, %0\n" - " isb" - : - : "r" (ttbr1)); -} - void __init kasan_init(void) { struct memblock_region *reg; @@ -130,8 +122,8 @@ void __init kasan_init(void) * setup will be finished. */ memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir)); - cpu_set_ttbr1(__pa(tmp_pg_dir)); - flush_tlb_all(); + dsb(ishst); + cpu_replace_ttbr1(tmp_pg_dir); clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); @@ -165,8 +157,7 @@ void __init kasan_init(void) pfn_pte(virt_to_pfn(kasan_zero_page), PAGE_KERNEL_RO)); memset(kasan_zero_page, 0, PAGE_SIZE); - cpu_set_ttbr1(__pa(swapper_pg_dir)); - flush_tlb_all(); + cpu_replace_ttbr1(swapper_pg_dir); /* At this point kasan is fully initialized. Enable error messages */ init_task.kasan_depth = 0; From ded0130fa93036b518b16b5af7ef7c318b67e3e4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:03 +0000 Subject: [PATCH 0377/3220] UPSTREAM: arm64: mm: move pte_* macros For pmd, pud, and pgd levels of table, functions including p?d_index and p?d_offset are defined after the p?d_page_vaddr function for the immediately higher level of table. The pte functions however are defined much earlier, even though several rely on the later definition of pmd_page_vaddr. While this isn't currently a problem as these are macros, it prevents the logical grouping of later C functions (which cannot rely on prototypes for functions not yet defined). Move these definitions after pmd_page_vaddr, for consistency with the placement of these functions for other levels of table. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 053520f7d3923cc6d37afb28f9887cb1e7d77454) Signed-off-by: Jeff Vander Stoep Change-Id: I44c7327dc0f257188c0c2204ec7da0e7fdca7f64 --- arch/arm64/include/asm/pgtable.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index e11dc90fbf56..1d55520c3561 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -134,16 +134,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define pte_clear(mm,addr,ptep) set_pte(ptep, __pte(0)) #define pte_page(pte) (pfn_to_page(pte_pfn(pte))) -/* Find an entry in the third-level page table. 
*/ -#define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) - -#define pte_offset_kernel(dir,addr) (pmd_page_vaddr(*(dir)) + pte_index(addr)) - -#define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) - /* * The following only work if pte_present(). Undefined behaviour otherwise. */ @@ -445,6 +435,16 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd) return __va(pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK); } +/* Find an entry in the third-level page table. */ +#define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + +#define pte_offset_kernel(dir,addr) (pmd_page_vaddr(*(dir)) + pte_index(addr)) + +#define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) +#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) +#define pte_unmap(pte) do { } while (0) +#define pte_unmap_nested(pte) do { } while (0) + #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) /* From 4d41b2a8a8412b07033ddcd56cc0b31b38889cdb Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:04 +0000 Subject: [PATCH 0378/3220] UPSTREAM: arm64: mm: add functions to walk page tables by PA To allow us to walk tables allocated into the fixmap, we need to acquire the physical address of a page, rather than the virtual address in the linear map. This patch adds new p??_page_paddr and p??_offset_phys functions to acquire the physical address of a next-level table, and changes p??_offset* into macros which simply convert this to a linear map VA. This renders p??_page_vaddr unused, and hence they are removed. At the pgd level, a new pgd_offset_raw function is added to find the relevant PGD entry given the base of a PGD and a virtual address. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit dca56dca7124709f3dfca81afe61b4d98eb9cacf) Signed-off-by: Jeff Vander Stoep Change-Id: Ie43d00014322e29aec70617db3af242ed8545738 --- arch/arm64/include/asm/pgtable.h | 39 +++++++++++++++++++------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 1d55520c3561..837fc7b8a9a4 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -430,15 +430,16 @@ static inline void pmd_clear(pmd_t *pmdp) set_pmd(pmdp, __pmd(0)); } -static inline pte_t *pmd_page_vaddr(pmd_t pmd) +static inline phys_addr_t pmd_page_paddr(pmd_t pmd) { - return __va(pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK); + return pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK; } /* Find an entry in the third-level page table. 
*/ #define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset_kernel(dir,addr) (pmd_page_vaddr(*(dir)) + pte_index(addr)) +#define pte_offset_phys(dir,addr) (pmd_page_paddr(*(dir)) + pte_index(addr) * sizeof(pte_t)) +#define pte_offset_kernel(dir,addr) ((pte_t *)__va(pte_offset_phys((dir), (addr)))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) @@ -473,21 +474,23 @@ static inline void pud_clear(pud_t *pudp) set_pud(pudp, __pud(0)); } -static inline pmd_t *pud_page_vaddr(pud_t pud) +static inline phys_addr_t pud_page_paddr(pud_t pud) { - return __va(pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK); + return pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK; } /* Find an entry in the second-level page table. */ #define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) -static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr) -{ - return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(addr); -} +#define pmd_offset_phys(dir, addr) (pud_page_paddr(*(dir)) + pmd_index(addr) * sizeof(pmd_t)) +#define pmd_offset(dir, addr) ((pmd_t *)__va(pmd_offset_phys((dir), (addr)))) #define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK)) +#else + +#define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) + #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -509,21 +512,23 @@ static inline void pgd_clear(pgd_t *pgdp) set_pgd(pgdp, __pgd(0)); } -static inline pud_t *pgd_page_vaddr(pgd_t pgd) +static inline phys_addr_t pgd_page_paddr(pgd_t pgd) { - return __va(pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK); + return pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK; } /* Find an entry in the frst-level page table. */ #define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) -static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr) -{ - return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(addr); -} +#define pud_offset_phys(dir, addr) (pgd_page_paddr(*(dir)) + pud_index(addr) * sizeof(pud_t)) +#define pud_offset(dir, addr) ((pud_t *)__va(pud_offset_phys((dir), (addr)))) #define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK)) +#else + +#define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) @@ -531,7 +536,9 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr) /* to find an entry in a page-table-directory */ #define pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) -#define pgd_offset(mm, addr) ((mm)->pgd+pgd_index(addr)) +#define pgd_offset_raw(pgd, addr) ((pgd) + pgd_index(addr)) + +#define pgd_offset(mm, addr) (pgd_offset_raw((mm)->pgd, (addr))) /* to find an entry in a kernel page-table-directory */ #define pgd_offset_k(addr) pgd_offset(&init_mm, addr) From 1845f62c4eb95f0dd288ac54bba784e0acb53171 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:05 +0000 Subject: [PATCH 0379/3220] UPSTREAM: arm64: mm: avoid redundant __pa(__va(x)) When we "upgrade" to a section mapping, we free any table we made redundant by giving it back to memblock. To get the PA, we acquire the physical address and convert this to a VA, then subsequently convert this back to a PA. This works currently, but will not work if the tables are not accessed via linear map VAs (e.g. if we use fixmap slots). This patch uses {pmd,pud}_page_paddr to acquire the PA.
This avoids the __pa(__va()) round trip, saving some work and avoiding reliance on the linear mapping. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 316b39db06718d59d82736df9fc65cf05b467cc7) Signed-off-by: Jeff Vander Stoep Change-Id: I26ed77790b5b00a15ea09a5a23a176de5b73a002 --- arch/arm64/mm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index c0a4160331f8..fa2471ac191a 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -171,7 +171,7 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, if (!pmd_none(old_pmd)) { flush_tlb_all(); if (pmd_table(old_pmd)) { - phys_addr_t table = __pa(pte_offset_map(&old_pmd, 0)); + phys_addr_t table = pmd_page_paddr(old_pmd); if (!WARN_ON_ONCE(slab_is_available())) memblock_free(table, PAGE_SIZE); } @@ -232,7 +232,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, if (!pud_none(old_pud)) { flush_tlb_all(); if (pud_table(old_pud)) { - phys_addr_t table = __pa(pmd_offset(&old_pud, 0)); + phys_addr_t table = pud_page_paddr(old_pud); if (!WARN_ON_ONCE(slab_is_available())) memblock_free(table, PAGE_SIZE); } From 04390d5e9950ee522e1f6f829615a14c2368a242 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:06 +0000 Subject: [PATCH 0380/3220] UPSTREAM: arm64: mm: add __{pud,pgd}_populate We currently have __pmd_populate for creating a pmd table entry given the physical address of a pte, but don't have equivalents for the pud or pgd levels of table. To enable us to manipulate tables which are mapped outside of the linear mapping (where we have a PA, but not a linear map VA), it is useful to have these functions. This patch adds __{pud,pgd}_populate. As these should not be called when the kernel uses folded {pmd,pud}s, in these cases they expand to BUILD_BUG(). So long as the appropriate checks are made on the {pud,pgd} entry prior to attempting population, these should be optimized out at compile time. 
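For illustration, a condensed sketch of the intended calling convention (not part of the diff below; pmd_phys here stands for a physical table address obtained from any allocator, and pud/mm/pmd are assumed from the caller's context):

	/* Populate a pud entry from a bare physical address: no __va() needed. */
	if (pud_none(*pud))
		__pud_populate(pud, pmd_phys, PMD_TYPE_TABLE);

	/* The old interface becomes a trivial wrapper around the new one: */
	pud_populate(mm, pud, pmd);	/* == __pud_populate(pud, __pa(pmd), PMD_TYPE_TABLE) */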
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 1e531cce68c92b46c7d29f36a72f9a3e5886678f) Signed-off-by: Jeff Vander Stoep Change-Id: I4bb3b67d0e8b63dfbd1292a93bdde42ead23a5c2 --- arch/arm64/include/asm/pgalloc.h | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index c15053902942..ff98585d085a 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -42,11 +42,20 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) free_page((unsigned long)pmd); } -static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot) { - set_pud(pud, __pud(__pa(pmd) | PMD_TYPE_TABLE)); + set_pud(pud, __pud(pmd | prot)); } +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + __pud_populate(pud, __pa(pmd), PMD_TYPE_TABLE); +} +#else +static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot) +{ + BUILD_BUG(); +} #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -62,11 +71,20 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) free_page((unsigned long)pud); } -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot) { - set_pgd(pgd, __pgd(__pa(pud) | PUD_TYPE_TABLE)); + set_pgd(pgdp, __pgd(pud | prot)); } +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + __pgd_populate(pgd, __pa(pud), PUD_TYPE_TABLE); +} +#else +static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot) +{ + BUILD_BUG(); +} #endif /* CONFIG_PGTABLE_LEVELS > 3 */ extern pgd_t *pgd_alloc(struct mm_struct *mm); From d548784c60aa9b5be69218bcbbdaf5f7a767c659 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:07 +0000 Subject: [PATCH 0381/3220] UPSTREAM: arm64: mm: add functions to walk tables in fixmap As a preparatory step to allow us to allocate early page tables from unmapped memory using memblock_alloc, add new p??_{set,clear}_fixmap* functions which can be used to walk page tables outside of the linear mapping by using fixmap slots. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 961faac114819a01e627fe9c9c82b830bb3849d4) Signed-off-by: Jeff Vander Stoep Change-Id: I58a0f54d2c637ef0fda8e0b9187ade969aecd347 --- arch/arm64/include/asm/fixmap.h | 10 ++++++++++ arch/arm64/include/asm/pgtable.h | 26 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 309704544d22..1a617d46fce9 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -62,6 +62,16 @@ enum fixed_addresses { FIX_BTMAP_END = __end_of_permanent_fixed_addresses, FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, + + /* + * Used for kernel page table creation, so unmapped memory may be used + * for tables. 
+ */ + FIX_PTE, + FIX_PMD, + FIX_PUD, + FIX_PGD, + __end_of_fixed_addresses }; diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 837fc7b8a9a4..6a58dd48663f 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -57,6 +57,7 @@ #ifndef __ASSEMBLY__ +#include #include extern void __pte_error(const char *file, int line, unsigned long val); @@ -446,6 +447,10 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) +#define pte_set_fixmap(addr) ((pte_t *)set_fixmap_offset(FIX_PTE, addr)) +#define pte_set_fixmap_offset(pmd, addr) pte_set_fixmap(pte_offset_phys(pmd, addr)) +#define pte_clear_fixmap() clear_fixmap(FIX_PTE) + #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) /* @@ -485,12 +490,21 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pmd_offset_phys(dir, addr) (pud_page_paddr(*(dir)) + pmd_index(addr) * sizeof(pmd_t)) #define pmd_offset(dir, addr) ((pmd_t *)__va(pmd_offset_phys((dir), (addr)))) +#define pmd_set_fixmap(addr) ((pmd_t *)set_fixmap_offset(FIX_PMD, addr)) +#define pmd_set_fixmap_offset(pud, addr) pmd_set_fixmap(pmd_offset_phys(pud, addr)) +#define pmd_clear_fixmap() clear_fixmap(FIX_PMD) + #define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK)) #else #define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) +/* Match pmd_offset folding in */ +#define pmd_set_fixmap(addr) NULL +#define pmd_set_fixmap_offset(pudp, addr) ((pmd_t *)pudp) +#define pmd_clear_fixmap() + #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -523,12 +537,21 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) #define pud_offset_phys(dir, addr) (pgd_page_paddr(*(dir)) + pud_index(addr) * sizeof(pud_t)) #define pud_offset(dir, addr) ((pud_t *)__va(pud_offset_phys((dir), (addr)))) +#define pud_set_fixmap(addr) ((pud_t *)set_fixmap_offset(FIX_PUD, addr)) +#define pud_set_fixmap_offset(pgd, addr) pud_set_fixmap(pud_offset_phys(pgd, addr)) +#define pud_clear_fixmap() clear_fixmap(FIX_PUD) + #define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK)) #else #define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) +/* Match pud_offset folding in */ +#define pud_set_fixmap(addr) NULL +#define pud_set_fixmap_offset(pgdp, addr) ((pud_t *)pgdp) +#define pud_clear_fixmap() + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) @@ -543,6 +566,9 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) /* to find an entry in a kernel page-table-directory */ #define pgd_offset_k(addr) pgd_offset(&init_mm, addr) +#define pgd_set_fixmap(addr) ((pgd_t *)set_fixmap_offset(FIX_PGD, addr)) +#define pgd_clear_fixmap() clear_fixmap(FIX_PGD) + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY | From 7739d1799b628e8a52402f25034c960ee2e27a9a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:08 +0000 Subject: [PATCH 0382/3220] UPSTREAM: arm64: mm: use fixmap when creating page tables As a preparatory step to allow us to allocate early page tables from unmapped memory using memblock_alloc, modify the __create_mapping callees to map and unmap the tables they modify using fixmap entries. All but the top-level pgd initialisation is performed via the fixmap. Subsequent patches will inject the pgd physical address, and migrate to using the FIX_PGD slot. 
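Condensed, the map/modify/unmap pattern applied at each table level in the diff below looks like this (variable names as in alloc_init_pte()):

	pte = pte_set_fixmap_offset(pmd, addr);		/* map the table via FIX_PTE */
	do {
		set_pte(pte, pfn_pte(pfn, prot));	/* modify entries */
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);

	pte_clear_fixmap();				/* unmap the slot again */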
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit f4710445458c0a1bd1c3c014ada2e7d7dc7b882f) Signed-off-by: Jeff Vander Stoep Change-Id: I71d2fb02d009915b5aab7f3467c4a3c658e898de --- arch/arm64/mm/mmu.c | 61 ++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index fa2471ac191a..d7fa7f52f446 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -63,19 +63,30 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, } EXPORT_SYMBOL(phys_mem_access_prot); -static void __init *early_pgtable_alloc(void) +static phys_addr_t __init early_pgtable_alloc(void) { phys_addr_t phys; void *ptr; phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE); BUG_ON(!phys); - ptr = __va(phys); + + /* + * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE + * slot will be free, so we can (ab)use the FIX_PTE slot to initialise + * any level of table. + */ + ptr = pte_set_fixmap(phys); + memset(ptr, 0, PAGE_SIZE); - /* Ensure the zeroed page is visible to the page table walker */ - dsb(ishst); - return ptr; + /* + * Implicit barriers also ensure the zeroed page is visible to the page + * table walker + */ + pte_clear_fixmap(); + + return phys; } /* @@ -99,24 +110,28 @@ static void split_pmd(pmd_t *pmd, pte_t *pte) static void alloc_init_pte(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot, - void *(*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void)) { pte_t *pte; if (pmd_none(*pmd) || pmd_sect(*pmd)) { - pte = pgtable_alloc(); + phys_addr_t pte_phys = pgtable_alloc(); + pte = pte_set_fixmap(pte_phys); if (pmd_sect(*pmd)) split_pmd(pmd, pte); - __pmd_populate(pmd, __pa(pte), PMD_TYPE_TABLE); + __pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE); flush_tlb_all(); + pte_clear_fixmap(); } BUG_ON(pmd_bad(*pmd)); - pte = pte_offset_kernel(pmd, addr); + pte = pte_set_fixmap_offset(pmd, addr); do { set_pte(pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); + + pte_clear_fixmap(); } static void split_pud(pud_t *old_pud, pmd_t *pmd) @@ -134,7 +149,7 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd) static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - void *(*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void)) { pmd_t *pmd; unsigned long next; @@ -143,7 +158,8 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, * Check for initial section mappings in the pgd/pud and remove them. 
*/ if (pud_none(*pud) || pud_sect(*pud)) { - pmd = pgtable_alloc(); + phys_addr_t pmd_phys = pgtable_alloc(); + pmd = pmd_set_fixmap(pmd_phys); if (pud_sect(*pud)) { /* * need to have the 1G of mappings continue to be @@ -151,12 +167,13 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, */ split_pud(pud, pmd); } - pud_populate(mm, pud, pmd); + __pud_populate(pud, pmd_phys, PUD_TYPE_TABLE); flush_tlb_all(); + pmd_clear_fixmap(); } BUG_ON(pud_bad(*pud)); - pmd = pmd_offset(pud, addr); + pmd = pmd_set_fixmap_offset(pud, addr); do { next = pmd_addr_end(addr, end); /* try section mapping first */ @@ -182,6 +199,8 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, } phys += next - addr; } while (pmd++, addr = next, addr != end); + + pmd_clear_fixmap(); } static inline bool use_1G_block(unsigned long addr, unsigned long next, @@ -199,18 +218,18 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - void *(*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void)) { pud_t *pud; unsigned long next; if (pgd_none(*pgd)) { - pud = pgtable_alloc(); - pgd_populate(mm, pgd, pud); + phys_addr_t pud_phys = pgtable_alloc(); + __pgd_populate(pgd, pud_phys, PUD_TYPE_TABLE); } BUG_ON(pgd_bad(*pgd)); - pud = pud_offset(pgd, addr); + pud = pud_set_fixmap_offset(pgd, addr); do { next = pud_addr_end(addr, end); @@ -243,6 +262,8 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, } phys += next - addr; } while (pud++, addr = next, addr != end); + + pud_clear_fixmap(); } /* @@ -252,7 +273,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - void *(*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void)) { unsigned long addr, length, end, next; @@ -275,14 +296,14 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, } while (pgd++, addr = next, addr != end); } -static void *late_pgtable_alloc(void) +static phys_addr_t late_pgtable_alloc(void) { void *ptr = (void *)__get_free_page(PGALLOC_GFP); BUG_ON(!ptr); /* Ensure the zeroed page is visible to the page table walker */ dsb(ishst); - return ptr; + return __pa(ptr); } static void __init create_mapping(phys_addr_t phys, unsigned long virt, From caf0cdb7ae7d5b5a4da584ff37228c3f26a55aa0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:09 +0000 Subject: [PATCH 0383/3220] UPSTREAM: arm64: mm: allocate pagetables anywhere Now that create_mapping uses fixmap slots to modify pte, pmd, and pud entries, we can access page tables anywhere in physical memory, regardless of the extent of the linear mapping. Given that, we no longer need to limit memblock allocations during page table creation, and can leave the limit as its default MEMBLOCK_ALLOC_ANYWHERE. We never add memory which will fall outside of the linear map range given phys_offset and MAX_MEMBLOCK_ADDR are configured appropriately, so any tables we create will fall in the linear map of the final tables. 
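For reference, the constraint being dropped amounted to bracketing table creation as follows (a simplification of the hunks removed below):

	/* Old scheme: fence allocations into the initially-mapped region... */
	memblock_set_current_limit(PHYS_OFFSET + SWAPPER_INIT_MAP_SIZE);
	/* ... map all memblock regions, allocating tables as needed ... */
	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);	/* then relax */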
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit cdef5f6e9e0e5ee397759b664a9f875ff59ccf01) Signed-off-by: Jeff Vander Stoep Change-Id: I9b6487491f47351303a147147c3d274da20def59 --- arch/arm64/mm/mmu.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index d7fa7f52f446..e41f30b35928 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -383,20 +383,6 @@ static void __init __map_memblock(phys_addr_t start, phys_addr_t end) static void __init map_mem(void) { struct memblock_region *reg; - phys_addr_t limit; - - /* - * Temporarily limit the memblock range. We need to do this as - * create_mapping requires puds, pmds and ptes to be allocated from - * memory addressable from the initial direct kernel mapping. - * - * The initial direct kernel mapping, located at swapper_pg_dir, gives - * us PUD_SIZE (with SECTION maps) or PMD_SIZE (without SECTION maps, - * memory starting from PHYS_OFFSET (which must be aligned to 2MB as - * per Documentation/arm64/booting.txt). - */ - limit = PHYS_OFFSET + SWAPPER_INIT_MAP_SIZE; - memblock_set_current_limit(limit); /* map all the memory banks */ for_each_memblock(memory, reg) { @@ -406,29 +392,8 @@ static void __init map_mem(void) if (start >= end) break; - if (ARM64_SWAPPER_USES_SECTION_MAPS) { - /* - * For the first memory bank align the start address and - * current memblock limit to prevent create_mapping() from - * allocating pte page tables from unmapped memory. With - * the section maps, if the first block doesn't end on section - * size boundary, create_mapping() will try to allocate a pte - * page, which may be returned from an unmapped area. - * When section maps are not used, the pte page table for the - * current limit is already present in swapper_pg_dir. - */ - if (start < limit) - start = ALIGN(start, SECTION_SIZE); - if (end < limit) { - limit = end & SECTION_MASK; - memblock_set_current_limit(limit); - } - } __map_memblock(start, end); } - - /* Limit no longer required. */ - memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); } static void __init fixup_executable(void) From 880bee2ed6ea6051d1c387a8eb2cd81140b2ec64 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:10 +0000 Subject: [PATCH 0384/3220] UPSTREAM: arm64: mm: allow passing a pgdir to alloc_init_* To allow us to initialise pgdirs which are fixmapped, allow explicitly passing a pgdir rather than an mm. A new __create_pgd_mapping function is added for this, with existing __create_mapping callers migrated to this. The mm argument was previously only used at the top level. Now that it is redundant at all levels, it is removed. To indicate its new found similarity to alloc_init_{pud,pmd,pte}, __create_mapping is renamed to init_pgd. 
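The resulting call shape, as used by create_mapping() in the diff below, is:

	/* Callers pass a pgdir directly, so a fixmapped pgd works too: */
	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot,
			     early_pgtable_alloc);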
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 11509a306bb6ea595878b2d246d2d56b1783e040) Signed-off-by: Jeff Vander Stoep Change-Id: I554f90b986a0fce6c96c0a0a9da1d6e61602d0a9 --- arch/arm64/mm/mmu.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index e41f30b35928..f949fe238643 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -146,8 +146,7 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd) } while (pmd++, i++, i < PTRS_PER_PMD); } -static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, - unsigned long addr, unsigned long end, +static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void)) { @@ -215,8 +214,7 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, return true; } -static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, - unsigned long addr, unsigned long end, +static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void)) { @@ -257,7 +255,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, } } } else { - alloc_init_pmd(mm, pud, addr, next, phys, prot, + alloc_init_pmd(pud, addr, next, phys, prot, pgtable_alloc); } phys += next - addr; @@ -270,8 +268,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, * Create the page directory entries and any necessary page tables for the * mapping specified by 'md'. 
*/ -static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, - phys_addr_t phys, unsigned long virt, +static void init_pgd(pgd_t *pgd, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void)) { @@ -291,7 +288,7 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, end = addr + length; do { next = pgd_addr_end(addr, end); - alloc_init_pud(mm, pgd, addr, next, phys, prot, pgtable_alloc); + alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc); phys += next - addr; } while (pgd++, addr = next, addr != end); } @@ -306,6 +303,14 @@ static phys_addr_t late_pgtable_alloc(void) return __pa(ptr); } +static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + pgprot_t prot, + phys_addr_t (*alloc)(void)) +{ + init_pgd(pgd_offset_raw(pgdir, virt), phys, virt, size, prot, alloc); +} + static void __init create_mapping(phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot) { @@ -314,16 +319,16 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - __create_mapping(&init_mm, pgd_offset_k(virt), phys, virt, - size, prot, early_pgtable_alloc); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, + early_pgtable_alloc); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot) { - __create_mapping(mm, pgd_offset(mm, virt), phys, virt, size, prot, - late_pgtable_alloc); + __create_pgd_mapping(mm->pgd, phys, virt, size, prot, + late_pgtable_alloc); } static void create_mapping_late(phys_addr_t phys, unsigned long virt, @@ -335,8 +340,8 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, return; } - return __create_mapping(&init_mm, pgd_offset_k(virt), - phys, virt, size, prot, late_pgtable_alloc); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, + late_pgtable_alloc); } #ifdef CONFIG_DEBUG_RODATA From 9f31360553319ee4e8fc3a1c97ef4788c024d6de Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:11 +0000 Subject: [PATCH 0385/3220] BACKPORT: arm64: ensure _stext and _etext are page-aligned Currently we have separate ALIGN_DEBUG_RO{,_MIN} directives to align _etext and __init_begin. While we ensure that __init_begin is page-aligned, we do not provide the same guarantee for _etext. This is not problematic currently as the alignment of __init_begin is sufficient to prevent issues when we modify permissions. Subsequent patches will assume page alignment of segments of the kernel we wish to map with different permissions. To ensure this, move _etext after the ALIGN_DEBUG_RO_MIN for the init section. This renders the prior ALIGN_DEBUG_RO irrelevant, and hence it is removed. Likewise, upgrade to ALIGN_DEBUG_RO_MIN(PAGE_SIZE) for _stext. 
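The next patch in the series relies on this guarantee when mapping kernel segments, asserting along the lines of:

	BUG_ON(!PAGE_ALIGNED(pa_start));	/* e.g. __pa(_stext) */
	BUG_ON(!PAGE_ALIGNED(size));		/* e.g. _etext - _stext */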
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit fca082bfb543ccaaff864fc0892379ccaa1711cd) Signed-off-by: Jeff Vander Stoep Change-Id: If1a829aa5c363f2c76eb1a18209c3c00ee3ffd0a --- arch/arm64/kernel/vmlinux.lds.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 1a056bb26633..763e5aab5752 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -95,7 +95,7 @@ SECTIONS _text = .; HEAD_TEXT } - ALIGN_DEBUG_RO + ALIGN_DEBUG_RO_MIN(PAGE_SIZE) .text : { /* Real text segment */ _stext = .; /* Text and read-only data */ __exception_text_start = .; @@ -118,9 +118,9 @@ SECTIONS RO_DATA(PAGE_SIZE) EXCEPTION_TABLE(8) NOTES - ALIGN_DEBUG_RO ALIGN_DEBUG_RO_MIN(PAGE_SIZE) + _etext = .; /* End of text and rodata section */ __init_begin = .; INIT_TEXT_SECTION(8) From 63048c3fad6dd5219008c7d842cf1a5e4bac5508 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:12 +0000 Subject: [PATCH 0386/3220] BACKPORT: arm64: mm: create new fine-grained mappings at boot At boot we may change the granularity of the tables mapping the kernel (by splitting or making sections). This may happen when we create the linear mapping (in __map_memblock), or at any point we try to apply fine-grained permissions to the kernel (e.g. fixup_executable, mark_rodata_ro, fixup_init). Changing the active page tables in this manner may result in multiple entries for the same address being allocated into TLBs, risking problems such as TLB conflict aborts or issues derived from the amalgamation of TLB entries. Generally, a break-before-make (BBM) approach is necessary to avoid conflicts, but we cannot do this for the kernel tables as it risks unmapping text or data being used to do so. Instead, we can create a new set of tables from scratch in the safety of the existing mappings, and subsequently migrate over to these using the new cpu_replace_ttbr1 helper, which avoids the two sets of tables being active simultaneously. To avoid issues when we later modify permissions of the page tables (e.g. in fixup_init), we must create the page tables at a granularity such that later modification does not result in splitting of tables. This patch applies this strategy, creating a new set of fine-grained page tables from scratch, and safely migrating to them. The existing fixmap and kasan shadow page tables are reused in the new fine-grained tables. 
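Condensed, the migration sequence implemented in paging_init() below is:

	phys_addr_t pgd_phys = early_pgtable_alloc();
	pgd_t *pgd = pgd_set_fixmap(pgd_phys);

	map_kernel(pgd);			/* fine-grained kernel mappings */
	map_mem(pgd);				/* linear mapping */

	cpu_replace_ttbr1(__va(pgd_phys));	/* switch to the new tables */
	memcpy(swapper_pg_dir, pgd, PAGE_SIZE);	/* copy pgd back into swapper */
	cpu_replace_ttbr1(swapper_pg_dir);	/* switch again, reusing swapper */

	pgd_clear_fixmap();
	memblock_free(pgd_phys, PAGE_SIZE);	/* temporary pgd no longer needed */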
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Cc: Andrey Ryabinin Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 068a17a5805dfbca4bbf03e664ca6b19709cc7a8) Signed-off-by: Jeff Vander Stoep Change-Id: I7ddaa296b94106846ebf8392d91186df8673df64 --- arch/arm64/include/asm/kasan.h | 3 + arch/arm64/mm/kasan_init.c | 15 ++++ arch/arm64/mm/mmu.c | 154 ++++++++++++++++++++------------- 3 files changed, 110 insertions(+), 62 deletions(-) diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h index 2774fa384c47..de0d21211c34 100644 --- a/arch/arm64/include/asm/kasan.h +++ b/arch/arm64/include/asm/kasan.h @@ -7,6 +7,7 @@ #include #include +#include /* * KASAN_SHADOW_START: beginning of the kernel virtual addresses. @@ -28,10 +29,12 @@ #define KASAN_SHADOW_OFFSET (KASAN_SHADOW_END - (1ULL << (64 - 3))) void kasan_init(void); +void kasan_copy_shadow(pgd_t *pgdir); asmlinkage void kasan_early_init(void); #else static inline void kasan_init(void) { } +static inline void kasan_copy_shadow(pgd_t *pgdir) { } #endif #endif diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 263b59020500..cc569a38bc76 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -97,6 +97,21 @@ asmlinkage void __init kasan_early_init(void) kasan_map_early_shadow(); } +/* + * Copy the current shadow region into a new pgdir. + */ +void __init kasan_copy_shadow(pgd_t *pgdir) +{ + pgd_t *pgd, *pgd_new, *pgd_end; + + pgd = pgd_offset_k(KASAN_SHADOW_START); + pgd_end = pgd_offset_k(KASAN_SHADOW_END); + pgd_new = pgd_offset_raw(pgdir, KASAN_SHADOW_START); + do { + set_pgd(pgd_new, *pgd); + } while (pgd++, pgd_new++, pgd != pgd_end); +} + static void __init clear_pgds(unsigned long start, unsigned long end) { diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index f949fe238643..6f5770a1b140 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -344,48 +345,42 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, late_pgtable_alloc); } -#ifdef CONFIG_DEBUG_RODATA -static void __init __map_memblock(phys_addr_t start, phys_addr_t end) +static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) { + + unsigned long kernel_start = __pa(_stext); + unsigned long kernel_end = __pa(_end); + /* - * Set up the executable regions using the existing section mappings - * for now. This will get more fine grained later once all memory - * is mapped + * The kernel itself is mapped at page granularity. Map all other + * memory, making sure we don't overwrite the existing kernel mappings. 
*/ - unsigned long kernel_x_start = round_down(__pa(_stext), SWAPPER_BLOCK_SIZE); - unsigned long kernel_x_end = round_up(__pa(__init_end), SWAPPER_BLOCK_SIZE); - if (end < kernel_x_start) { - create_mapping(start, __phys_to_virt(start), - end - start, PAGE_KERNEL); - } else if (start >= kernel_x_end) { - create_mapping(start, __phys_to_virt(start), - end - start, PAGE_KERNEL); - } else { - if (start < kernel_x_start) - create_mapping(start, __phys_to_virt(start), - kernel_x_start - start, - PAGE_KERNEL); - create_mapping(kernel_x_start, - __phys_to_virt(kernel_x_start), - kernel_x_end - kernel_x_start, - PAGE_KERNEL_EXEC); - if (kernel_x_end < end) - create_mapping(kernel_x_end, - __phys_to_virt(kernel_x_end), - end - kernel_x_end, - PAGE_KERNEL); + /* No overlap with the kernel. */ + if (end < kernel_start || start >= kernel_end) { + __create_pgd_mapping(pgd, start, __phys_to_virt(start), + end - start, PAGE_KERNEL, + early_pgtable_alloc); + return; } -} -#else -static void __init __map_memblock(phys_addr_t start, phys_addr_t end) -{ - create_mapping(start, __phys_to_virt(start), end - start, - PAGE_KERNEL_EXEC); -} -#endif -static void __init map_mem(void) + /* + * This block overlaps the kernel mapping. Map the portion(s) which + * don't overlap. + */ + if (start < kernel_start) + __create_pgd_mapping(pgd, start, + __phys_to_virt(start), + kernel_start - start, PAGE_KERNEL, + early_pgtable_alloc); + if (kernel_end < end) + __create_pgd_mapping(pgd, kernel_end, + __phys_to_virt(kernel_end), + end - kernel_end, PAGE_KERNEL, + early_pgtable_alloc); +} + +static void __init map_mem(pgd_t *pgd) { struct memblock_region *reg; @@ -397,33 +392,10 @@ static void __init map_mem(void) if (start >= end) break; - __map_memblock(start, end); + __map_memblock(pgd, start, end); } } -static void __init fixup_executable(void) -{ -#ifdef CONFIG_DEBUG_RODATA - /* now that we are actually fully mapped, make the start/end more fine grained */ - if (!IS_ALIGNED((unsigned long)_stext, SWAPPER_BLOCK_SIZE)) { - unsigned long aligned_start = round_down(__pa(_stext), - SWAPPER_BLOCK_SIZE); - - create_mapping(aligned_start, __phys_to_virt(aligned_start), - __pa(_stext) - aligned_start, - PAGE_KERNEL); - } - - if (!IS_ALIGNED((unsigned long)__init_end, SWAPPER_BLOCK_SIZE)) { - unsigned long aligned_end = round_up(__pa(__init_end), - SWAPPER_BLOCK_SIZE); - create_mapping(__pa(__init_end), (unsigned long)__init_end, - aligned_end - __pa(__init_end), - PAGE_KERNEL); - } -#endif -} - #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void) { @@ -441,14 +413,72 @@ void fixup_init(void) PAGE_KERNEL); } +static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, + pgprot_t prot) +{ + phys_addr_t pa_start = __pa(va_start); + unsigned long size = va_end - va_start; + + BUG_ON(!PAGE_ALIGNED(pa_start)); + BUG_ON(!PAGE_ALIGNED(size)); + + __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, + early_pgtable_alloc); +} + +/* + * Create fine-grained mappings for the kernel. + */ +static void __init map_kernel(pgd_t *pgd) +{ + + map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC); + map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC); + map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL); + + /* + * The fixmap falls in a separate pgd to the kernel, and doesn't live + * in the carveout for the swapper_pg_dir. We can simply re-use the + * existing dir for the fixmap. 
+ */ + set_pgd(pgd_offset_raw(pgd, FIXADDR_START), *pgd_offset_k(FIXADDR_START)); + + kasan_copy_shadow(pgd); +} + /* * paging_init() sets up the page tables, initialises the zone memory * maps and sets up the zero page. */ void __init paging_init(void) { - map_mem(); - fixup_executable(); + phys_addr_t pgd_phys = early_pgtable_alloc(); + pgd_t *pgd = pgd_set_fixmap(pgd_phys); + + map_kernel(pgd); + map_mem(pgd); + + /* + * We want to reuse the original swapper_pg_dir so we don't have to + * communicate the new address to non-coherent secondaries in + * secondary_entry, and so cpu_switch_mm can generate the address with + * adrp+add rather than a load from some global variable. + * + * To do this we need to go via a temporary pgd. + */ + cpu_replace_ttbr1(__va(pgd_phys)); + memcpy(swapper_pg_dir, pgd, PAGE_SIZE); + cpu_replace_ttbr1(swapper_pg_dir); + + pgd_clear_fixmap(); + memblock_free(pgd_phys, PAGE_SIZE); + + /* + * We only reuse the PGD from the swapper_pg_dir, not the pud + pmd + * allocated with it. + */ + memblock_free(__pa(swapper_pg_dir) + PAGE_SIZE, + SWAPPER_DIR_SIZE - PAGE_SIZE); bootmem_init(); } From 17e35cbf1863e5cf74425594056ef28b18f762bf Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Tue, 23 Aug 2016 18:22:33 -0400 Subject: [PATCH 0387/3220] UPSTREAM: tun: fix transmit timestamp support Instead of using sock_tx_timestamp, use skb_tx_timestamp to record software transmit timestamp of a packet. sock_tx_timestamp resets and overrides the tx_flags of the skb. The function is intended to be called from within the protocol layer when creating the skb, not from a device driver. This is inconsistent with other drivers and will cause issues for TCP. In TCP, we intend to sample the timestamps for the last byte for each sendmsg/sendpage. For that reason, tcp_sendmsg calls tcp_tx_timestamp only with the last skb that it generates. For example, if a 128KB message is split into two 64KB packets we want to sample the SND timestamp of the last packet. The current code in the tun driver, however, will result in sampling the SND timestamp for both packets. Also, when the last packet is split into smaller packets for retranmission (see tcp_fragment), the tun driver will record timestamps for all of the retransmitted packets and not only the last packet. Change-Id: If7458ab31de52aa15a12364b6c1ac2a8f93f17a7 Fixes: eda297729171 (tun: Support software transmit time stamping.) Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Francis Yan Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/tun.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2228a539184a..7b4c65effe5e 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -858,10 +858,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) goto drop; - if (skb->sk && sk_fullsock(skb->sk)) { - sock_tx_timestamp(skb->sk, &skb_shinfo(skb)->tx_flags); - sw_tx_timestamp(skb); - } + skb_tx_timestamp(skb); /* Orphan the skb - required as we might hang on to it * for indefinite time. From fa42b29f530e15078b4e3070fd62e310211ee3de Mon Sep 17 00:00:00 2001 From: Eric Ernst Date: Fri, 2 Sep 2016 16:12:06 -0700 Subject: [PATCH 0388/3220] input: keyreset: switch to orderly_reboot Prior restart function would make a call to sys_sync and then execute a kernel reset. 
Rather than call the sync directly, thus necessitating this driver to be builtin, call orderly_reboot, which will take care of the file system sync. Note: since CONFIG_INPUT Kconfig is tristate, this driver can be built as module, despite being marked bool. Signed-off-by: Eric Ernst --- drivers/input/keyreset.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/input/keyreset.c b/drivers/input/keyreset.c index 7fbf7247e65f..7e5222aec7c1 100644 --- a/drivers/input/keyreset.c +++ b/drivers/input/keyreset.c @@ -32,8 +32,7 @@ struct keyreset_state { static void do_restart(struct work_struct *unused) { - sys_sync(); - kernel_restart(NULL); + orderly_reboot(); } static void do_reset_fn(void *priv) From 5449876ad748ec9eae64267f3d7aab93c82a464d Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Mon, 19 Sep 2016 14:37:34 -0700 Subject: [PATCH 0389/3220] eas/sched/fair: Fixing comments in find_best_target. Change-Id: I83f5b9887e98f9fdb81318cde45408e7ebfc4b13 Signed-off-by: Srinath Sridharan --- kernel/sched/fair.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2cd3e2671f16..7e42cd11e430 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5674,17 +5674,19 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { - if(prefer_idle) { - // Find a target cpu with lowest - // utilization. + if (prefer_idle) { + /* Find a target cpu with highest + * utilization. + */ if (target_util == 0 || target_util < new_util) { target_cpu = i; target_util = new_util; } } else { - // Find a target cpu with highest - // utilization. + /* Find a target cpu with lowest + * utilization. + */ if (target_util == 0 || target_util > new_util) { target_cpu = i; From b7c491d2c40fde253ec06a63b3f93ea60c0467a9 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Sun, 18 Sep 2016 21:39:28 -0700 Subject: [PATCH 0390/3220] ANDROID: fiq_debugger: Pass task parameter to unwind_frame() Fixes: fe13f95b7200 ("arm64: pass a task parameter to unwind_frame()") Bug: 30369029 Patchset: rework-pagetable Change-Id: I9a4ab50ef61532d27282f189f063c938c196ec08 Signed-off-by: Jeff Vander Stoep --- drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c b/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c index 99c6584fcfa5..97246bcbcd62 100644 --- a/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c +++ b/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c @@ -197,6 +197,6 @@ void fiq_debugger_dump_stacktrace(struct fiq_debugger_output *output, frame.sp = regs->sp; frame.pc = regs->pc; output->printf(output, "\n"); - walk_stackframe(&frame, report_trace, &sts); + walk_stackframe(current, &frame, report_trace, &sts); } } From cf43809d7aa07868ffacd83892745c85f91c8a53 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 20 Sep 2016 17:00:47 +0100 Subject: [PATCH 0391/3220] sched/walt: Drop arch-specific timer access On at least one platform, occasionally the timer providing the wallclock was able to be reset/go backwards for at least some time after wakeup. Accept that this might happen and warn the first time, but otherwise just carry on. 
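That is, the window-start handling below becomes:

	delta = wallclock - rq->window_start;
	if (delta < 0) {
		delta = 0;
		WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
	}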
Change-Id: Id3164477ba79049561af7f0889cbeebc199ead4e Signed-off-by: Chris Redpath --- kernel/sched/walt.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 07b7f84b37e2..2ffb1680b380 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -22,7 +22,6 @@ #include #include #include -#include #include "sched.h" #include "walt.h" @@ -188,10 +187,8 @@ update_window_start(struct rq *rq, u64 wallclock) delta = wallclock - rq->window_start; /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */ if (delta < 0) { - if (arch_timer_read_counter() == 0) - delta = 0; - else - BUG_ON(1); + delta = 0; + WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n"); } if (delta < walt_ravg_window) From 32cbbe59538d2dd0b77822cc27ce32ca487c467d Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Mon, 19 Sep 2016 17:33:50 -0700 Subject: [PATCH 0392/3220] ANDROID: fs: FS tracepoints to track IO. Adds tracepoints in ext4/f2fs/mpage to track readpages/buffered write()s. This allows us to track files that are being read/written to PIDs. Change-Id: I26bd36f933108927d6903da04d8cb42fd9c3ef3d Signed-off-by: Mohan Srinivasan --- fs/ext4/inline.c | 6 ++ fs/ext4/inode.c | 27 ++++++++ fs/ext4/readpage.c | 41 +++++++++-- fs/f2fs/data.c | 21 ++++++ fs/f2fs/inline.c | 11 +++ fs/mpage.c | 30 ++++++++ include/trace/events/android_fs.h | 31 +++++++++ include/trace/events/android_fs_template.h | 79 ++++++++++++++++++++++ 8 files changed, 242 insertions(+), 4 deletions(-) create mode 100644 include/trace/events/android_fs.h create mode 100644 include/trace/events/android_fs_template.h diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d884989cc83d..af34979684a4 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -18,6 +18,7 @@ #include "ext4.h" #include "xattr.h" #include "truncate.h" +#include #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) @@ -500,6 +501,9 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) return -EAGAIN; } + trace_android_fs_dataread_start(inode, page_offset(page), PAGE_SIZE, + current->pid, current->comm); + /* * Current inline data can only exist in the 1st page, * So for all the other pages, just set them uptodate. 
@@ -511,6 +515,8 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) SetPageUptodate(page); } + trace_android_fs_dataread_end(inode, page_offset(page), PAGE_SIZE); + up_read(&EXT4_I(inode)->xattr_sem); unlock_page(page); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ea433a7f4bca..a7208a662ee6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -44,6 +44,7 @@ #include "truncate.h" #include +#include #define MPAGE_DA_EXTENT_TAIL 0x01 @@ -982,6 +983,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; + trace_android_fs_datawrite_start(inode, pos, len, + current->pid, current->comm); trace_ext4_write_begin(inode, pos, len, flags); /* * Reserve one block more for addition to orphan list in case @@ -1118,6 +1121,7 @@ static int ext4_write_end(struct file *file, int ret = 0, ret2; int i_size_changed = 0; + trace_android_fs_datawrite_end(inode, pos, len); trace_ext4_write_end(inode, pos, len, copied); if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { ret = ext4_jbd2_file_inode(handle, inode); @@ -1226,6 +1230,7 @@ static int ext4_journalled_write_end(struct file *file, unsigned from, to; int size_changed = 0; + trace_android_fs_datawrite_end(inode, pos, len); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -2670,6 +2675,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, len, flags, pagep, fsdata); } *fsdata = (void *)0; + trace_android_fs_datawrite_start(inode, pos, len, + current->pid, current->comm); trace_ext4_da_write_begin(inode, pos, len, flags); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { @@ -2788,6 +2795,7 @@ static int ext4_da_write_end(struct file *file, return ext4_write_end(file, mapping, pos, len, copied, page, fsdata); + trace_android_fs_datawrite_end(inode, pos, len); trace_ext4_da_write_end(inode, pos, len, copied); start = pos & (PAGE_CACHE_SIZE - 1); end = start + copied - 1; @@ -3276,12 +3284,31 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (ext4_has_inline_data(inode)) return 0; + if (trace_android_fs_dataread_start_enabled() && + (iov_iter_rw(iter) == READ)) + trace_android_fs_dataread_start(inode, offset, count, + current->pid, + current->comm); + if (trace_android_fs_datawrite_start_enabled() && + (iov_iter_rw(iter) == WRITE)) + trace_android_fs_datawrite_start(inode, offset, count, + current->pid, + current->comm); + trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_direct_IO(iocb, iter, offset); else ret = ext4_ind_direct_IO(iocb, iter, offset); trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); + + if (trace_android_fs_dataread_start_enabled() && + (iov_iter_rw(iter) == READ)) + trace_android_fs_dataread_end(inode, offset, count); + if (trace_android_fs_datawrite_start_enabled() && + (iov_iter_rw(iter) == WRITE)) + trace_android_fs_datawrite_end(inode, offset, count); + return ret; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 5dc5e95063de..1ce24a6759a0 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -45,6 +45,7 @@ #include #include "ext4.h" +#include /* * Call ext4_decrypt on every single page, reusing the encryption @@ -86,6 +87,17 @@ static inline bool ext4_bio_encrypted(struct bio *bio) #endif } +static void +ext4_trace_read_completion(struct bio *bio) +{ + struct page *first_page = 
bio->bi_io_vec[0].bv_page; + + if (first_page != NULL) + trace_android_fs_dataread_end(first_page->mapping->host, + page_offset(first_page), + bio->bi_iter.bi_size); +} + /* * I/O completion handler for multipage BIOs. * @@ -103,6 +115,9 @@ static void mpage_end_io(struct bio *bio) struct bio_vec *bv; int i; + if (trace_android_fs_dataread_start_enabled()) + ext4_trace_read_completion(bio); + if (ext4_bio_encrypted(bio)) { struct ext4_crypto_ctx *ctx = bio->bi_private; @@ -130,6 +145,24 @@ static void mpage_end_io(struct bio *bio) bio_put(bio); } +static void +ext4_submit_bio_read(struct bio *bio) +{ + if (trace_android_fs_dataread_start_enabled()) { + struct page *first_page = bio->bi_io_vec[0].bv_page; + + if (first_page != NULL) { + trace_android_fs_dataread_start( + first_page->mapping->host, + page_offset(first_page), + bio->bi_iter.bi_size, + current->pid, + current->comm); + } + } + submit_bio(READ, bio); +} + int ext4_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, unsigned nr_pages) @@ -271,7 +304,7 @@ int ext4_mpage_readpages(struct address_space *mapping, */ if (bio && (last_block_in_bio != blocks[0] - 1)) { submit_and_realloc: - submit_bio(READ, bio); + ext4_submit_bio_read(bio); bio = NULL; } if (bio == NULL) { @@ -303,14 +336,14 @@ int ext4_mpage_readpages(struct address_space *mapping, if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_page)) { - submit_bio(READ, bio); + ext4_submit_bio_read(bio); bio = NULL; } else last_block_in_bio = blocks[blocks_per_page - 1]; goto next_page; confused: if (bio) { - submit_bio(READ, bio); + ext4_submit_bio_read(bio); bio = NULL; } if (!PageUptodate(page)) @@ -323,6 +356,6 @@ int ext4_mpage_readpages(struct address_space *mapping, } BUG_ON(pages && !list_empty(pages)); if (bio) - submit_bio(READ, bio); + ext4_submit_bio_read(bio); return 0; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 972eab7ac071..e692958d6e78 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -26,6 +26,7 @@ #include "segment.h" #include "trace.h" #include +#include static void f2fs_read_end_io(struct bio *bio) { @@ -1401,6 +1402,8 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct dnode_of_data dn; int err = 0; + trace_android_fs_datawrite_start(inode, pos, len, + current->pid, current->comm); trace_f2fs_write_begin(inode, pos, len, flags); f2fs_balance_fs(sbi); @@ -1529,6 +1532,7 @@ static int f2fs_write_end(struct file *file, { struct inode *inode = page->mapping->host; + trace_android_fs_datawrite_end(inode, pos, len); trace_f2fs_write_end(inode, pos, len, copied); set_page_dirty(page); @@ -1582,6 +1586,16 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); + if (trace_android_fs_dataread_start_enabled() && + (iov_iter_rw(iter) == READ)) + trace_android_fs_dataread_start(inode, offset, + count, current->pid, + current->comm); + if (trace_android_fs_datawrite_start_enabled() && + (iov_iter_rw(iter) == WRITE)) + trace_android_fs_datawrite_start(inode, offset, count, + current->pid, current->comm); + if (iov_iter_rw(iter) == WRITE) { __allocate_data_blocks(inode, offset, count); if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { @@ -1595,6 +1609,13 @@ out: if (err < 0 && iov_iter_rw(iter) == WRITE) f2fs_write_failed(mapping, offset + count); + if (trace_android_fs_dataread_start_enabled() && + (iov_iter_rw(iter) == READ)) + 
trace_android_fs_dataread_end(inode, offset, count); + if (trace_android_fs_datawrite_start_enabled() && + (iov_iter_rw(iter) == WRITE)) + trace_android_fs_datawrite_end(inode, offset, count); + trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err); return err; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index bda7126466c0..d2c5d69ba0b1 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -13,6 +13,7 @@ #include "f2fs.h" #include "node.h" +#include bool f2fs_may_inline_data(struct inode *inode) { @@ -84,14 +85,22 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) { struct page *ipage; + trace_android_fs_dataread_start(inode, page_offset(page), + PAGE_SIZE, current->pid, + current->comm); + ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) { + trace_android_fs_dataread_end(inode, page_offset(page), + PAGE_SIZE); unlock_page(page); return PTR_ERR(ipage); } if (!f2fs_has_inline_data(inode)) { f2fs_put_page(ipage, 1); + trace_android_fs_dataread_end(inode, page_offset(page), + PAGE_SIZE); return -EAGAIN; } @@ -102,6 +111,8 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) SetPageUptodate(page); f2fs_put_page(ipage, 1); + trace_android_fs_dataread_end(inode, page_offset(page), + PAGE_SIZE); unlock_page(page); return 0; } diff --git a/fs/mpage.c b/fs/mpage.c index 1480d3a18037..5c65d8942692 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -30,6 +30,14 @@ #include #include "internal.h" +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_start); +EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_end); +EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_start); +EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_end); + /* * I/O completion handler for multipage BIOs. 
* @@ -47,6 +55,16 @@ static void mpage_end_io(struct bio *bio) struct bio_vec *bv; int i; + if (trace_android_fs_dataread_end_enabled() && + (bio_data_dir(bio) == READ)) { + struct page *first_page = bio->bi_io_vec[0].bv_page; + + if (first_page != NULL) + trace_android_fs_dataread_end(first_page->mapping->host, + page_offset(first_page), + bio->bi_iter.bi_size); + } + bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; page_endio(page, bio_data_dir(bio), bio->bi_error); @@ -57,6 +75,18 @@ static void mpage_end_io(struct bio *bio) static struct bio *mpage_bio_submit(int rw, struct bio *bio) { + if (trace_android_fs_dataread_start_enabled() && (rw == READ)) { + struct page *first_page = bio->bi_io_vec[0].bv_page; + + if (first_page != NULL) { + trace_android_fs_dataread_start( + first_page->mapping->host, + page_offset(first_page), + bio->bi_iter.bi_size, + current->pid, + current->comm); + } + } bio->bi_end_io = mpage_end_io; guard_bio_eod(rw, bio); submit_bio(rw, bio); diff --git a/include/trace/events/android_fs.h b/include/trace/events/android_fs.h new file mode 100644 index 000000000000..531da433a7bc --- /dev/null +++ b/include/trace/events/android_fs.h @@ -0,0 +1,31 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM android_fs + +#if !defined(_TRACE_ANDROID_FS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ANDROID_FS_H + +#include +#include + +DEFINE_EVENT(android_fs_data_start_template, android_fs_dataread_start, + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *command), + TP_ARGS(inode, offset, bytes, pid, command)); + +DEFINE_EVENT(android_fs_data_end_template, android_fs_dataread_end, + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + TP_ARGS(inode, offset, bytes)); + +DEFINE_EVENT(android_fs_data_start_template, android_fs_datawrite_start, + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *command), + TP_ARGS(inode, offset, bytes, pid, command)); + +DEFINE_EVENT(android_fs_data_end_template, android_fs_datawrite_end, + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + TP_ARGS(inode, offset, bytes)); + +#endif /* _TRACE_ANDROID_FS_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/android_fs_template.h b/include/trace/events/android_fs_template.h new file mode 100644 index 000000000000..618988b047c1 --- /dev/null +++ b/include/trace/events/android_fs_template.h @@ -0,0 +1,79 @@ +#if !defined(_TRACE_ANDROID_FS_TEMPLATE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ANDROID_FS_TEMPLATE_H + +#include + +DECLARE_EVENT_CLASS(android_fs_data_start_template, + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *command), + TP_ARGS(inode, offset, bytes, pid, command), + TP_STRUCT__entry( + __array(char, path, MAX_FILTER_STR_VAL); + __field(char *, pathname); + __field(loff_t, offset); + __field(int, bytes); + __field(loff_t, i_size); + __string(cmdline, command); + __field(pid_t, pid); + __field(ino_t, ino); + ), + TP_fast_assign( + { + struct dentry *d; + + /* + * Grab a reference to the inode here because + * d_obtain_alias() will either drop the inode + * reference if it locates an existing dentry + * or transfer the reference to the new dentry + * created. In our case, the file is still open, + * so the dentry is guaranteed to exist (connected), + * so d_obtain_alias() drops the reference we + * grabbed here. 
+ */ + ihold(inode); + d = d_obtain_alias(inode); + if (!IS_ERR(d)) { + __entry->pathname = dentry_path(d, + __entry->path, + MAX_FILTER_STR_VAL); + dput(d); + } else + __entry->pathname = ERR_PTR(-EINVAL); + __entry->offset = offset; + __entry->bytes = bytes; + __entry->i_size = i_size_read(inode); + __assign_str(cmdline, command); + __entry->pid = pid; + __entry->ino = inode->i_ino; + } + ), + TP_printk("entry_name %s, offset %llu, bytes %d, cmdline %s," + " pid %d, i_size %llu, ino %lu", + (IS_ERR(__entry->pathname) ? "ERROR" : __entry->pathname), + __entry->offset, __entry->bytes, __get_str(cmdline), + __entry->pid, __entry->i_size, + (unsigned long) __entry->ino) +); + +DECLARE_EVENT_CLASS(android_fs_data_end_template, + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + TP_ARGS(inode, offset, bytes), + TP_STRUCT__entry( + __field(ino_t, ino); + __field(loff_t, offset); + __field(int, bytes); + ), + TP_fast_assign( + { + __entry->ino = inode->i_ino; + __entry->offset = offset; + __entry->bytes = bytes; + } + ), + TP_printk("ino %lu, offset %llu, bytes %d", + (unsigned long) __entry->ino, + __entry->offset, __entry->bytes) +); + +#endif /* _TRACE_ANDROID_FS_TEMPLATE_H */ From df232437710122fcb4e4a0484a1eded5aec29a6a Mon Sep 17 00:00:00 2001 From: Caesar Wang Date: Tue, 23 Aug 2016 11:47:02 +0100 Subject: [PATCH 0393/3220] sched/fair: remove printk while schedule is in progress Calling printk while schedule is in progress can deadlock and spin in while(1) forever. The blocked state looks like this: cpu0 (holds the console sem): printk->console_unlock->up_sem->spin_lock(&sem->lock)->wake_up_process(cpu1) ->try_to_wake_up(cpu1)->while(p->on_cpu). cpu1 (requests the console sem): console_lock->down_sem->schedule->idle_balance->update_cpu_capacity-> printk->console_trylock->spin_lock(&sem->lock). p->on_cpu stays 1 forever because the task is still running on cpu1, so cpu0 is blocked in while(p->on_cpu); but cpu1 can never take spin_lock(&sem->lock), so it is blocked too, and the task keeps running on cpu1 forever. Signed-off-by: Caesar Wang --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7e42cd11e430..6b881cf81d79 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7089,7 +7089,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) mcc->cpu = cpu; #ifdef CONFIG_SCHED_DEBUG raw_spin_unlock_irqrestore(&mcc->lock, flags); - pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); + printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n", + cpu, capacity); goto skip_unlock; #endif } From 53c63d3b4f06e31269d66173d327164f273e4233 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 20 Sep 2016 18:42:22 -0700 Subject: [PATCH 0394/3220] sched: Add Kconfig option DEFAULT_USE_ENERGY_AWARE to set ENERGY_AWARE feature flag The ENERGY_AWARE sched feature flag cannot be set unless CONFIG_SCHED_DEBUG is enabled. So this patch allows the flag to default to true at build time if the config is set. Change-Id: I8835a571fdb7a8f8ee6a54af1e11a69f3b5ce8e6 Signed-off-by: John Stultz --- init/Kconfig | 10 ++++++++++ kernel/sched/features.h | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index e71e35cf723c..acb6645ffda5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1290,6 +1290,16 @@ config SCHED_TUNE If unsure, say N.
+config DEFAULT_USE_ENERGY_AWARE + bool "Default to enabling the Energy Aware Scheduler feature" + default n + help + This option defaults the ENERGY_AWARE scheduling feature to true, + as without SCHED_DEBUG set this feature can't be enabled or disabled + via sysctl. + + Say N if unsure. + config SYSFS_DEPRECATED bool "Enable deprecated sysfs features to support old userspace tools" depends on SYSFS diff --git a/kernel/sched/features.h b/kernel/sched/features.h index b634151ce286..55e461055332 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -73,4 +73,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true) * Energy aware scheduling. Use platform energy model to guide scheduling * decisions optimizing for energy efficiency. */ +#ifdef CONFIG_DEFAULT_USE_ENERGY_AWARE +SCHED_FEAT(ENERGY_AWARE, true) +#else SCHED_FEAT(ENERGY_AWARE, false) +#endif From 55b6a243332cde20882ba331a429d4ce5ec93ae7 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Tue, 26 Jan 2016 11:10:38 +0000 Subject: [PATCH 0395/3220] BACKPORT: arm64: kernel: implement ACPI parking protocol The SBBR and ACPI specifications allow ACPI based systems that do not implement PSCI (eg systems with no EL3) to boot through the ACPI parking protocol specification[1]. This patch implements the ACPI parking protocol CPU operations, and adds code that eases parsing the parking protocol data structures to the ARM64 SMP initialization carried out at the same time as cpus enumeration. To wake up the CPUs from the parked state, this patch implements a wakeup IPI for ARM64 (ie arch_send_wakeup_ipi_mask()) that mirrors the ARM one, so that a specific IPI is sent for wake-up purposes in order to distinguish it from other IPI sources. Given the current ACPI MADT parsing API, the patch implements a glue layer that helps pass the MADT GICC data structure from SMP initialization code to the parking protocol implementation, somewhat overriding the CPU operations interfaces. This is to avoid creating a completely transparent DT/ACPI CPU operations layer that would require creating opaque structure handling for CPUs data (DT represents CPUs through DT nodes, ACPI through static MADT table entries), which seems overkill given that ACPI on ARM64 mandates only two booting protocols (PSCI and parking protocol), so there is no need for further protocol additions.
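Condensed sketch of the resulting boot flow (interfaces as declared in this patch; the parking-protocol cpu_ops added in acpi_parking_protocol.c tie them together, and cpu/processor are assumed from the caller's context):

	/* At MADT parse time: record the GICC parking mailbox for the cpu. */
	acpi_set_mailbox_entry(cpu, processor);

	/* At SMP boot time: write the entry point and kick the parked cpu. */
	arch_send_wakeup_ipi_mask(cpumask_of(cpu));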
Based on the original work by Mark Salter [1] https://acpica.org/sites/acpica/files/MP%20Startup%20for%20ARM%20platforms.docx Signed-off-by: Lorenzo Pieralisi Tested-by: Loc Ho Cc: Will Deacon Cc: Hanjun Guo Cc: Sudeep Holla Cc: Mark Rutland Cc: Mark Salter Cc: Al Stone [catalin.marinas@arm.com: Added WARN_ONCE(!acpi_parking_protocol_valid() on the IPI] Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 5e89c55e4ed81d7abb1ce8828db35fa389dc0e90) Signed-off-by: Jeff Vander Stoep Change-Id: Ie9a872bff124ca6b3551c812df768cc378658bcc --- arch/arm64/Kconfig | 9 ++ arch/arm64/include/asm/acpi.h | 19 ++- arch/arm64/include/asm/hardirq.h | 2 +- arch/arm64/include/asm/smp.h | 9 ++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/acpi_parking_protocol.c | 153 ++++++++++++++++++++++ arch/arm64/kernel/cpu_ops.c | 27 +++- arch/arm64/kernel/smp.c | 28 ++++ 8 files changed, 242 insertions(+), 6 deletions(-) create mode 100644 arch/arm64/kernel/acpi_parking_protocol.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ac3dbd933064..1405a1bcc26d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -740,6 +740,15 @@ endmenu menu "Boot options" +config ARM64_ACPI_PARKING_PROTOCOL + bool "Enable support for the ARM64 ACPI parking protocol" + depends on ACPI + help + Enable support for the ARM64 ACPI parking protocol. If disabled + the kernel will not allow booting through the ARM64 ACPI parking + protocol even if the corresponding data is present in the ACPI + MADT table. + config CMDLINE string "Default kernel command string" default "" diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index caafd63b8092..aee323b13802 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -87,9 +87,26 @@ void __init acpi_init_cpus(void); static inline void acpi_init_cpus(void) { } #endif /* CONFIG_ACPI */ +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL +bool acpi_parking_protocol_valid(int cpu); +void __init +acpi_set_mailbox_entry(int cpu, struct acpi_madt_generic_interrupt *processor); +#else +static inline bool acpi_parking_protocol_valid(int cpu) { return false; } +static inline void +acpi_set_mailbox_entry(int cpu, struct acpi_madt_generic_interrupt *processor) +{} +#endif + static inline const char *acpi_get_enable_method(int cpu) { - return acpi_psci_present() ? 
"psci" : NULL; + if (acpi_psci_present()) + return "psci"; + + if (acpi_parking_protocol_valid(cpu)) + return "parking-protocol"; + + return NULL; } #ifdef CONFIG_ACPI_APEI diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h index a57601f9d17c..8740297dac77 100644 --- a/arch/arm64/include/asm/hardirq.h +++ b/arch/arm64/include/asm/hardirq.h @@ -20,7 +20,7 @@ #include #include -#define NR_IPI 5 +#define NR_IPI 6 typedef struct { unsigned int __softirq_pending; diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index d9c3d6a6100a..2013a4dc5124 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -64,6 +64,15 @@ extern void secondary_entry(void); extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL +extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask); +#else +static inline void arch_send_wakeup_ipi_mask(const struct cpumask *mask) +{ + BUILD_BUG(); +} +#endif + extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 474691f8b13a..c4e2f70c0aa0 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -41,6 +41,7 @@ arm64-obj-$(CONFIG_EFI) += efi.o efi-entry.stub.o arm64-obj-$(CONFIG_PCI) += pci.o arm64-obj-$(CONFIG_ARMV8_DEPRECATED) += armv8_deprecated.o arm64-obj-$(CONFIG_ACPI) += acpi.o +arm64-obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o obj-y += $(arm64-obj-y) vdso/ obj-m += $(arm64-obj-m) diff --git a/arch/arm64/kernel/acpi_parking_protocol.c b/arch/arm64/kernel/acpi_parking_protocol.c new file mode 100644 index 000000000000..4b1e5a7a98da --- /dev/null +++ b/arch/arm64/kernel/acpi_parking_protocol.c @@ -0,0 +1,153 @@ +/* + * ARM64 ACPI Parking Protocol implementation + * + * Authors: Lorenzo Pieralisi + * Mark Salter + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +#include +#include + +#include + +struct cpu_mailbox_entry { + phys_addr_t mailbox_addr; + u8 version; + u8 gic_cpu_id; +}; + +static struct cpu_mailbox_entry cpu_mailbox_entries[NR_CPUS]; + +void __init acpi_set_mailbox_entry(int cpu, + struct acpi_madt_generic_interrupt *p) +{ + struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; + + cpu_entry->mailbox_addr = p->parked_address; + cpu_entry->version = p->parking_version; + cpu_entry->gic_cpu_id = p->cpu_interface_number; +} + +bool acpi_parking_protocol_valid(int cpu) +{ + struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; + + return cpu_entry->mailbox_addr && cpu_entry->version; +} + +static int acpi_parking_protocol_cpu_init(unsigned int cpu) +{ + pr_debug("%s: ACPI parked addr=%llx\n", __func__, + cpu_mailbox_entries[cpu].mailbox_addr); + + return 0; +} + +static int acpi_parking_protocol_cpu_prepare(unsigned int cpu) +{ + return 0; +} + +struct parking_protocol_mailbox { + __le32 cpu_id; + __le32 reserved; + __le64 entry_point; +}; + +static int acpi_parking_protocol_cpu_boot(unsigned int cpu) +{ + struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; + struct parking_protocol_mailbox __iomem *mailbox; + __le32 cpu_id; + + /* + * Map mailbox memory with attribute device nGnRE (ie ioremap - + * this deviates from the parking protocol specifications since + * the mailboxes are required to be mapped nGnRnE; the attribute + * discrepancy is harmless insofar as the protocol specification + * is concerned). + * If the mailbox is mistakenly allocated in the linear mapping + * by FW ioremap will fail since the mapping will be prevented + * by the kernel (it clashes with the linear mapping attributes + * specifications). + */ + mailbox = ioremap(cpu_entry->mailbox_addr, sizeof(*mailbox)); + if (!mailbox) + return -EIO; + + cpu_id = readl_relaxed(&mailbox->cpu_id); + /* + * Check if firmware has set-up the mailbox entry properly + * before kickstarting the respective cpu. + */ + if (cpu_id != ~0U) { + iounmap(mailbox); + return -ENXIO; + } + + /* + * We write the entry point and cpu id as LE regardless of the + * native endianness of the kernel. Therefore, any boot-loaders + * that read this address need to convert this address to the + * Boot-Loader's endianness before jumping. + */ + writeq_relaxed(__pa(secondary_entry), &mailbox->entry_point); + writel_relaxed(cpu_entry->gic_cpu_id, &mailbox->cpu_id); + + arch_send_wakeup_ipi_mask(cpumask_of(cpu)); + + iounmap(mailbox); + + return 0; +} + +static void acpi_parking_protocol_cpu_postboot(void) +{ + int cpu = smp_processor_id(); + struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; + struct parking_protocol_mailbox __iomem *mailbox; + __le64 entry_point; + + /* + * Map mailbox memory with attribute device nGnRE (ie ioremap - + * this deviates from the parking protocol specifications since + * the mailboxes are required to be mapped nGnRnE; the attribute + * discrepancy is harmless insofar as the protocol specification + * is concerned). + * If the mailbox is mistakenly allocated in the linear mapping + * by FW ioremap will fail since the mapping will be prevented + * by the kernel (it clashes with the linear mapping attributes + * specifications). + */ + mailbox = ioremap(cpu_entry->mailbox_addr, sizeof(*mailbox)); + if (!mailbox) + return; + + entry_point = readl_relaxed(&mailbox->entry_point); + /* + * Check if firmware has cleared the entry_point as expected + * by the protocol specification. 
+ */ + WARN_ON(entry_point); + + iounmap(mailbox); +} + +const struct cpu_operations acpi_parking_protocol_ops = { + .name = "parking-protocol", + .cpu_init = acpi_parking_protocol_cpu_init, + .cpu_prepare = acpi_parking_protocol_cpu_prepare, + .cpu_boot = acpi_parking_protocol_cpu_boot, + .cpu_postboot = acpi_parking_protocol_cpu_postboot +}; diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c index b6bd7d447768..c7cfb8fe06f9 100644 --- a/arch/arm64/kernel/cpu_ops.c +++ b/arch/arm64/kernel/cpu_ops.c @@ -25,19 +25,30 @@ #include extern const struct cpu_operations smp_spin_table_ops; +extern const struct cpu_operations acpi_parking_protocol_ops; extern const struct cpu_operations cpu_psci_ops; const struct cpu_operations *cpu_ops[NR_CPUS]; -static const struct cpu_operations *supported_cpu_ops[] __initconst = { +static const struct cpu_operations *dt_supported_cpu_ops[] __initconst = { &smp_spin_table_ops, &cpu_psci_ops, NULL, }; +static const struct cpu_operations *acpi_supported_cpu_ops[] __initconst = { +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL + &acpi_parking_protocol_ops, +#endif + &cpu_psci_ops, + NULL, +}; + static const struct cpu_operations * __init cpu_get_ops(const char *name) { - const struct cpu_operations **ops = supported_cpu_ops; + const struct cpu_operations **ops; + + ops = acpi_disabled ? dt_supported_cpu_ops : acpi_supported_cpu_ops; while (*ops) { if (!strcmp(name, (*ops)->name)) @@ -75,8 +86,16 @@ static const char *__init cpu_read_enable_method(int cpu) } } else { enable_method = acpi_get_enable_method(cpu); - if (!enable_method) - pr_err("Unsupported ACPI enable-method\n"); + if (!enable_method) { + /* + * In ACPI systems the boot CPU does not require + * checking the enable method since for some + * boot protocol (ie parking protocol) it need not + * be initialized. Don't warn spuriously. + */ + if (cpu != 0) + pr_err("Unsupported ACPI enable-method\n"); + } } return enable_method; diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 68e7f79630d4..24cb4f800033 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -70,6 +70,7 @@ enum ipi_msg_type { IPI_CPU_STOP, IPI_TIMER, IPI_IRQ_WORK, + IPI_WAKEUP }; /* @@ -443,6 +444,17 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor) /* map the logical cpu id to cpu MPIDR */ cpu_logical_map(cpu_count) = hwid; + /* + * Set-up the ACPI parking protocol cpu entries + * while initializing the cpu_logical_map to + * avoid parsing MADT entries multiple times for + * nothing (ie a valid cpu_logical_map entry should + * contain a valid parking protocol data set to + * initialize the cpu if the parking protocol is + * the only available enable method). 
+ */ + acpi_set_mailbox_entry(cpu_count, processor); + cpu_count++; } @@ -625,6 +637,7 @@ static const char *ipi_types[NR_IPI] __tracepoint_string = { S(IPI_CPU_STOP, "CPU stop interrupts"), S(IPI_TIMER, "Timer broadcast interrupts"), S(IPI_IRQ_WORK, "IRQ work interrupts"), + S(IPI_WAKEUP, "CPU wake-up interrupts"), }; static void smp_cross_call(const struct cpumask *target, unsigned int ipinr) @@ -668,6 +681,13 @@ void arch_send_call_function_single_ipi(int cpu) smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC); } +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL +void arch_send_wakeup_ipi_mask(const struct cpumask *mask) +{ + smp_cross_call(mask, IPI_WAKEUP); +} +#endif + #ifdef CONFIG_IRQ_WORK void arch_irq_work_raise(void) { @@ -745,6 +765,14 @@ void handle_IPI(int ipinr, struct pt_regs *regs) break; #endif +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL + case IPI_WAKEUP: + WARN_ONCE(!acpi_parking_protocol_valid(cpu), + "CPU%u: Wake-up IPI outside the ACPI parking protocol\n", + cpu); + break; +#endif + default: pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); break; From 0b0254ded773d1f241b1ae7c7fdb137a72f60911 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 27 Jan 2016 10:50:19 +0100 Subject: [PATCH 0396/3220] UPSTREAM: arm64: allow vmalloc regions to be set with set_memory_* The range of set_memory_* is currently restricted to the module address range because of difficulties in breaking down larger block sizes. vmalloc maps PAGE_SIZE pages so it is safe to use as well. Update the function ranges and add a comment explaining why the range is restricted the way it is. Suggested-by: Laura Abbott Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 95f5c80050ad723163aa80dc8bffd48ef4afc6d5) Signed-off-by: Jeff Vander Stoep Change-Id: I80117bfd93ffe90a8edc89cbf6f456d9423bcf73 --- arch/arm64/mm/pageattr.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 3571c7309c5e..29ee7d85ca4c 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -44,6 +45,7 @@ static int change_memory_common(unsigned long addr, int numpages, unsigned long end = start + size; int ret; struct page_change_data data; + struct vm_struct *area; if (!PAGE_ALIGNED(addr)) { start &= PAGE_MASK; @@ -51,10 +53,23 @@ static int change_memory_common(unsigned long addr, int numpages, WARN_ON_ONCE(1); } - if (start < MODULES_VADDR || start >= MODULES_END) - return -EINVAL; - - if (end < MODULES_VADDR || end >= MODULES_END) + /* + * Kernel VA mappings are always live, and splitting live section + * mappings into page mappings may cause TLB conflicts. This means + * we have to ensure that changing the permission bits of the range + * we are operating on does not result in such splitting. + * + * Let's restrict ourselves to mappings created by vmalloc (or vmap). + * Those are guaranteed to consist entirely of page mappings, and + * splitting is never needed. + * + * So check whether the [addr, addr + size) interval is entirely + * covered by precisely one VM area that has the VM_ALLOC flag set. 
+ */ + area = find_vm_area((void *)addr); + if (!area || + end > (unsigned long)area->addr + area->size || + !(area->flags & VM_ALLOC)) return -EINVAL; data.set_mask = set_mask; From 58ffa07081e1aabd092257eb6f2fb718dcd3d486 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Feb 2016 12:46:23 +0000 Subject: [PATCH 0397/3220] UPSTREAM: arm64: prefetch: don't provide spin_lock_prefetch with LSE The LSE atomics rely on us not dirtying data at L1 if we can avoid it, otherwise many of the potential scalability benefits are lost. This patch replaces spin_lock_prefetch with a nop when the LSE atomics are in use, so that users don't shoot themselves in the foot by causing needless coherence traffic at L1. Signed-off-by: Will Deacon Tested-by: Andrew Pinski Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit cd5e10bdf3795d22f10787bb1991c43798c885d5) Signed-off-by: Jeff Vander Stoep Change-Id: Ib8bc3f38d9306c13e017139ae4f2a7c8d6b61e18 --- arch/arm64/include/asm/processor.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 4acb7ca94fcd..31b76fce4477 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -29,6 +29,7 @@ #include +#include #include #include #include @@ -177,9 +178,11 @@ static inline void prefetchw(const void *ptr) } #define ARCH_HAS_SPINLOCK_PREFETCH -static inline void spin_lock_prefetch(const void *x) +static inline void spin_lock_prefetch(const void *ptr) { - prefetchw(x); + asm volatile(ARM64_LSE_ATOMIC_INSN( + "prfm pstl1strm, %a0", + "nop") : : "p" (ptr)); } #define HAVE_ARCH_PICK_MMAP_LAYOUT From b5924f376a221ffa0ee64a4410aff54416c2fd2d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Feb 2016 12:46:24 +0000 Subject: [PATCH 0398/3220] UPSTREAM: arm64: prefetch: add alternative pattern for CPUs without a prefetcher Most CPUs have a hardware prefetcher which generally performs better without explicit prefetch instructions issued by software, however some CPUs (e.g. Cavium ThunderX) rely solely on explicit prefetch instructions. This patch adds an alternative pattern (ARM64_HAS_NO_HW_PREFETCH) to allow our library code to make use of explicit prefetch instructions during things like copy routines only when the CPU does not have the capability to perform the prefetching itself. 
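The MIDR helpers this patch factors out are easy to sanity-check in isolation. Below is a self-contained userspace rendition of MIDR_IS_CPU_MODEL_RANGE() and the ThunderX pass 1.x/2.x test added in the next hunks; the MIDR field layout matches the kernel macros shown below, while the sample values in main() are illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* MIDR_EL1 layout: [31:24] implementer, [23:20] variant,
 * [19:16] architecture, [15:4] part number, [3:0] revision. */
#define MIDR_REV_MASK   0x0000000fu
#define MIDR_VAR_SHIFT  20
#define MIDR_VAR_MASK   0x00f00000u
#define MIDR_MODEL_MASK 0xff0ffff0u   /* implementer | arch | partnum */

#define MIDR_MODEL(imp, part) \
	(((uint32_t)(imp) << 24) | (0xfu << 16) | ((uint32_t)(part) << 4))

#define MIDR_THUNDERX MIDR_MODEL(0x43, 0x0a1)   /* Cavium ThunderX */

static bool midr_in_model_range(uint32_t midr, uint32_t model,
				uint32_t rv_min, uint32_t rv_max)
{
	uint32_t rv = midr & (MIDR_REV_MASK | MIDR_VAR_MASK);

	return (midr & MIDR_MODEL_MASK) == model &&
	       rv >= rv_min && rv <= rv_max;
}

static bool has_no_hw_prefetch(uint32_t midr)
{
	/* variant 0..1, any revision: ThunderX pass 1.x and 2.x */
	return midr_in_model_range(midr, MIDR_THUNDERX, 0,
				   (1u << MIDR_VAR_SHIFT) | MIDR_REV_MASK);
}

int main(void)
{
	printf("pass 1.0: %d\n", has_no_hw_prefetch(0x430f0a10)); /* 1 */
	printf("pass 2.1: %d\n", has_no_hw_prefetch(0x431f0a11)); /* 1 */
	printf("pass 3.0: %d\n", has_no_hw_prefetch(0x432f0a10)); /* 0 */
	return 0;
}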
Signed-off-by: Will Deacon Tested-by: Andrew Pinski Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit d5370f754875460662abe8561388e019d90dd0c4) Signed-off-by: Jeff Vander Stoep Change-Id: Ie33097c9b7786922ff8c457d16515c3188b8e94b --- arch/arm64/include/asm/cpufeature.h | 3 ++- arch/arm64/include/asm/cputype.h | 17 ++++++++++++++++- arch/arm64/kernel/cpu_errata.c | 18 +++--------------- arch/arm64/kernel/cpufeature.c | 17 +++++++++++++++++ 4 files changed, 38 insertions(+), 17 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 8f271b83f910..8d56bd8550dc 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -30,8 +30,9 @@ #define ARM64_HAS_LSE_ATOMICS 5 #define ARM64_WORKAROUND_CAVIUM_23154 6 #define ARM64_WORKAROUND_834220 7 +#define ARM64_HAS_NO_HW_PREFETCH 8 -#define ARM64_NCAPS 8 +#define ARM64_NCAPS 9 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 1a5949364ed0..7540284a17fe 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -57,11 +57,22 @@ #define MIDR_IMPLEMENTOR(midr) \ (((midr) & MIDR_IMPLEMENTOR_MASK) >> MIDR_IMPLEMENTOR_SHIFT) -#define MIDR_CPU_PART(imp, partnum) \ +#define MIDR_CPU_MODEL(imp, partnum) \ (((imp) << MIDR_IMPLEMENTOR_SHIFT) | \ (0xf << MIDR_ARCHITECTURE_SHIFT) | \ ((partnum) << MIDR_PARTNUM_SHIFT)) +#define MIDR_CPU_MODEL_MASK (MIDR_IMPLEMENTOR_MASK | MIDR_PARTNUM_MASK | \ + MIDR_ARCHITECTURE_MASK) + +#define MIDR_IS_CPU_MODEL_RANGE(midr, model, rv_min, rv_max) \ +({ \ + u32 _model = (midr) & MIDR_CPU_MODEL_MASK; \ + u32 rv = (midr) & (MIDR_REVISION_MASK | MIDR_VARIANT_MASK); \ + \ + _model == (model) && rv >= (rv_min) && rv <= (rv_max); \ + }) + #define ARM_CPU_IMP_ARM 0x41 #define ARM_CPU_IMP_APM 0x50 #define ARM_CPU_IMP_CAVIUM 0x43 @@ -75,6 +86,10 @@ #define CAVIUM_CPU_PART_THUNDERX 0x0A1 +#define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) +#define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) +#define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) + #ifndef __ASSEMBLY__ /* diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index feb6b4efa641..e6bc988e8dbf 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -21,24 +21,12 @@ #include #include -#define MIDR_CORTEX_A53 MIDR_CPU_PART(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) -#define MIDR_CORTEX_A57 MIDR_CPU_PART(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) -#define MIDR_THUNDERX MIDR_CPU_PART(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) - -#define CPU_MODEL_MASK (MIDR_IMPLEMENTOR_MASK | MIDR_PARTNUM_MASK | \ - MIDR_ARCHITECTURE_MASK) - static bool __maybe_unused is_affected_midr_range(const struct arm64_cpu_capabilities *entry) { - u32 midr = read_cpuid_id(); - - if ((midr & CPU_MODEL_MASK) != entry->midr_model) - return false; - - midr &= MIDR_REVISION_MASK | MIDR_VARIANT_MASK; - - return (midr >= entry->midr_range_min && midr <= entry->midr_range_max); + return MIDR_IS_CPU_MODEL_RANGE(read_cpuid_id(), entry->midr_model, + entry->midr_range_min, + entry->midr_range_max); } #define MIDR_RANGE(model, min, max) \ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 5c90aa490a2b..3615d7d7c9af 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -621,6 +621,18 @@ static bool 
has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry) return has_sre; } +static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry) +{ + u32 midr = read_cpuid_id(); + u32 rv_min, rv_max; + + /* Cavium ThunderX pass 1.x and 2.x */ + rv_min = 0; + rv_max = (1 << MIDR_VARIANT_SHIFT) | MIDR_REVISION_MASK; + + return MIDR_IS_CPU_MODEL_RANGE(midr, MIDR_THUNDERX, rv_min, rv_max); +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -651,6 +663,11 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .min_field_value = 2, }, #endif /* CONFIG_AS_LSE && CONFIG_ARM64_LSE_ATOMICS */ + { + .desc = "Software prefetching using PRFM", + .capability = ARM64_HAS_NO_HW_PREFETCH, + .matches = has_no_hw_prefetch, + }, {}, }; From 297bf83fa4e7804c28c6d950f39dbd3860552cb9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Feb 2016 12:46:25 +0000 Subject: [PATCH 0399/3220] UPSTREAM: arm64: lib: improve copy_page to deal with 128 bytes at a time We want to avoid lots of different copy_page implementations, settling for something that is "good enough" everywhere and hopefully easy to understand and maintain whilst we're at it. This patch reworks our copy_page implementation based on discussions with Cavium on the list and benchmarking on Cortex-A processors so that: - The loop is unrolled to copy 128 bytes per iteration - The reads are offset so that we read from the next 128-byte block in the same iteration that we store the previous block - Explicit prefetch instructions are removed for now, since they hurt performance on CPUs with hardware prefetching - The loop exit condition is calculated at the start of the loop Signed-off-by: Will Deacon Tested-by: Andrew Pinski Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 223e23e8aa26b0bb62c597637e77295e14f6a62c) Signed-off-by: Jeff Vander Stoep Change-Id: Icabd86bbecc60ad0d730ab796e33b8762cecb1fb --- arch/arm64/lib/copy_page.S | 46 +++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S index 512b9a7b980e..2534533ceb1d 100644 --- a/arch/arm64/lib/copy_page.S +++ b/arch/arm64/lib/copy_page.S @@ -27,20 +27,50 @@ * x1 - src */ ENTRY(copy_page) - /* Assume cache line size is 64 bytes. 
*/ - prfm pldl1strm, [x1, #64] -1: ldp x2, x3, [x1] + ldp x2, x3, [x1] ldp x4, x5, [x1, #16] ldp x6, x7, [x1, #32] ldp x8, x9, [x1, #48] - add x1, x1, #64 - prfm pldl1strm, [x1, #64] + ldp x10, x11, [x1, #64] + ldp x12, x13, [x1, #80] + ldp x14, x15, [x1, #96] + ldp x16, x17, [x1, #112] + + mov x18, #(PAGE_SIZE - 128) + add x1, x1, #128 +1: + subs x18, x18, #128 + + stnp x2, x3, [x0] + ldp x2, x3, [x1] + stnp x4, x5, [x0, #16] + ldp x4, x5, [x1, #16] + stnp x6, x7, [x0, #32] + ldp x6, x7, [x1, #32] + stnp x8, x9, [x0, #48] + ldp x8, x9, [x1, #48] + stnp x10, x11, [x0, #64] + ldp x10, x11, [x1, #64] + stnp x12, x13, [x0, #80] + ldp x12, x13, [x1, #80] + stnp x14, x15, [x0, #96] + ldp x14, x15, [x1, #96] + stnp x16, x17, [x0, #112] + ldp x16, x17, [x1, #112] + + add x0, x0, #128 + add x1, x1, #128 + + b.gt 1b + stnp x2, x3, [x0] stnp x4, x5, [x0, #16] stnp x6, x7, [x0, #32] stnp x8, x9, [x0, #48] - add x0, x0, #64 - tst x1, #(PAGE_SIZE - 1) - b.ne 1b + stnp x10, x11, [x0, #64] + stnp x12, x13, [x0, #80] + stnp x14, x15, [x0, #96] + stnp x16, x17, [x0, #112] + ret ENDPROC(copy_page) From 6e1f8e3c0fb86a90fe98667cfa1be71957534c66 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Tue, 2 Feb 2016 12:46:26 +0000 Subject: [PATCH 0400/3220] UPSTREAM: arm64: lib: patch in prfm for copy_page if requested On ThunderX T88 pass 1 and pass 2, there is no hardware prefetching so we need to patch in explicit software prefetching instructions Prefetching improves this code by 60% over the original code and 2x over the code without prefetching for the affected hardware using the benchmark code at https://github.com/apinski-cavium/copy_page_benchmark Signed-off-by: Andrew Pinski Signed-off-by: Will Deacon Tested-by: Andrew Pinski Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 60e0a09db24adc8809696307e5d97cc4ba7cb3e0) Signed-off-by: Jeff Vander Stoep Change-Id: I3821a4d3a7b6fd68b4b0aca31478ec960e4e5172 --- arch/arm64/lib/copy_page.S | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S index 2534533ceb1d..4c1e700840b6 100644 --- a/arch/arm64/lib/copy_page.S +++ b/arch/arm64/lib/copy_page.S @@ -18,6 +18,8 @@ #include #include #include +#include +#include /* * Copy a page from src to dest (both are page aligned) @@ -27,6 +29,15 @@ * x1 - src */ ENTRY(copy_page) +alternative_if_not ARM64_HAS_NO_HW_PREFETCH + nop + nop +alternative_else + # Prefetch two cache lines ahead. + prfm pldl1strm, [x1, #128] + prfm pldl1strm, [x1, #256] +alternative_endif + ldp x2, x3, [x1] ldp x4, x5, [x1, #16] ldp x6, x7, [x1, #32] @@ -41,6 +52,12 @@ ENTRY(copy_page) 1: subs x18, x18, #128 +alternative_if_not ARM64_HAS_NO_HW_PREFETCH + nop +alternative_else + prfm pldl1strm, [x1, #384] +alternative_endif + stnp x2, x3, [x0] ldp x2, x3, [x1] stnp x4, x5, [x0, #16] From f54fba19ed7891dc214085f55d73990f19e9c756 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 10 Feb 2016 10:07:30 +0000 Subject: [PATCH 0401/3220] UPSTREAM: arm64: prefetch: add missing #include for spin_lock_prefetch As of 52e662326e1e ("arm64: prefetch: don't provide spin_lock_prefetch with LSE"), spin_lock_prefetch is patched at runtime when the LSE atomics are in use. This relies on the ARM64_LSE_ATOMIC_INSN macro to drive the alternatives framework, but that macro is only available via asm/lse.h, which isn't explicitly included in processor.h. 
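Returning briefly to the copy_page() rework a couple of patches back, before the build-failure details that follow: the assembly is a classic software pipeline, priming 128 bytes worth of registers and then storing the previous block while loading the next. A plain-C analogue of the loop structure may make it clearer (illustrative only; the real routine runs on page-aligned buffers with ldp/stnp register pairs):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096
#define BLK 128   /* bytes copied per iteration, as in the unrolled loop */

static void copy_page_sketch(unsigned char *dst, const unsigned char *src)
{
	unsigned char blk[BLK];          /* stands in for registers x2..x17 */
	long left = PAGE_SIZE - BLK;

	memcpy(blk, src, BLK);           /* prime the pipeline */
	src += BLK;
	do {
		left -= BLK;
		memcpy(dst, blk, BLK);   /* store the previous block... */
		memcpy(blk, src, BLK);   /* ...while fetching the next */
		dst += BLK;
		src += BLK;
	} while (left > 0);
	memcpy(dst, blk, BLK);           /* drain the final block */
}

int main(void)
{
	static unsigned char s[PAGE_SIZE], d[PAGE_SIZE];

	for (int i = 0; i < PAGE_SIZE; i++)
		s[i] = (unsigned char)i;
	copy_page_sketch(d, s);
	printf("match: %d\n", memcmp(s, d, PAGE_SIZE) == 0);
	return 0;
}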
Consequently, drivers can run into build failures such as:

 In file included from include/linux/prefetch.h:14:0,
 from drivers/net/ethernet/intel/i40e/i40e_txrx.c:27:
 arch/arm64/include/asm/processor.h: In function 'spin_lock_prefetch':
 arch/arm64/include/asm/processor.h:183:15: error: expected string
 literal before 'ARM64_LSE_ATOMIC_INSN'
 asm volatile(ARM64_LSE_ATOMIC_INSN(

This patch adds the missing include and gets things building again.

Reported-by: kbuild test robot
Signed-off-by: Will Deacon
Signed-off-by: Catalin Marinas
Bug: 30369029
Patchset: kaslr-arm64-4.4
(cherry picked from commit afb83cc3f0e4f86ea0e1cc3db7a90f58f1abd4d5)
Signed-off-by: Jeff Vander Stoep
Change-Id: I9a9e6b920f96b060781a01bff50a0ca8db78cf7f
---
 arch/arm64/include/asm/processor.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 31b76fce4477..5bb1d763d17a 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include

From a8a3d94dbb8ef81a8710b4fec7dfd1a07f2647f9 Mon Sep 17 00:00:00 2001
From: Laura Abbott
Date: Fri, 5 Feb 2016 16:24:46 -0800
Subject: [PATCH 0402/3220] UPSTREAM: arm64: Drop alloc function from create_mapping

create_mapping is only used in fixmap_remap_fdt. All the create_mapping
calls need to happen on existing translation table pages without
additional allocations. Rather than have an alloc function be called
and fail, just set it to NULL and catch its use. Also change the name
to create_mapping_noalloc to better capture what exactly is going on.

Reviewed-by: Ard Biesheuvel
Reviewed-by: Mark Rutland
Signed-off-by: Laura Abbott
Signed-off-by: Catalin Marinas
Bug: 30369029
Patchset: kaslr-arm64-4.4
(cherry picked from commit 132233a759580f5ce9b1bfaac9073e47d03c460d)
Signed-off-by: Jeff Vander Stoep
Change-Id: I48db9adae540e79d47328b4045f26f22ecebc482
---
 arch/arm64/mm/mmu.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 6f5770a1b140..b4788d3e1551 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -116,7 +116,9 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr,
 pte_t *pte;

 if (pmd_none(*pmd) || pmd_sect(*pmd)) {
- phys_addr_t pte_phys = pgtable_alloc();
+ phys_addr_t pte_phys;
+ BUG_ON(!pgtable_alloc);
+ pte_phys = pgtable_alloc();
 pte = pte_set_fixmap(pte_phys);
 if (pmd_sect(*pmd))
 split_pmd(pmd, pte);
@@ -158,7 +160,9 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end,
 * Check for initial section mappings in the pgd/pud and remove them.
*/ if (pud_none(*pud) || pud_sect(*pud)) { - phys_addr_t pmd_phys = pgtable_alloc(); + phys_addr_t pmd_phys; + BUG_ON(!pgtable_alloc); + pmd_phys = pgtable_alloc(); pmd = pmd_set_fixmap(pmd_phys); if (pud_sect(*pud)) { /* @@ -223,7 +227,9 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long next; if (pgd_none(*pgd)) { - phys_addr_t pud_phys = pgtable_alloc(); + phys_addr_t pud_phys; + BUG_ON(!pgtable_alloc); + pud_phys = pgtable_alloc(); __pgd_populate(pgd, pud_phys, PUD_TYPE_TABLE); } BUG_ON(pgd_bad(*pgd)); @@ -312,7 +318,12 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, init_pgd(pgd_offset_raw(pgdir, virt), phys, virt, size, prot, alloc); } -static void __init create_mapping(phys_addr_t phys, unsigned long virt, +/* + * This function can only be used to modify existing table entries, + * without allocating new levels of table. Note that this permits the + * creation of new section or page entries. + */ +static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot) { if (virt < VMALLOC_START) { @@ -321,7 +332,7 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, return; } __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, - early_pgtable_alloc); + NULL); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, @@ -678,7 +689,7 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) /* * Make sure that the FDT region can be mapped without the need to * allocate additional translation table pages, so that it is safe - * to call create_mapping() this early. + * to call create_mapping_noalloc() this early. * * On 64k pages, the FDT will be mapped using PTEs, so we need to * be in the same PMD as the rest of the fixmap. @@ -694,8 +705,8 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) dt_virt = (void *)dt_virt_base + offset; /* map the first chunk so we can read the size from the header */ - create_mapping(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base, - SWAPPER_BLOCK_SIZE, prot); + create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), + dt_virt_base, SWAPPER_BLOCK_SIZE, prot); if (fdt_check_header(dt_virt) != 0) return NULL; @@ -705,7 +716,7 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) return NULL; if (offset + size > SWAPPER_BLOCK_SIZE) - create_mapping(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base, + create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base, round_up(offset + size, SWAPPER_BLOCK_SIZE), prot); memblock_reserve(dt_phys, size); From 644210f3f890d1e5481378d28d150d1c5f4191fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Penttil=C3=A4?= Date: Tue, 26 Jan 2016 15:47:25 +0000 Subject: [PATCH 0403/3220] UPSTREAM: arm64: mm: avoid calling apply_to_page_range on empty range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling apply_to_page_range with an empty range results in a BUG_ON from the core code. This can be triggered by trying to load the st_drv module with CONFIG_DEBUG_SET_MODULE_RONX enabled: kernel BUG at mm/memory.c:1874! 
Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
Modules linked in:
CPU: 3 PID: 1764 Comm: insmod Not tainted 4.5.0-rc1+ #2
Hardware name: ARM Juno development board (r0) (DT)
task: ffffffc9763b8000 ti: ffffffc975af8000 task.ti: ffffffc975af8000
PC is at apply_to_page_range+0x2cc/0x2d0
LR is at change_memory_common+0x80/0x108

This patch fixes the issue by making change_memory_common (called by
the set_memory_* functions) a NOP when numpages == 0, therefore avoiding
the erroneous call to apply_to_page_range and bringing us into line with
x86 and s390.

Cc:
Reviewed-by: Laura Abbott
Acked-by: David Rientjes
Signed-off-by: Mika Penttilä
Signed-off-by: Will Deacon
Bug: 30369029
Patchset: kaslr-arm64-4.4
(cherry picked from commit 57adec866c0440976c96a4b8f5b59fb411b1cacb)
Signed-off-by: Jeff Vander Stoep
Change-Id: Ia107d3b324cc8237f669778a7c9c3abae8637501
---
 arch/arm64/mm/pageattr.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 29ee7d85ca4c..0795c3a36d8f 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -72,6 +72,9 @@ static int change_memory_common(unsigned long addr, int numpages,
 !(area->flags & VM_ALLOC))
 return -EINVAL;

+ if (!numpages)
+ return 0;
+
 data.set_mask = set_mask;
 data.clear_mask = clear_mask;

From 664e159255fe7afc89145b4eed47b370d01489fa Mon Sep 17 00:00:00 2001
From: Laura Abbott
Date: Fri, 5 Feb 2016 16:24:47 -0800
Subject: [PATCH 0404/3220] UPSTREAM: arm64: Add support for ARCH_SUPPORTS_DEBUG_PAGEALLOC

ARCH_SUPPORTS_DEBUG_PAGEALLOC provides a hook to map and unmap pages
for debugging purposes. This requires memory be mapped with PAGE_SIZE
mappings since breaking down larger mappings at runtime will lead to
TLB conflicts. Check if debug_pagealloc is enabled at runtime and if
so, map everything with PAGE_SIZE pages. Implement the functions to
actually map/unmap the pages at runtime.

Reviewed-by: Ard Biesheuvel
Reviewed-by: Mark Rutland
Tested-by: Mark Rutland
Signed-off-by: Laura Abbott
[catalin.marinas@arm.com: static annotation block_mappings_allowed() and #ifdef]
Signed-off-by: Catalin Marinas
Bug: 30369029
Patchset: kaslr-arm64-4.4
(cherry picked from commit 83863f25e4b8214e994ef8b5647aad614d74b45d)
Signed-off-by: Jeff Vander Stoep
Change-Id: Ia9b9bc5fcc1938bafbc2930f08d27973cfc3d038
---
 arch/arm64/Kconfig | 3 +++
 arch/arm64/mm/mmu.c | 26 +++++++++++++++++++++--
 arch/arm64/mm/pageattr.c | 46 +++++++++++++++++++++++++++++++---------
 3 files changed, 63 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1405a1bcc26d..49988da6f342 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -541,6 +541,9 @@ config HOTPLUG_CPU
 source kernel/Kconfig.preempt
 source kernel/Kconfig.hz

+config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ def_bool y
+
 config ARCH_HAS_HOLES_MEMORYMODEL
 def_bool y if SPARSEMEM

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index b4788d3e1551..8d0e49d2cba9 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -149,6 +149,26 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd)
 } while (pmd++, i++, i < PTRS_PER_PMD);
 }

+#ifdef CONFIG_DEBUG_PAGEALLOC
+static bool block_mappings_allowed(phys_addr_t (*pgtable_alloc)(void))
+{
+
+ /*
+ * If debug_page_alloc is enabled we must map the linear map
+ * using pages. However, other mappings created by
+ * create_mapping_noalloc must use sections in some cases. Allow
+ * sections to be used in those cases, where no pgtable_alloc
+ * function is provided.
+ */
+ return !pgtable_alloc || !debug_pagealloc_enabled();
+}
+#else
+static bool block_mappings_allowed(phys_addr_t (*pgtable_alloc)(void))
+{
+ return true;
+}
+#endif
+
 static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end,
 phys_addr_t phys, pgprot_t prot,
 phys_addr_t (*pgtable_alloc)(void))
@@ -181,7 +201,8 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end,
 do {
 next = pmd_addr_end(addr, end);
 /* try section mapping first */
- if (((addr | next | phys) & ~SECTION_MASK) == 0) {
+ if (((addr | next | phys) & ~SECTION_MASK) == 0 &&
+ block_mappings_allowed(pgtable_alloc)) {
 pmd_t old_pmd =*pmd;
 set_pmd(pmd, __pmd(phys |
 pgprot_val(mk_sect_prot(prot))));
@@ -241,7 +262,8 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end,
 /*
 * For 4K granule only, attempt to put down a 1GB block
 */
- if (use_1G_block(addr, next, phys)) {
+ if (use_1G_block(addr, next, phys) &&
+ block_mappings_allowed(pgtable_alloc)) {
 pud_t old_pud = *pud;
 set_pud(pud, __pud(phys |
 pgprot_val(mk_sect_prot(prot))));
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 0795c3a36d8f..ca6d268e3313 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -37,14 +37,31 @@ static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr,
 return 0;
 }

+/*
+ * This function assumes that the range is mapped with PAGE_SIZE pages.
+ */
+static int __change_memory_common(unsigned long start, unsigned long size,
+ pgprot_t set_mask, pgprot_t clear_mask)
+{
+ struct page_change_data data;
+ int ret;
+
+ data.set_mask = set_mask;
+ data.clear_mask = clear_mask;
+
+ ret = apply_to_page_range(&init_mm, start, size, change_page_range,
+ &data);
+
+ flush_tlb_kernel_range(start, start + size);
+ return ret;
+}
+
 static int change_memory_common(unsigned long addr, int numpages,
 pgprot_t set_mask, pgprot_t clear_mask)
 {
 unsigned long start = addr;
 unsigned long size = PAGE_SIZE*numpages;
 unsigned long end = start + size;
- int ret;
- struct page_change_data data;
 struct vm_struct *area;

 if (!PAGE_ALIGNED(addr)) {
@@ -75,14 +92,7 @@ static int change_memory_common(unsigned long addr, int numpages,
 if (!numpages)
 return 0;

- data.set_mask = set_mask;
- data.clear_mask = clear_mask;
-
- ret = apply_to_page_range(&init_mm, start, size, change_page_range,
- &data);
-
- flush_tlb_kernel_range(start, end);
- return ret;
+ return __change_memory_common(start, size, set_mask, clear_mask);
 }

 int set_memory_ro(unsigned long addr, int numpages)
@@ -114,3 +124,19 @@ int set_memory_x(unsigned long addr, int numpages)
 __pgprot(PTE_PXN));
 }
 EXPORT_SYMBOL_GPL(set_memory_x);
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ unsigned long addr = (unsigned long) page_address(page);
+
+ if (enable)
+ __change_memory_common(addr, PAGE_SIZE * numpages,
+ __pgprot(PTE_VALID),
+ __pgprot(0));
+ else
+ __change_memory_common(addr, PAGE_SIZE * numpages,
+ __pgprot(0),
+ __pgprot(PTE_VALID));
+}
+#endif

From 4d4c2259aa3270aceb9c55562afdf586c8328eee Mon Sep 17 00:00:00 2001
From: Laura Abbott
Date: Fri, 5 Feb 2016 16:24:48 -0800
Subject: [PATCH 0405/3220] UPSTREAM: arm64: ptdump: Indicate whether memory should be faulting

With CONFIG_DEBUG_PAGEALLOC, pages do not have the valid bit set when
free in the buddy allocator. Add an indication to the page table
dumping code that the valid bit is not set, 'F' for fault, to make
this easier to understand.
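Taken together, __kernel_map_pages() from the previous patch and the 'F' marker added here reduce to toggling PTE_VALID across a run of last-level entries. A toy userspace model of that toggle (hypothetical names; the real code also flushes the TLB range):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MODEL_PTE_VALID ((uint64_t)1 << 0)  /* stand-in for the real PTE_VALID */

static uint64_t ptes[8];  /* toy last-level page table */

/* Models what __kernel_map_pages() does via __change_memory_common(). */
static void kernel_map_pages_model(unsigned int first, int numpages, bool enable)
{
	for (int i = 0; i < numpages; i++) {
		if (enable)
			ptes[first + i] |= MODEL_PTE_VALID;   /* set_mask */
		else
			ptes[first + i] &= ~MODEL_PTE_VALID;  /* clear_mask */
	}
	/* the kernel version follows this with a TLB range flush */
}

int main(void)
{
	for (int i = 0; i < 8; i++)  /* pretend all pages start mapped */
		ptes[i] = MODEL_PTE_VALID | ((uint64_t)(i + 1) << 12);

	kernel_map_pages_model(2, 3, false);  /* pages freed to the allocator */

	for (int i = 0; i < 8; i++)
		printf("pte[%d]: %s\n", i,
		       (ptes[i] & MODEL_PTE_VALID) ? "valid" : "F (faults)");
	return 0;
}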
Reviewed-by: Ard Biesheuvel Reviewed-by: Mark Rutland Tested-by: Mark Rutland Signed-off-by: Laura Abbott Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit d7e9d59494a9a5d83274f5af2148b82ca22dff3f) Signed-off-by: Jeff Vander Stoep Change-Id: I7e8200549e22cb3a01b5e827bf12b872b9cefc58 --- arch/arm64/mm/dump.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 0adbebbc2803..0841b2bf0e6a 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -90,6 +90,11 @@ struct prot_bits { static const struct prot_bits pte_bits[] = { { + .mask = PTE_VALID, + .val = PTE_VALID, + .set = " ", + .clear = "F", + }, { .mask = PTE_USER, .val = PTE_USER, .set = "USR", From 3aa18069d0038c0475ca598d2a9e9533ab6c28f7 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 5 Feb 2016 15:50:18 -0800 Subject: [PATCH 0406/3220] UPSTREAM: arm64: ubsan: select ARCH_HAS_UBSAN_SANITIZE_ALL To enable UBSAN on arm64, ARCH_HAS_UBSAN_SANITIZE_ALL need to be selected. Basic kernel bootup test is passed on arm64 with CONFIG_UBSAN_SANITIZE_ALL enabled. Signed-off-by: Yang Shi Acked-by: Andrey Ryabinin Tested-by: Mark Rutland Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit f0b7f8a4b44657386273a67179dd901c81cd11a6) Signed-off-by: Jeff Vander Stoep Change-Id: I640f50e70e3562bf1caf2ce164d6ed5640e041f6 --- arch/arm64/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 49988da6f342..71ce487d7b78 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -13,6 +13,7 @@ config ARM64 select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_COMPAT_IPC_PARSE_VERSION select ARCH_WANT_FRAME_POINTERS + select ARCH_HAS_UBSAN_SANITIZE_ALL select ARM_AMBA select ARM_ARCH_TIMER select ARM_GIC From a275e00e2833f58216ed58baae6de790a4dc27cb Mon Sep 17 00:00:00 2001 From: David Brown Date: Wed, 10 Feb 2016 13:52:22 -0800 Subject: [PATCH 0407/3220] UPSTREAM: arm64: vdso: Mark vDSO code as read-only Although the arm64 vDSO is cleanly separated by code/data with the code being read-only in userspace mappings, the code page is still writable from the kernel. There have been exploits (such as http://itszn.com/blog/?p=21) that take advantage of this on x86 to go from a bad kernel write to full root. Prevent this specific exploit on arm64 by putting the vDSO code page in read-only memory as well. 
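The descriptor-table style used by the ptdump patch two entries back is what produces the permission strings in the before/after listings below. In miniature (bit positions here are illustrative placeholders, not the real arm64 PTE layout):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct prot_bits {
	uint64_t mask;
	uint64_t val;
	const char *set;    /* printed when (pte & mask) == val */
	const char *clear;  /* printed otherwise */
};

/* Illustrative bit positions; the real ones come from pgtable-hwdef.h. */
#define D_PTE_VALID  (1ull << 0)
#define D_PTE_USER   (1ull << 6)
#define D_PTE_RDONLY (1ull << 7)

static const struct prot_bits pte_bits[] = {
	{ D_PTE_VALID,  D_PTE_VALID,  " ",   "F"  },
	{ D_PTE_USER,   D_PTE_USER,   "USR", ""   },
	{ D_PTE_RDONLY, D_PTE_RDONLY, "ro",  "RW" },
};

static void dump_prot(uint64_t pte)
{
	for (size_t i = 0; i < sizeof(pte_bits) / sizeof(pte_bits[0]); i++) {
		const struct prot_bits *b = &pte_bits[i];

		printf("%s ", (pte & b->mask) == b->val ? b->set : b->clear);
	}
	putchar('\n');
}

int main(void)
{
	dump_prot(D_PTE_VALID | D_PTE_RDONLY);  /* mapped, kernel, read-only */
	dump_prot(0);                           /* freed page: leads with "F" */
	return 0;
}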
Before the change: [ 3.138366] vdso: 2 pages (1 code @ ffffffc000a71000, 1 data @ ffffffc000a70000) ---[ Kernel Mapping ]--- 0xffffffc000000000-0xffffffc000082000 520K RW NX SHD AF UXN MEM/NORMAL 0xffffffc000082000-0xffffffc000200000 1528K ro x SHD AF UXN MEM/NORMAL 0xffffffc000200000-0xffffffc000800000 6M ro x SHD AF BLK UXN MEM/NORMAL 0xffffffc000800000-0xffffffc0009b6000 1752K ro x SHD AF UXN MEM/NORMAL 0xffffffc0009b6000-0xffffffc000c00000 2344K RW NX SHD AF UXN MEM/NORMAL 0xffffffc000c00000-0xffffffc008000000 116M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc00c000000-0xffffffc07f000000 1840M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc800000000-0xffffffc840000000 1G RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc840000000-0xffffffc87ae00000 942M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc87ae00000-0xffffffc87ae70000 448K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87af80000-0xffffffc87af8a000 40K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87af8b000-0xffffffc87b000000 468K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87b000000-0xffffffc87fe00000 78M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc87fe00000-0xffffffc87ff50000 1344K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87ff90000-0xffffffc87ffa0000 64K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87fff0000-0xffffffc880000000 64K RW NX SHD AF UXN MEM/NORMAL After: [ 3.138368] vdso: 2 pages (1 code @ ffffffc0006de000, 1 data @ ffffffc000a74000) ---[ Kernel Mapping ]--- 0xffffffc000000000-0xffffffc000082000 520K RW NX SHD AF UXN MEM/NORMAL 0xffffffc000082000-0xffffffc000200000 1528K ro x SHD AF UXN MEM/NORMAL 0xffffffc000200000-0xffffffc000800000 6M ro x SHD AF BLK UXN MEM/NORMAL 0xffffffc000800000-0xffffffc0009b8000 1760K ro x SHD AF UXN MEM/NORMAL 0xffffffc0009b8000-0xffffffc000c00000 2336K RW NX SHD AF UXN MEM/NORMAL 0xffffffc000c00000-0xffffffc008000000 116M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc00c000000-0xffffffc07f000000 1840M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc800000000-0xffffffc840000000 1G RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc840000000-0xffffffc87ae00000 942M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc87ae00000-0xffffffc87ae70000 448K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87af80000-0xffffffc87af8a000 40K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87af8b000-0xffffffc87b000000 468K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87b000000-0xffffffc87fe00000 78M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc87fe00000-0xffffffc87ff50000 1344K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87ff90000-0xffffffc87ffa0000 64K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87fff0000-0xffffffc880000000 64K RW NX SHD AF UXN MEM/NORMAL Inspired by https://lkml.org/lkml/2016/1/19/494 based on work by the PaX Team, Brad Spengler, and Kees Cook. 
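Mechanically, the fix in the diff below just emits the vDSO image into .rodata, so the kernel maps its page without write permission (the "ro x" lines in the listings above). The same idea in a userspace sketch (uncommenting the write would fault):

#include <stdio.h>

/* 'const' objects land in .rodata, which the loader maps read-only;
 * this is the userspace analogue of moving the vdso.so .incbin out of
 * the writable data section and into .rodata. */
static const unsigned char blob[] = { 0xde, 0xad, 0xbe, 0xef };

int main(void)
{
	printf("blob at %p, first byte %#x\n", (const void *)blob, blob[0]);
	/* ((unsigned char *)blob)[0] = 0;  // would SIGSEGV: page is R/O */
	return 0;
}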
Signed-off-by: David Brown Acked-by: Will Deacon Acked-by: Ard Biesheuvel [catalin.marinas@arm.com: removed superfluous __PAGE_ALIGNED_DATA] Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 88d8a7994e564d209d4b2583496631c2357d386b) Signed-off-by: Jeff Vander Stoep Change-Id: I3fe4b48df8b27313ac61c947746805442757932c --- arch/arm64/kernel/vdso/vdso.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/kernel/vdso/vdso.S b/arch/arm64/kernel/vdso/vdso.S index 60c1db54b41a..82379a70ef03 100644 --- a/arch/arm64/kernel/vdso/vdso.S +++ b/arch/arm64/kernel/vdso/vdso.S @@ -21,9 +21,8 @@ #include #include - __PAGE_ALIGNED_DATA - .globl vdso_start, vdso_end + .section .rodata .balign PAGE_SIZE vdso_start: .incbin "arch/arm64/kernel/vdso/vdso.so" From de30ecfe314d3aca506ed45e0892a942ce523bd4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 15 Feb 2016 09:51:49 +0100 Subject: [PATCH 0408/3220] UPSTREAM: arm64: use local label prefixes for __reg_num symbols The __reg_num_xNN symbols that are used to implement the msr_s and mrs_s macros are recorded in the ELF metadata of each object file. This does not affect the size of the final binary, but it does clutter the output of tools like readelf, i.e., $ readelf -a vmlinux |grep -c __reg_num_x 50976 So let's use symbols with the .L prefix, these are strictly local, and don't end up in the object files. $ readelf -a vmlinux |grep -c __reg_num_x 0 Acked-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 7abc7d833c9eb16efc8a59239d3771a6e30be367) Signed-off-by: Jeff Vander Stoep Change-Id: Idefe9841ef7d1ddcc5161fc8de14153cfadaf4f3 --- arch/arm64/include/asm/sysreg.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index d48ab5b41f52..76907c94b11f 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -194,32 +194,32 @@ #ifdef __ASSEMBLY__ .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 - .equ __reg_num_x\num, \num + .equ .L__reg_num_x\num, \num .endr - .equ __reg_num_xzr, 31 + .equ .L__reg_num_xzr, 31 .macro mrs_s, rt, sreg - .inst 0xd5200000|(\sreg)|(__reg_num_\rt) + .inst 0xd5200000|(\sreg)|(.L__reg_num_\rt) .endm .macro msr_s, sreg, rt - .inst 0xd5000000|(\sreg)|(__reg_num_\rt) + .inst 0xd5000000|(\sreg)|(.L__reg_num_\rt) .endm #else asm( " .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" -" .equ __reg_num_x\\num, \\num\n" +" .equ .L__reg_num_x\\num, \\num\n" " .endr\n" -" .equ __reg_num_xzr, 31\n" +" .equ .L__reg_num_xzr, 31\n" "\n" " .macro mrs_s, rt, sreg\n" -" .inst 0xd5200000|(\\sreg)|(__reg_num_\\rt)\n" +" .inst 0xd5200000|(\\sreg)|(.L__reg_num_\\rt)\n" " .endm\n" "\n" " .macro msr_s, sreg, rt\n" -" .inst 0xd5000000|(\\sreg)|(__reg_num_\\rt)\n" +" .inst 0xd5000000|(\\sreg)|(.L__reg_num_\\rt)\n" " .endm\n" ); From ac3060aed4ea0173b093c474fbd8981c4e1dc274 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:46 +0000 Subject: [PATCH 0409/3220] UPSTREAM: arm64: cpufeature: Change read_cpuid() to use sysreg's mrs_s macro Older assemblers may not have support for newer feature registers. To get round this, sysreg.h provides a 'mrs_s' macro that takes a register encoding and generates the raw instruction. 
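Concretely, the word that mrs_s emits can be computed by hand. A small self-checking calculator, assuming the sys_reg() operand packing used elsewhere in this series, ((op0 & 3) << 19) | (op1 << 16) | (crn << 12) | (crm << 8) | (op2 << 5):

#include <stdint.h>
#include <stdio.h>

/* Assumed to match the kernel's sys_reg() packing of MRS/MSR operands. */
static uint32_t sys_reg(uint32_t op0, uint32_t op1, uint32_t crn,
			uint32_t crm, uint32_t op2)
{
	return ((op0 & 3) << 19) | (op1 << 16) | (crn << 12) |
	       (crm << 8) | (op2 << 5);
}

/* mrs_s: .inst 0xd5200000 | sreg | Rt  (Rt = 0..30, 31 = xzr) */
static uint32_t mrs_s(uint32_t sreg, uint32_t rt)
{
	return 0xd5200000u | sreg | rt;
}

int main(void)
{
	/* SYS_ID_AA64MMFR2_EL1 is sys_reg(3, 0, 0, 7, 2) per sysreg.h. */
	uint32_t insn = mrs_s(sys_reg(3, 0, 0, 7, 2), 0 /* x0 */);

	/* Expect 0xd5380740, i.e. "mrs x0, s3_0_c0_c7_2". */
	printf("insn = %#x\n", insn);
	return 0;
}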
Change read_cpuid() to use mrs_s in all cases so that new registers don't have to be a special case. Including sysreg.h means we need to move the include and definition of read_cpuid() after the #ifndef __ASSEMBLY__ to avoid syntax errors in vmlinux.lds. Signed-off-by: James Morse Acked-by: Mark Rutland Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 0f54b14e76f5302afe164dc911b049b5df836ff5) Signed-off-by: Jeff Vander Stoep Change-Id: Ic59c81a97b4585b3c0964f293ddee08f7cb594ac --- arch/arm64/include/asm/cpufeature.h | 2 +- arch/arm64/include/asm/cputype.h | 20 ++++++----- arch/arm64/kernel/cpufeature.c | 54 ++++++++++++++--------------- arch/arm64/kernel/cpuinfo.c | 50 +++++++++++++------------- arch/arm64/mm/context.c | 2 +- 5 files changed, 65 insertions(+), 63 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 8d56bd8550dc..8131abfabb0a 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -177,7 +177,7 @@ u64 read_system_reg(u32 id); static inline bool cpu_supports_mixed_endian_el0(void) { - return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1)); + return id_aa64mmfr0_mixed_endian_el0(read_cpuid(SYS_ID_AA64MMFR0_EL1)); } static inline bool system_supports_mixed_endian_el0(void) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 7540284a17fe..b3a83da152a7 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -32,12 +32,6 @@ #define MPIDR_AFFINITY_LEVEL(mpidr, level) \ ((mpidr >> MPIDR_LEVEL_SHIFT(level)) & MPIDR_LEVEL_MASK) -#define read_cpuid(reg) ({ \ - u64 __val; \ - asm("mrs %0, " #reg : "=r" (__val)); \ - __val; \ -}) - #define MIDR_REVISION_MASK 0xf #define MIDR_REVISION(midr) ((midr) & MIDR_REVISION_MASK) #define MIDR_PARTNUM_SHIFT 4 @@ -92,6 +86,14 @@ #ifndef __ASSEMBLY__ +#include + +#define read_cpuid(reg) ({ \ + u64 __val; \ + asm("mrs_s %0, " __stringify(reg) : "=r" (__val)); \ + __val; \ +}) + /* * The CPU ID never changes at run time, so we might as well tell the * compiler that it's constant. 
Use this function to read the CPU ID @@ -99,12 +101,12 @@ */ static inline u32 __attribute_const__ read_cpuid_id(void) { - return read_cpuid(MIDR_EL1); + return read_cpuid(SYS_MIDR_EL1); } static inline u64 __attribute_const__ read_cpuid_mpidr(void) { - return read_cpuid(MPIDR_EL1); + return read_cpuid(SYS_MPIDR_EL1); } static inline unsigned int __attribute_const__ read_cpuid_implementor(void) @@ -119,7 +121,7 @@ static inline unsigned int __attribute_const__ read_cpuid_part_number(void) static inline u32 __attribute_const__ read_cpuid_cachetype(void) { - return read_cpuid(CTR_EL0); + return read_cpuid(SYS_CTR_EL0); } #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 3615d7d7c9af..1ef10e784031 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -808,35 +808,35 @@ static inline void set_sys_caps_initialised(void) static u64 __raw_read_system_reg(u32 sys_id) { switch (sys_id) { - case SYS_ID_PFR0_EL1: return (u64)read_cpuid(ID_PFR0_EL1); - case SYS_ID_PFR1_EL1: return (u64)read_cpuid(ID_PFR1_EL1); - case SYS_ID_DFR0_EL1: return (u64)read_cpuid(ID_DFR0_EL1); - case SYS_ID_MMFR0_EL1: return (u64)read_cpuid(ID_MMFR0_EL1); - case SYS_ID_MMFR1_EL1: return (u64)read_cpuid(ID_MMFR1_EL1); - case SYS_ID_MMFR2_EL1: return (u64)read_cpuid(ID_MMFR2_EL1); - case SYS_ID_MMFR3_EL1: return (u64)read_cpuid(ID_MMFR3_EL1); - case SYS_ID_ISAR0_EL1: return (u64)read_cpuid(ID_ISAR0_EL1); - case SYS_ID_ISAR1_EL1: return (u64)read_cpuid(ID_ISAR1_EL1); - case SYS_ID_ISAR2_EL1: return (u64)read_cpuid(ID_ISAR2_EL1); - case SYS_ID_ISAR3_EL1: return (u64)read_cpuid(ID_ISAR3_EL1); - case SYS_ID_ISAR4_EL1: return (u64)read_cpuid(ID_ISAR4_EL1); - case SYS_ID_ISAR5_EL1: return (u64)read_cpuid(ID_ISAR4_EL1); - case SYS_MVFR0_EL1: return (u64)read_cpuid(MVFR0_EL1); - case SYS_MVFR1_EL1: return (u64)read_cpuid(MVFR1_EL1); - case SYS_MVFR2_EL1: return (u64)read_cpuid(MVFR2_EL1); + case SYS_ID_PFR0_EL1: return read_cpuid(SYS_ID_PFR0_EL1); + case SYS_ID_PFR1_EL1: return read_cpuid(SYS_ID_PFR1_EL1); + case SYS_ID_DFR0_EL1: return read_cpuid(SYS_ID_DFR0_EL1); + case SYS_ID_MMFR0_EL1: return read_cpuid(SYS_ID_MMFR0_EL1); + case SYS_ID_MMFR1_EL1: return read_cpuid(SYS_ID_MMFR1_EL1); + case SYS_ID_MMFR2_EL1: return read_cpuid(SYS_ID_MMFR2_EL1); + case SYS_ID_MMFR3_EL1: return read_cpuid(SYS_ID_MMFR3_EL1); + case SYS_ID_ISAR0_EL1: return read_cpuid(SYS_ID_ISAR0_EL1); + case SYS_ID_ISAR1_EL1: return read_cpuid(SYS_ID_ISAR1_EL1); + case SYS_ID_ISAR2_EL1: return read_cpuid(SYS_ID_ISAR2_EL1); + case SYS_ID_ISAR3_EL1: return read_cpuid(SYS_ID_ISAR3_EL1); + case SYS_ID_ISAR4_EL1: return read_cpuid(SYS_ID_ISAR4_EL1); + case SYS_ID_ISAR5_EL1: return read_cpuid(SYS_ID_ISAR4_EL1); + case SYS_MVFR0_EL1: return read_cpuid(SYS_MVFR0_EL1); + case SYS_MVFR1_EL1: return read_cpuid(SYS_MVFR1_EL1); + case SYS_MVFR2_EL1: return read_cpuid(SYS_MVFR2_EL1); - case SYS_ID_AA64PFR0_EL1: return (u64)read_cpuid(ID_AA64PFR0_EL1); - case SYS_ID_AA64PFR1_EL1: return (u64)read_cpuid(ID_AA64PFR0_EL1); - case SYS_ID_AA64DFR0_EL1: return (u64)read_cpuid(ID_AA64DFR0_EL1); - case SYS_ID_AA64DFR1_EL1: return (u64)read_cpuid(ID_AA64DFR0_EL1); - case SYS_ID_AA64MMFR0_EL1: return (u64)read_cpuid(ID_AA64MMFR0_EL1); - case SYS_ID_AA64MMFR1_EL1: return (u64)read_cpuid(ID_AA64MMFR1_EL1); - case SYS_ID_AA64ISAR0_EL1: return (u64)read_cpuid(ID_AA64ISAR0_EL1); - case SYS_ID_AA64ISAR1_EL1: return (u64)read_cpuid(ID_AA64ISAR1_EL1); + case SYS_ID_AA64PFR0_EL1: return 
read_cpuid(SYS_ID_AA64PFR0_EL1); + case SYS_ID_AA64PFR1_EL1: return read_cpuid(SYS_ID_AA64PFR0_EL1); + case SYS_ID_AA64DFR0_EL1: return read_cpuid(SYS_ID_AA64DFR0_EL1); + case SYS_ID_AA64DFR1_EL1: return read_cpuid(SYS_ID_AA64DFR0_EL1); + case SYS_ID_AA64MMFR0_EL1: return read_cpuid(SYS_ID_AA64MMFR0_EL1); + case SYS_ID_AA64MMFR1_EL1: return read_cpuid(SYS_ID_AA64MMFR1_EL1); + case SYS_ID_AA64ISAR0_EL1: return read_cpuid(SYS_ID_AA64ISAR0_EL1); + case SYS_ID_AA64ISAR1_EL1: return read_cpuid(SYS_ID_AA64ISAR1_EL1); - case SYS_CNTFRQ_EL0: return (u64)read_cpuid(CNTFRQ_EL0); - case SYS_CTR_EL0: return (u64)read_cpuid(CTR_EL0); - case SYS_DCZID_EL0: return (u64)read_cpuid(DCZID_EL0); + case SYS_CNTFRQ_EL0: return read_cpuid(SYS_CNTFRQ_EL0); + case SYS_CTR_EL0: return read_cpuid(SYS_CTR_EL0); + case SYS_DCZID_EL0: return read_cpuid(SYS_DCZID_EL0); default: BUG(); return 0; diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 212ae6361d8b..76df22272804 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -201,35 +201,35 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) { info->reg_cntfrq = arch_timer_get_cntfrq(); info->reg_ctr = read_cpuid_cachetype(); - info->reg_dczid = read_cpuid(DCZID_EL0); + info->reg_dczid = read_cpuid(SYS_DCZID_EL0); info->reg_midr = read_cpuid_id(); - info->reg_id_aa64dfr0 = read_cpuid(ID_AA64DFR0_EL1); - info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1); - info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1); - info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1); - info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); - info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); - info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1); - info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); + info->reg_id_aa64dfr0 = read_cpuid(SYS_ID_AA64DFR0_EL1); + info->reg_id_aa64dfr1 = read_cpuid(SYS_ID_AA64DFR1_EL1); + info->reg_id_aa64isar0 = read_cpuid(SYS_ID_AA64ISAR0_EL1); + info->reg_id_aa64isar1 = read_cpuid(SYS_ID_AA64ISAR1_EL1); + info->reg_id_aa64mmfr0 = read_cpuid(SYS_ID_AA64MMFR0_EL1); + info->reg_id_aa64mmfr1 = read_cpuid(SYS_ID_AA64MMFR1_EL1); + info->reg_id_aa64pfr0 = read_cpuid(SYS_ID_AA64PFR0_EL1); + info->reg_id_aa64pfr1 = read_cpuid(SYS_ID_AA64PFR1_EL1); - info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); - info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); - info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); - info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); - info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1); - info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1); - info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1); - info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1); - info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); - info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); - info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); - info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); - info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); + info->reg_id_dfr0 = read_cpuid(SYS_ID_DFR0_EL1); + info->reg_id_isar0 = read_cpuid(SYS_ID_ISAR0_EL1); + info->reg_id_isar1 = read_cpuid(SYS_ID_ISAR1_EL1); + info->reg_id_isar2 = read_cpuid(SYS_ID_ISAR2_EL1); + info->reg_id_isar3 = read_cpuid(SYS_ID_ISAR3_EL1); + info->reg_id_isar4 = read_cpuid(SYS_ID_ISAR4_EL1); + info->reg_id_isar5 = read_cpuid(SYS_ID_ISAR5_EL1); + info->reg_id_mmfr0 = read_cpuid(SYS_ID_MMFR0_EL1); + info->reg_id_mmfr1 = read_cpuid(SYS_ID_MMFR1_EL1); + info->reg_id_mmfr2 = read_cpuid(SYS_ID_MMFR2_EL1); + info->reg_id_mmfr3 = read_cpuid(SYS_ID_MMFR3_EL1); + info->reg_id_pfr0 = read_cpuid(SYS_ID_PFR0_EL1); + 
info->reg_id_pfr1 = read_cpuid(SYS_ID_PFR1_EL1); - info->reg_mvfr0 = read_cpuid(MVFR0_EL1); - info->reg_mvfr1 = read_cpuid(MVFR1_EL1); - info->reg_mvfr2 = read_cpuid(MVFR2_EL1); + info->reg_mvfr0 = read_cpuid(SYS_MVFR0_EL1); + info->reg_mvfr1 = read_cpuid(SYS_MVFR1_EL1); + info->reg_mvfr2 = read_cpuid(SYS_MVFR2_EL1); cpuinfo_detect_icache_policy(info); diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index e87f53ff5f58..7275628ba59f 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -187,7 +187,7 @@ switch_mm_fastpath: static int asids_init(void) { - int fld = cpuid_feature_extract_field(read_cpuid(ID_AA64MMFR0_EL1), 4); + int fld = cpuid_feature_extract_field(read_cpuid(SYS_ID_AA64MMFR0_EL1), 4); switch (fld) { default: From 8d6623808d9b75778dce5afba7fbbbd387eb4160 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:47 +0000 Subject: [PATCH 0410/3220] UPSTREAM: arm64: add ARMv8.2 id_aa64mmfr2 boiler plate ARMv8.2 adds a new feature register id_aa64mmfr2. This patch adds the cpu feature boiler plate used by the actual features in later patches. Signed-off-by: James Morse Reviewed-by: Suzuki K Poulose Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 406e308770a92bd33995b2e5b681e86358328bb0) Signed-off-by: Jeff Vander Stoep Change-Id: I51db326696ba1e18e9a4c667acbeb3e25e0b151e --- arch/arm64/include/asm/cpu.h | 1 + arch/arm64/include/asm/sysreg.h | 4 ++++ arch/arm64/kernel/cpufeature.c | 10 ++++++++++ arch/arm64/kernel/cpuinfo.c | 1 + 4 files changed, 16 insertions(+) diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index b5e9cee4b5f8..13a6103130cd 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -36,6 +36,7 @@ struct cpuinfo_arm64 { u64 reg_id_aa64isar1; u64 reg_id_aa64mmfr0; u64 reg_id_aa64mmfr1; + u64 reg_id_aa64mmfr2; u64 reg_id_aa64pfr0; u64 reg_id_aa64pfr1; diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 76907c94b11f..4bc8655529df 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -70,6 +70,7 @@ #define SYS_ID_AA64MMFR0_EL1 sys_reg(3, 0, 0, 7, 0) #define SYS_ID_AA64MMFR1_EL1 sys_reg(3, 0, 0, 7, 1) +#define SYS_ID_AA64MMFR2_EL1 sys_reg(3, 0, 0, 7, 2) #define SYS_CNTFRQ_EL0 sys_reg(3, 3, 14, 0, 0) #define SYS_CTR_EL0 sys_reg(3, 3, 0, 0, 1) @@ -135,6 +136,9 @@ #define ID_AA64MMFR1_VMIDBITS_SHIFT 4 #define ID_AA64MMFR1_HADBS_SHIFT 0 +/* id_aa64mmfr2 */ +#define ID_AA64MMFR2_UAO_SHIFT 4 + /* id_aa64dfr0 */ #define ID_AA64DFR0_CTX_CMPS_SHIFT 28 #define ID_AA64DFR0_WRPS_SHIFT 20 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 1ef10e784031..42918c797e8e 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -123,6 +123,11 @@ static struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { ARM64_FTR_END, }; +static struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { + ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_UAO_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static struct arm64_ftr_bits ftr_ctr[] = { U_ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RAO */ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 28, 3, 0), @@ -284,6 +289,7 @@ static struct arm64_ftr_reg arm64_ftr_regs[] = { /* Op1 = 0, CRn = 0, CRm = 7 */ ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0), ARM64_FTR_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1), + ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2), /* Op1 = 3, CRn = 0, CRm = 0 */ ARM64_FTR_REG(SYS_CTR_EL0, 
ftr_ctr), @@ -408,6 +414,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1); init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0); init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1); + init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2); init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0); init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); @@ -517,6 +524,8 @@ void update_cpu_features(int cpu, info->reg_id_aa64mmfr0, boot->reg_id_aa64mmfr0); taint |= check_update_ftr_reg(SYS_ID_AA64MMFR1_EL1, cpu, info->reg_id_aa64mmfr1, boot->reg_id_aa64mmfr1); + taint |= check_update_ftr_reg(SYS_ID_AA64MMFR2_EL1, cpu, + info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2); /* * EL3 is not our concern. @@ -831,6 +840,7 @@ static u64 __raw_read_system_reg(u32 sys_id) case SYS_ID_AA64DFR1_EL1: return read_cpuid(SYS_ID_AA64DFR0_EL1); case SYS_ID_AA64MMFR0_EL1: return read_cpuid(SYS_ID_AA64MMFR0_EL1); case SYS_ID_AA64MMFR1_EL1: return read_cpuid(SYS_ID_AA64MMFR1_EL1); + case SYS_ID_AA64MMFR2_EL1: return read_cpuid(SYS_ID_AA64MMFR2_EL1); case SYS_ID_AA64ISAR0_EL1: return read_cpuid(SYS_ID_AA64ISAR0_EL1); case SYS_ID_AA64ISAR1_EL1: return read_cpuid(SYS_ID_AA64ISAR1_EL1); diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 76df22272804..966fbd52550b 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -210,6 +210,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_aa64isar1 = read_cpuid(SYS_ID_AA64ISAR1_EL1); info->reg_id_aa64mmfr0 = read_cpuid(SYS_ID_AA64MMFR0_EL1); info->reg_id_aa64mmfr1 = read_cpuid(SYS_ID_AA64MMFR1_EL1); + info->reg_id_aa64mmfr2 = read_cpuid(SYS_ID_AA64MMFR2_EL1); info->reg_id_aa64pfr0 = read_cpuid(SYS_ID_AA64PFR0_EL1); info->reg_id_aa64pfr1 = read_cpuid(SYS_ID_AA64PFR1_EL1); From dba95a8428b15b2cff37f5b07e4539ccb82a330e Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:48 +0000 Subject: [PATCH 0411/3220] UPSTREAM: arm64: kernel: Add support for User Access Override 'User Access Override' is a new ARMv8.2 feature which allows the unprivileged load and store instructions to be overridden to behave in the normal way. This patch converts {get,put}_user() and friends to use ldtr*/sttr* instructions - so that they can only access EL0 memory, then enables UAO when fs==KERNEL_DS so that these functions can access kernel memory. 
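As a rough model of the resulting set_fs() behaviour (a stand-alone C sketch, not the kernel code; the enum, the pstate_uao flag and model_set_fs() are stand-ins for the real definitions):

    /* With UAO, ldtr/sttr behave as unprivileged accesses unless
     * PSTATE.UAO is set, so switching to KERNEL_DS must enable the
     * override for get_user()/put_user() to reach kernel memory. */
    enum fs_model { MODEL_USER_DS, MODEL_KERNEL_DS };
    static int pstate_uao;                  /* models the PSTATE.UAO bit */

    static void model_set_fs(enum fs_model fs)
    {
            pstate_uao = (fs == MODEL_KERNEL_DS);
    }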
This allows user space's read/write permissions to be checked against the page tables, instead of testing addr<TASK_SIZE, then using the kernel's read/write permissions. [catalin.marinas@arm.com: move uao_thread_switch() above dsb()] Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 57f4959bad0a154aeca125b7d38d1d9471a12422) Signed-off-by: Jeff Vander Stoep Change-Id: I1a6a74a1f33b92d54368bd99387b55cf62930903 --- arch/arm64/Kconfig | 21 ++++++++ arch/arm64/include/asm/alternative.h | 72 ++++++++++++++++++++++++++++ arch/arm64/include/asm/cpufeature.h | 3 +- arch/arm64/include/asm/processor.h | 1 + arch/arm64/include/asm/sysreg.h | 3 ++ arch/arm64/include/asm/thread_info.h | 6 +++ arch/arm64/include/asm/uaccess.h | 44 ++++++++++++----- arch/arm64/include/uapi/asm/ptrace.h | 1 + arch/arm64/kernel/cpufeature.c | 11 +++++ arch/arm64/kernel/process.c | 19 ++++++++ arch/arm64/lib/clear_user.S | 8 ++-- arch/arm64/lib/copy_from_user.S | 8 ++-- arch/arm64/lib/copy_in_user.S | 16 +++---- arch/arm64/lib/copy_to_user.S | 8 ++-- arch/arm64/mm/fault.c | 31 +++++++++--- 15 files changed, 213 insertions(+), 39 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 71ce487d7b78..ae51e1b18337 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -740,6 +740,27 @@ config ARM64_LSE_ATOMICS endmenu +config ARM64_UAO + bool "Enable support for User Access Override (UAO)" + default y + help + User Access Override (UAO; part of the ARMv8.2 Extensions) + causes the 'unprivileged' variant of the load/store instructions to + be overridden to be privileged. + + This option changes get_user() and friends to use the 'unprivileged' + variant of the load/store instructions. This ensures that user-space + really did have access to the supplied memory. When addr_limit is + set to kernel memory the UAO bit will be set, allowing privileged + access to kernel memory. + + Choosing this option will cause copy_to_user() et al to use user-space + memory permissions. + + The feature is detected at runtime; the kernel will use the + regular load/store instructions if the CPU does not implement the + feature. + endmenu menu "Boot options" diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index e4962f04201e..a9fc24ec1aa9 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -1,6 +1,8 @@ #ifndef __ASM_ALTERNATIVE_H #define __ASM_ALTERNATIVE_H +#include + #ifndef __ASSEMBLY__ #include @@ -63,6 +65,8 @@ void apply_alternatives(void *start, size_t length); #else +#include + .macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len .word \orig_offset - . .word \alt_offset - . @@ -136,6 +140,74 @@ void apply_alternatives(void *start, size_t length); alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) +/* + * Generate the assembly for UAO alternatives with exception table entries. + * This is complicated as there are no post-increment or pair versions of the + * unprivileged instructions, and USER() only works for single instructions.
+ */ +#ifdef CONFIG_ARM64_UAO + .macro uao_ldp l, reg1, reg2, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: ldp \reg1, \reg2, [\addr], \post_inc; +8889: nop; + nop; + alternative_else + ldtr \reg1, [\addr]; + ldtr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; + alternative_endif + + .section __ex_table,"a"; + .align 3; + .quad 8888b,\l; + .quad 8889b,\l; + .previous; + .endm + + .macro uao_stp l, reg1, reg2, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: stp \reg1, \reg2, [\addr], \post_inc; +8889: nop; + nop; + alternative_else + sttr \reg1, [\addr]; + sttr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; + alternative_endif + + .section __ex_table,"a"; + .align 3; + .quad 8888b,\l; + .quad 8889b,\l; + .previous + .endm + + .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: \inst \reg, [\addr], \post_inc; + nop; + alternative_else + \alt_inst \reg, [\addr]; + add \addr, \addr, \post_inc; + alternative_endif + + .section __ex_table,"a"; + .align 3; + .quad 8888b,\l; + .previous + .endm +#else + .macro uao_ldp l, reg1, reg2, addr, post_inc + USER(\l, ldp \reg1, \reg2, [\addr], \post_inc) + .endm + .macro uao_stp l, reg1, reg2, addr, post_inc + USER(\l, stp \reg1, \reg2, [\addr], \post_inc) + .endm + .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc + USER(\l, \inst \reg, [\addr], \post_inc) + .endm +#endif + #endif /* __ASSEMBLY__ */ /* diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 8131abfabb0a..a5df7cde616b 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -31,8 +31,9 @@ #define ARM64_WORKAROUND_CAVIUM_23154 6 #define ARM64_WORKAROUND_834220 7 #define ARM64_HAS_NO_HW_PREFETCH 8 +#define ARM64_HAS_UAO 9 -#define ARM64_NCAPS 9 +#define ARM64_NCAPS 10 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 5bb1d763d17a..cef1cf398356 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -191,5 +191,6 @@ static inline void spin_lock_prefetch(const void *ptr) #endif void cpu_enable_pan(void *__unused); +void cpu_enable_uao(void *__unused); #endif /* __ASM_PROCESSOR_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 4bc8655529df..b9fd8ec79033 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -77,9 +77,12 @@ #define SYS_DCZID_EL0 sys_reg(3, 3, 0, 0, 7) #define REG_PSTATE_PAN_IMM sys_reg(0, 0, 4, 0, 4) +#define REG_PSTATE_UAO_IMM sys_reg(0, 0, 4, 0, 3) #define SET_PSTATE_PAN(x) __inst_arm(0xd5000000 | REG_PSTATE_PAN_IMM |\ (!!x)<<8 | 0x1f) +#define SET_PSTATE_UAO(x) __inst_arm(0xd5000000 | REG_PSTATE_UAO_IMM |\ + (!!x)<<8 | 0x1f) /* SCTLR_EL1 */ #define SCTLR_EL1_CP15BEN (0x1 << 5) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index abd64bd1f6d9..eba8db6838af 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -85,6 +85,12 @@ static inline struct thread_info *current_thread_info(void) return (struct thread_info *)sp_el0; } +/* Access struct thread_info of another thread */ +static inline struct thread_info *get_thread_info(unsigned long thread_stack) +{ + return (struct thread_info *)(thread_stack & ~(THREAD_SIZE - 1)); +} + #define thread_saved_pc(tsk) \ ((unsigned long)(tsk->thread.cpu_context.pc)) #define thread_saved_sp(tsk) \ diff --git 
a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 2a44b9795532..a227ba376f23 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -64,6 +64,16 @@ extern int fixup_exception(struct pt_regs *regs); static inline void set_fs(mm_segment_t fs) { current_thread_info()->addr_limit = fs; + + /* + * Enable/disable UAO so that copy_to_user() etc can access + * kernel memory with the unprivileged instructions. + */ + if (IS_ENABLED(CONFIG_ARM64_UAO) && fs == KERNEL_DS) + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO)); + else + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO, + CONFIG_ARM64_UAO)); } #define segment_eq(a, b) ((a) == (b)) @@ -113,9 +123,10 @@ static inline void set_fs(mm_segment_t fs) * The "__xxx_error" versions set the third argument to -EFAULT if an error * occurs, and leave it unchanged on success. */ -#define __get_user_asm(instr, reg, x, addr, err) \ +#define __get_user_asm(instr, alt_instr, reg, x, addr, err, feature) \ asm volatile( \ - "1: " instr " " reg "1, [%2]\n" \ + "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \ + alt_instr " " reg "1, [%2]\n", feature) \ "2:\n" \ " .section .fixup, \"ax\"\n" \ " .align 2\n" \ @@ -138,16 +149,20 @@ do { \ CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ - __get_user_asm("ldrb", "%w", __gu_val, (ptr), (err)); \ + __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 2: \ - __get_user_asm("ldrh", "%w", __gu_val, (ptr), (err)); \ + __get_user_asm("ldrh", "ldtrh", "%w", __gu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 4: \ - __get_user_asm("ldr", "%w", __gu_val, (ptr), (err)); \ + __get_user_asm("ldr", "ldtr", "%w", __gu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 8: \ - __get_user_asm("ldr", "%", __gu_val, (ptr), (err)); \ + __get_user_asm("ldr", "ldtr", "%", __gu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ default: \ BUILD_BUG(); \ @@ -181,9 +196,10 @@ do { \ ((x) = 0, -EFAULT); \ }) -#define __put_user_asm(instr, reg, x, addr, err) \ +#define __put_user_asm(instr, alt_instr, reg, x, addr, err, feature) \ asm volatile( \ - "1: " instr " " reg "1, [%2]\n" \ + "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \ + alt_instr " " reg "1, [%2]\n", feature) \ "2:\n" \ " .section .fixup,\"ax\"\n" \ " .align 2\n" \ @@ -205,16 +221,20 @@ do { \ CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ - __put_user_asm("strb", "%w", __pu_val, (ptr), (err)); \ + __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 2: \ - __put_user_asm("strh", "%w", __pu_val, (ptr), (err)); \ + __put_user_asm("strh", "sttrh", "%w", __pu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 4: \ - __put_user_asm("str", "%w", __pu_val, (ptr), (err)); \ + __put_user_asm("str", "sttr", "%w", __pu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 8: \ - __put_user_asm("str", "%", __pu_val, (ptr), (err)); \ + __put_user_asm("str", "sttr", "%", __pu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ default: \ BUILD_BUG(); \ diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 208db3df135a..b5c3933ed441 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -45,6 +45,7 @@ #define PSR_A_BIT 0x00000100 #define PSR_D_BIT 0x00000200 #define PSR_PAN_BIT 0x00400000 +#define PSR_UAO_BIT 0x00800000 #define PSR_Q_BIT 0x08000000 #define PSR_V_BIT 0x10000000 #define PSR_C_BIT 0x20000000 diff --git 
a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 42918c797e8e..ae22edf9d3c9 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -677,6 +677,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_NO_HW_PREFETCH, .matches = has_no_hw_prefetch, }, +#ifdef CONFIG_ARM64_UAO + { + .desc = "User Access Override", + .capability = ARM64_HAS_UAO, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64MMFR2_EL1, + .field_pos = ID_AA64MMFR2_UAO_SHIFT, + .min_field_value = 1, + .enable = cpu_enable_uao, + }, +#endif /* CONFIG_ARM64_UAO */ {}, }; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 5f6244526e31..4431e8364881 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -46,6 +46,7 @@ #include #include +#include #include #include #include @@ -346,6 +347,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, } else { memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h; + if (IS_ENABLED(CONFIG_ARM64_UAO) && + cpus_have_cap(ARM64_HAS_UAO)) + childregs->pstate |= PSR_UAO_BIT; p->thread.cpu_context.x19 = stack_start; p->thread.cpu_context.x20 = stk_sz; } @@ -374,6 +378,20 @@ static void tls_thread_switch(struct task_struct *next) : : "r" (tpidr), "r" (tpidrro)); } +/* Restore the UAO state depending on next's addr_limit */ +static void uao_thread_switch(struct task_struct *next) +{ + unsigned long next_sp = next->thread.cpu_context.sp; + + if (IS_ENABLED(CONFIG_ARM64_UAO) && + get_thread_info(next_sp)->addr_limit == KERNEL_DS) + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO, + CONFIG_ARM64_UAO)); + else + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO, + CONFIG_ARM64_UAO)); +} + /* * Thread switching. 
*/ @@ -386,6 +404,7 @@ struct task_struct *__switch_to(struct task_struct *prev, tls_thread_switch(next); hw_breakpoint_thread_switch(next); contextidr_thread_switch(next); + uao_thread_switch(next); /* * Complete any pending TLB or cache maintenance on this CPU in case diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index a9723c71c52b..3f950b677c07 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -39,20 +39,20 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ subs x1, x1, #8 b.mi 2f 1: -USER(9f, str xzr, [x0], #8 ) +uao_user_alternative 9f, str, sttr, xzr, x0, 8 subs x1, x1, #8 b.pl 1b 2: adds x1, x1, #4 b.mi 3f -USER(9f, str wzr, [x0], #4 ) +uao_user_alternative 9f, str, sttr, wzr, x0, 4 sub x1, x1, #4 3: adds x1, x1, #2 b.mi 4f -USER(9f, strh wzr, [x0], #2 ) +uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 sub x1, x1, #2 4: adds x1, x1, #1 b.mi 5f -USER(9f, strb wzr, [x0] ) +uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 281e75db899a..be09fda09986 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -34,7 +34,7 @@ */ .macro ldrb1 ptr, regB, val - USER(9998f, ldrb \ptr, [\regB], \val) + uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val .endm .macro strb1 ptr, regB, val @@ -42,7 +42,7 @@ .endm .macro ldrh1 ptr, regB, val - USER(9998f, ldrh \ptr, [\regB], \val) + uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val .endm .macro strh1 ptr, regB, val @@ -50,7 +50,7 @@ .endm .macro ldr1 ptr, regB, val - USER(9998f, ldr \ptr, [\regB], \val) + uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val .endm .macro str1 ptr, regB, val @@ -58,7 +58,7 @@ .endm .macro ldp1 ptr, regB, regC, val - USER(9998f, ldp \ptr, \regB, [\regC], \val) + uao_ldp 9998f, \ptr, \regB, \regC, \val .endm .macro stp1 ptr, regB, regC, val diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index 81c8fc93c100..feaad1520dc1 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -35,35 +35,35 @@ * x0 - bytes not copied */ .macro ldrb1 ptr, regB, val - USER(9998f, ldrb \ptr, [\regB], \val) + uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val .endm .macro strb1 ptr, regB, val - USER(9998f, strb \ptr, [\regB], \val) + uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val .endm .macro ldrh1 ptr, regB, val - USER(9998f, ldrh \ptr, [\regB], \val) + uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val .endm .macro strh1 ptr, regB, val - USER(9998f, strh \ptr, [\regB], \val) + uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val .endm .macro ldr1 ptr, regB, val - USER(9998f, ldr \ptr, [\regB], \val) + uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val .endm .macro str1 ptr, regB, val - USER(9998f, str \ptr, [\regB], \val) + uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val .endm .macro ldp1 ptr, regB, regC, val - USER(9998f, ldp \ptr, \regB, [\regC], \val) + uao_ldp 9998f, \ptr, \regB, \regC, \val .endm .macro stp1 ptr, regB, regC, val - USER(9998f, stp \ptr, \regB, [\regC], \val) + uao_stp 9998f, \ptr, \regB, \regC, \val .endm end .req x5 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index db4d187de61f..b4b9178aa36b 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -37,7 +37,7 @@ 
.endm .macro strb1 ptr, regB, val - USER(9998f, strb \ptr, [\regB], \val) + uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val .endm .macro ldrh1 ptr, regB, val @@ -45,7 +45,7 @@ .endm .macro strh1 ptr, regB, val - USER(9998f, strh \ptr, [\regB], \val) + uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val .endm .macro ldr1 ptr, regB, val @@ -53,7 +53,7 @@ .endm .macro str1 ptr, regB, val - USER(9998f, str \ptr, [\regB], \val) + uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val .endm .macro ldp1 ptr, regB, regC, val @@ -61,7 +61,7 @@ .endm .macro stp1 ptr, regB, regC, val - USER(9998f, stp \ptr, \regB, [\regC], \val) + uao_stp 9998f, \ptr, \regB, \regC, \val .endm end .req x5 diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 92ddac1e8ca2..820d47353cf0 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -192,6 +192,14 @@ out: return fault; } +static inline int permission_fault(unsigned int esr) +{ + unsigned int ec = (esr & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT; + unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; + + return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM); +} + static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs) { @@ -225,12 +233,10 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - /* - * PAN bit set implies the fault happened in kernel space, but not - * in the arch's user access functions. - */ - if (IS_ENABLED(CONFIG_ARM64_PAN) && (regs->pstate & PSR_PAN_BIT)) - goto no_context; + if (permission_fault(esr) && (addr < USER_DS)) { + if (!search_exception_tables(regs->pc)) + panic("Accessing user space memory outside uaccess.h routines"); + } /* * As per x86, we may deadlock here. However, since the kernel only @@ -561,3 +567,16 @@ void cpu_enable_pan(void *__unused) config_sctlr_el1(SCTLR_EL1_SPAN, 0); } #endif /* CONFIG_ARM64_PAN */ + +#ifdef CONFIG_ARM64_UAO +/* + * Kernel threads have fs=KERNEL_DS by default, and don't need to call + * set_fs(), devtmpfs in particular relies on this behaviour. + * We need to enable the feature at runtime (instead of adding it to + * PSR_MODE_EL1h) as the feature may not be implemented by the cpu. + */ +void cpu_enable_uao(void *__unused) +{ + asm(SET_PSTATE_UAO(1)); +} +#endif /* CONFIG_ARM64_UAO */ From f7a32945ec495b6b88c251bd240b0d54f6636f06 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:49 +0000 Subject: [PATCH 0412/3220] UPSTREAM: arm64: cpufeature: Test 'matches' pointer to find the end of the list CPU feature code uses the desc field as a test to find the end of the list, this means every entry must have a description. This generates noise for entries in the list that aren't really features, but combinations of them. e.g. > CPU features: detected feature: Privileged Access Never > CPU features: detected feature: PAN and not UAO These combination features are needed for corner cases with alternatives, where cpu features interact. Change all walkers of the arm64_features[] and arm64_hwcaps[] lists to test 'matches' not 'desc', and only print 'desc' if it is non-NULL. 
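The new termination rule is easy to model in plain C; this self-contained sketch uses simplified stand-ins for struct arm64_cpu_capabilities and the real list walkers:

    #include <stdio.h>

    struct cap {
            const char *desc;               /* may be NULL for composite features */
            int (*matches)(const struct cap *);
    };

    static int always(const struct cap *c) { return 1; }

    static void update_caps(const struct cap *caps, const char *info)
    {
            for (int i = 0; caps[i].matches; i++) {  /* test 'matches', not 'desc' */
                    if (!caps[i].matches(&caps[i]))
                            continue;
                    if (caps[i].desc)       /* only named features are printed */
                            printf("%s %s\n", info, caps[i].desc);
            }
    }

    int main(void)
    {
            const struct cap caps[] = {
                    { "Privileged Access Never", always },
                    { NULL, always },       /* composite feature: detected, never printed */
                    { NULL, NULL },         /* list terminator */
            };
            update_caps(caps, "CPU features: detected feature:");
            return 0;
    }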
Signed-off-by: James Morse Reviewed-by : Suzuki K Poulose Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 644c2ae198412c956700e55a2acf80b2541f6aa5) Signed-off-by: Jeff Vander Stoep Change-Id: I4500bb7c547e2e67ea56e242a8621df539f6fd67 --- arch/arm64/kernel/cpufeature.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index ae22edf9d3c9..9cc8186cd14b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -771,7 +771,7 @@ static void __init setup_cpu_hwcaps(void) int i; const struct arm64_cpu_capabilities *hwcaps = arm64_hwcaps; - for (i = 0; hwcaps[i].desc; i++) + for (i = 0; hwcaps[i].matches; i++) if (hwcaps[i].matches(&hwcaps[i])) cap_set_hwcap(&hwcaps[i]); } @@ -781,11 +781,11 @@ void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, { int i; - for (i = 0; caps[i].desc; i++) { + for (i = 0; caps[i].matches; i++) { if (!caps[i].matches(&caps[i])) continue; - if (!cpus_have_cap(caps[i].capability)) + if (!cpus_have_cap(caps[i].capability) && caps[i].desc) pr_info("%s %s\n", info, caps[i].desc); cpus_set_cap(caps[i].capability); } @@ -800,7 +800,7 @@ enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) { int i; - for (i = 0; caps[i].desc; i++) + for (i = 0; caps[i].matches; i++) if (caps[i].enable && cpus_have_cap(caps[i].capability)) on_each_cpu(caps[i].enable, NULL, true); } @@ -907,7 +907,7 @@ void verify_local_cpu_capabilities(void) return; caps = arm64_features; - for (i = 0; caps[i].desc; i++) { + for (i = 0; caps[i].matches; i++) { if (!cpus_have_cap(caps[i].capability) || !caps[i].sys_reg) continue; /* @@ -920,7 +920,7 @@ void verify_local_cpu_capabilities(void) caps[i].enable(NULL); } - for (i = 0, caps = arm64_hwcaps; caps[i].desc; i++) { + for (i = 0, caps = arm64_hwcaps; caps[i].matches; i++) { if (!cpus_have_hwcap(&caps[i])) continue; if (!feature_matches(__raw_read_system_reg(caps[i].sys_reg), &caps[i])) From 612055f383a023e12042f75d6d951e76ee4c5748 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:50 +0000 Subject: [PATCH 0413/3220] BACKPORT: arm64: kernel: Don't toggle PAN on systems with UAO If a CPU supports both Privileged Access Never (PAN) and User Access Override (UAO), we don't need to disable/re-enable PAN round all copy_to_user() like calls. UAO alternatives cause these calls to use the 'unprivileged' load/store instructions, which are overridden to be the privileged kind when fs==KERNEL_DS. This patch changes the copy_to_user() calls to have their PAN toggling depend on a new composite 'feature' ARM64_ALT_PAN_NOT_UAO. If both features are detected, PAN will be enabled, but the copy_to_user() alternatives will not be applied. This means PAN will be enabled all the time for these functions. If only PAN is detected, the toggling will be enabled as normal. This will save the time taken to disable/re-enable PAN, and allow us to catch copy_to_user() accesses that occur with fs==KERNEL_DS. Futex and swp-emulation code continue to hang their PAN toggling code on ARM64_HAS_PAN. 
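The composite capability itself reduces to a predicate over the two detected features; sketched here with hypothetical have_pan/have_uao flags standing in for the cpufeature API:

    static int have_pan, have_uao;          /* filled in by CPU feature detection */

    /* PAN toggling around copy_to_user() et al is only needed when the
     * CPU implements PAN but not UAO. */
    static int alt_pan_not_uao(void)
    {
            return have_pan && !have_uao;
    }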
Signed-off-by: James Morse Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 705441960033e66b63524521f153fbb28c99ddbd) Signed-off-by: Jeff Vander Stoep Change-Id: I3fa35ebacaf401e1344e76932a26fdd14a8a3cdb --- arch/arm64/include/asm/cpufeature.h | 3 ++- arch/arm64/include/asm/uaccess.h | 8 ++++---- arch/arm64/kernel/cpufeature.c | 16 ++++++++++++++++ arch/arm64/lib/clear_user.S | 4 ++-- arch/arm64/lib/copy_from_user.S | 4 ++-- arch/arm64/lib/copy_in_user.S | 4 ++-- arch/arm64/lib/copy_to_user.S | 4 ++-- arch/arm64/mm/fault.c | 3 +++ 8 files changed, 33 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index a5df7cde616b..37a53fc6b384 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -32,8 +32,9 @@ #define ARM64_WORKAROUND_834220 7 #define ARM64_HAS_NO_HW_PREFETCH 8 #define ARM64_HAS_UAO 9 +#define ARM64_ALT_PAN_NOT_UAO 10 -#define ARM64_NCAPS 10 +#define ARM64_NCAPS 11 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index a227ba376f23..7b07e73b6316 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -145,7 +145,7 @@ static inline void set_fs(mm_segment_t fs) do { \ unsigned long __gu_val; \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ @@ -168,7 +168,7 @@ do { \ BUILD_BUG(); \ } \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ CONFIG_ARM64_PAN)); \ } while (0) @@ -217,7 +217,7 @@ do { \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ @@ -239,7 +239,7 @@ do { \ default: \ BUILD_BUG(); \ } \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ CONFIG_ARM64_PAN)); \ } while (0) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9cc8186cd14b..7566cad9fa1d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -67,6 +67,10 @@ DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); .width = 0, \ } +/* meta feature for alternatives */ +static bool __maybe_unused +cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry); + static struct arm64_ftr_bits ftr_id_aa64isar0[] = { ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 32, 0), ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64ISAR0_RDM_SHIFT, 4, 0), @@ -688,6 +692,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .enable = cpu_enable_uao, }, #endif /* CONFIG_ARM64_UAO */ +#ifdef CONFIG_ARM64_PAN + { + .capability = ARM64_ALT_PAN_NOT_UAO, + .matches = cpufeature_pan_not_uao, + }, +#endif /* CONFIG_ARM64_PAN */ {}, }; @@ -966,3 +976,9 @@ void __init setup_cpu_features(void) pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n", L1_CACHE_BYTES, cls); } + +static bool __maybe_unused +cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry) +{ + return (cpus_have_cap(ARM64_HAS_PAN) && !cpus_have_cap(ARM64_HAS_UAO)); +} diff --git 
a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 3f950b677c07..5d1cad3ce6d6 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -33,7 +33,7 @@ * Alignment fixed up by hardware. */ ENTRY(__clear_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x2, x1 // save the size for fixup return subs x1, x1, #8 @@ -54,7 +54,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 b.mi 5f uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) ret ENDPROC(__clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index be09fda09986..0b90497d4424 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -67,11 +67,11 @@ end .req x5 ENTRY(__arch_copy_from_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x0, #0 // Nothing to copy ret diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index feaad1520dc1..f7292dd08c84 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -68,11 +68,11 @@ end .req x5 ENTRY(__copy_in_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x0, #0 ret diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index b4b9178aa36b..7a7efe255034 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -66,11 +66,11 @@ end .req x5 ENTRY(__arch_copy_to_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x0, #0 ret diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 820d47353cf0..d0762a729d01 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -234,6 +234,9 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, } if (permission_fault(esr) && (addr < USER_DS)) { + if (get_thread_info(regs->sp)->addr_limit == KERNEL_DS) + panic("Accessing user space memory with fs=KERNEL_DS"); + if (!search_exception_tables(regs->pc)) panic("Accessing user space memory outside uaccess.h routines"); } From c39e2026d6c2daef475d66cc0656769f93b42fc4 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 18 Feb 2016 15:50:04 +0000 Subject: [PATCH 0414/3220] UPSTREAM: arm64: Remove the get_thread_info() function This function was introduced by previous commits implementing UAO. 
However, it can be replaced with task_thread_info() in uao_thread_switch() or get_fs() in do_page_fault() (the latter being called only on the current context, so no need for using the saved pt_regs). Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit e950631e84e7e38892ffbeee5e1816b270026b0e) Signed-off-by: Jeff Vander Stoep Change-Id: Ic6e9b6af7314fa83d9b0773ae3fac5a2ff34e67a --- arch/arm64/include/asm/thread_info.h | 6 ------ arch/arm64/kernel/process.c | 15 ++++++--------- arch/arm64/mm/fault.c | 2 +- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index eba8db6838af..abd64bd1f6d9 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -85,12 +85,6 @@ static inline struct thread_info *current_thread_info(void) return (struct thread_info *)sp_el0; } -/* Access struct thread_info of another thread */ -static inline struct thread_info *get_thread_info(unsigned long thread_stack) -{ - return (struct thread_info *)(thread_stack & ~(THREAD_SIZE - 1)); -} - #define thread_saved_pc(tsk) \ ((unsigned long)(tsk->thread.cpu_context.pc)) #define thread_saved_sp(tsk) \ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 4431e8364881..6f3fb46170bf 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -381,15 +381,12 @@ static void tls_thread_switch(struct task_struct *next) /* Restore the UAO state depending on next's addr_limit */ static void uao_thread_switch(struct task_struct *next) { - unsigned long next_sp = next->thread.cpu_context.sp; - - if (IS_ENABLED(CONFIG_ARM64_UAO) && - get_thread_info(next_sp)->addr_limit == KERNEL_DS) - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO, - CONFIG_ARM64_UAO)); - else - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO, - CONFIG_ARM64_UAO)); + if (IS_ENABLED(CONFIG_ARM64_UAO)) { + if (task_thread_info(next)->addr_limit == KERNEL_DS) + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO)); + else + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO)); + } } /* diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index d0762a729d01..a8eafeceb08a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -234,7 +234,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, } if (permission_fault(esr) && (addr < USER_DS)) { - if (get_thread_info(regs->sp)->addr_limit == KERNEL_DS) + if (get_fs() == KERNEL_DS) panic("Accessing user space memory with fs=KERNEL_DS"); if (!search_exception_tables(regs->pc)) From 6bfaecfc5801909762309f4ec831f789581f279b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:32 +0100 Subject: [PATCH 0415/3220] UPSTREAM: of/fdt: make memblock minimum physical address arch configurable By default, early_init_dt_add_memory_arch() ignores memory below the base of the kernel image since it won't be addressable via the linear mapping. However, this is not appropriate anymore once we decouple the kernel text mapping from the linear mapping, so archs may want to drop the low limit entirely. So allow the minimum to be overridden by setting MIN_MEMBLOCK_ADDR. 
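A sketch of how an architecture could use the new hook; the header placement is hypothetical, only the MIN_MEMBLOCK_ADDR name comes from this patch:

    /* in an arch-specific header seen before the fallback definition:
     * accept all of physical memory, even below the kernel image */
    #define MIN_MEMBLOCK_ADDR 0

Architectures that do nothing keep the old behaviour, since the fallback remains __pa(PAGE_OFFSET).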
Acked-by: Mark Rutland Acked-by: Rob Herring Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 270522a04f7a9911983878fa37da467f9ff1c938) Signed-off-by: Jeff Vander Stoep Change-Id: I4bb2626a87493262a64584b3d808de260129127e --- drivers/of/fdt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 86fff625f6d5..628f436e16f6 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1006,13 +1006,16 @@ int __init early_init_dt_scan_chosen(unsigned long node, const char *uname, } #ifdef CONFIG_HAVE_MEMBLOCK +#ifndef MIN_MEMBLOCK_ADDR +#define MIN_MEMBLOCK_ADDR __pa(PAGE_OFFSET) +#endif #ifndef MAX_MEMBLOCK_ADDR #define MAX_MEMBLOCK_ADDR ((phys_addr_t)~0) #endif void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size) { - const u64 phys_offset = __pa(PAGE_OFFSET); + const u64 phys_offset = MIN_MEMBLOCK_ADDR; if (!PAGE_ALIGNED(base)) { if (size < PAGE_SIZE - (base & ~PAGE_MASK)) { From 6f77d1a3662212ea72e5c37c381bf0d2a09a463e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:33 +0100 Subject: [PATCH 0416/3220] UPSTREAM: of/fdt: factor out assignment of initrd_start/initrd_end Since architectures may not yet have their linear mapping up and running when the initrd address is discovered from the DT, factor out the assignment of initrd_start and initrd_end, so that an architecture can override it and use the translation it needs. Signed-off-by: Ard Biesheuvel Acked-by: Rob Herring Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 369bc9abf22bf026e8645a4dd746b90649a2f6ee) Signed-off-by: Jeff Vander Stoep Change-Id: I8c258bdc4367955314e9a5223dc4c7751a06a98d --- drivers/of/fdt.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 628f436e16f6..c6d196188bc9 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -760,6 +760,16 @@ const void * __init of_flat_dt_match_machine(const void *default_match, } #ifdef CONFIG_BLK_DEV_INITRD +#ifndef __early_init_dt_declare_initrd +static void __early_init_dt_declare_initrd(unsigned long start, + unsigned long end) +{ + initrd_start = (unsigned long)__va(start); + initrd_end = (unsigned long)__va(end); + initrd_below_start_ok = 1; +} +#endif + /** * early_init_dt_check_for_initrd - Decode initrd location from flat tree * @node: reference to node containing initrd location ('chosen') @@ -782,9 +792,7 @@ static void __init early_init_dt_check_for_initrd(unsigned long node) return; end = of_read_number(prop, len/4); - initrd_start = (unsigned long)__va(start); - initrd_end = (unsigned long)__va(end); - initrd_below_start_ok = 1; + __early_init_dt_declare_initrd(start, end); pr_debug("initrd_start=0x%llx initrd_end=0x%llx\n", (unsigned long long)start, (unsigned long long)end); From 6727d505317c6032c58063c46fe317045f3dacb4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:34 +0100 Subject: [PATCH 0417/3220] BACKPORT: arm64: prevent potential circular header dependencies in asm/bug.h Currently, using BUG_ON() in header files is cumbersome, due to the fact that asm/bug.h transitively includes a lot of other header files, resulting in the actual BUG_ON() invocation appearing before its definition in the preprocessor input. 
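A hypothetical illustration of the failure mode (the header name foo.h is made up):

    /* foo.h -- pulled in, transitively, by asm/bug.h's own includes */
    static inline void sanity_check(int x)
    {
            /* At this point in the preprocessor input BUG_ON() has not
             * been defined yet, so this compiles as a call to an
             * undeclared function and the build breaks. */
            BUG_ON(x < 0);
    }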
So let's reverse the #include dependency between asm/bug.h and asm/debug-monitors.h, by moving the definition of BUG_BRK_IMM from the latter to the former. Also fix up one user of asm/debug-monitors.h which relied on a transitive include. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 03336b1df9929e5d9c28fd9768948b6151cb046c) Signed-off-by: Jeff Vander Stoep Change-Id: I71470b7db9ef858a5a8368a872f931936c723a25 --- arch/arm64/include/asm/bug.h | 2 +- arch/arm64/include/asm/debug-monitors.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index 4a748ce9ba1a..679d49221998 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -18,7 +18,7 @@ #ifndef _ARCH_ARM64_ASM_BUG_H #define _ARCH_ARM64_ASM_BUG_H -#include +#define BUG_BRK_IMM 0x800 #ifdef CONFIG_GENERIC_BUG #define HAVE_ARCH_BUG diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index 279c85b5ec09..e893a1fca9c2 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -57,7 +58,6 @@ #define FAULT_BRK_IMM 0x100 #define KGDB_DYN_DBG_BRK_IMM 0x400 #define KGDB_COMPILED_DBG_BRK_IMM 0x401 -#define BUG_BRK_IMM 0x800 /* * BRK instruction encoding From e3ec18326ffe69b6736232499ddf8634ed40601c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:35 +0100 Subject: [PATCH 0418/3220] BACKPORT: arm64: add support for ioremap() block mappings This wires up the existing generic huge-vmap feature, which allows ioremap() to use PMD or PUD sized block mappings. It also adds support to the unmap path for dealing with block mappings, which will allow us to unmap the __init region using unmap_kernel_range() in a subsequent patch. 
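The effect is easiest to see as a block-size decision; a stand-alone model with illustrative 4K-granule sizes (map_step() is a model, not a kernel function):

    #include <stdint.h>

    #define PMD_BLOCK (2ULL << 20)          /* 2 MiB block, 4K granule */
    #define PUD_BLOCK (1ULL << 30)          /* 1 GiB block, 4K granule */

    /* Largest block usable for one mapping step: physical address,
     * virtual address and remaining size must all be block-aligned. */
    static uint64_t map_step(uint64_t phys, uint64_t virt, uint64_t size)
    {
            if (!((phys | virt) & (PUD_BLOCK - 1)) && size >= PUD_BLOCK)
                    return PUD_BLOCK;
            if (!((phys | virt) & (PMD_BLOCK - 1)) && size >= PMD_BLOCK)
                    return PMD_BLOCK;
            return 4096;                    /* fall back to page mappings */
    }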
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 324420bf91f60582bb481133db9547111768ef17) Signed-off-by: Jeff Vander Stoep Change-Id: I4765ae77f7d67c3972b7e5b19d43db434e8b777c --- .../features/vm/huge-vmap/arch-support.txt | 2 +- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/memory.h | 6 +++ arch/arm64/mm/mmu.c | 41 +++++++++++++++++++ 4 files changed, 49 insertions(+), 1 deletion(-) diff --git a/Documentation/features/vm/huge-vmap/arch-support.txt b/Documentation/features/vm/huge-vmap/arch-support.txt index af6816bccb43..df1d1f3c9af2 100644 --- a/Documentation/features/vm/huge-vmap/arch-support.txt +++ b/Documentation/features/vm/huge-vmap/arch-support.txt @@ -9,7 +9,7 @@ | alpha: | TODO | | arc: | TODO | | arm: | TODO | - | arm64: | TODO | + | arm64: | ok | | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ae51e1b18337..14866e975277 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -50,6 +50,7 @@ config ARM64 select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_BITREVERSE select HAVE_ARCH_HARDENED_USERCOPY + select HAVE_ARCH_HUGE_VMAP select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48) select HAVE_ARCH_KGDB diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 853953cd1f08..c65aad7b13dc 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -100,6 +100,12 @@ #define MT_S2_NORMAL 0xf #define MT_S2_DEVICE_nGnRE 0x1 +#ifdef CONFIG_ARM64_4K_PAGES +#define IOREMAP_MAX_ORDER (PUD_SHIFT) +#else +#define IOREMAP_MAX_ORDER (PMD_SHIFT) +#endif + #ifndef __ASSEMBLY__ extern phys_addr_t memstart_addr; diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8d0e49d2cba9..1f4f4d21457f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -745,3 +745,44 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) return dt_virt; } + +int __init arch_ioremap_pud_supported(void) +{ + /* only 4k granule supports level 1 block mappings */ + return IS_ENABLED(CONFIG_ARM64_4K_PAGES); +} + +int __init arch_ioremap_pmd_supported(void) +{ + return 1; +} + +int pud_set_huge(pud_t *pud, phys_addr_t phys, pgprot_t prot) +{ + BUG_ON(phys & ~PUD_MASK); + set_pud(pud, __pud(phys | PUD_TYPE_SECT | pgprot_val(mk_sect_prot(prot)))); + return 1; +} + +int pmd_set_huge(pmd_t *pmd, phys_addr_t phys, pgprot_t prot) +{ + BUG_ON(phys & ~PMD_MASK); + set_pmd(pmd, __pmd(phys | PMD_TYPE_SECT | pgprot_val(mk_sect_prot(prot)))); + return 1; +} + +int pud_clear_huge(pud_t *pud) +{ + if (!pud_sect(*pud)) + return 0; + pud_clear(pud); + return 1; +} + +int pmd_clear_huge(pmd_t *pmd) +{ + if (!pmd_sect(*pmd)) + return 0; + pmd_clear(pmd); + return 1; +} From c8d7bc708e8d0c197c3a0e3197bfb7f283ef352a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:36 +0100 Subject: [PATCH 0419/3220] UPSTREAM: arm64: introduce KIMAGE_VADDR as the virtual base of the kernel region This introduces the preprocessor symbol KIMAGE_VADDR which will serve as the symbolic virtual base of the kernel region, i.e., the kernel's virtual offset will be KIMAGE_VADDR + TEXT_OFFSET. For now, we define it as being equal to PAGE_OFFSET, but in the future, it will be moved below it once we move the kernel virtual mapping out of the linear mapping. 
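The two-range translation is easy to model in user space; the constants below are illustrative (VA_BITS=39-style values) and phys_offset stands in for PHYS_OFFSET:

    #include <stdint.h>

    #define PAGE_OFFSET_M  0xffffffc000000000ULL   /* example linear-map base */
    #define KIMAGE_VADDR_M PAGE_OFFSET_M           /* still equal, per this patch */

    static uint64_t phys_offset;

    static uint64_t virt_to_phys_model(uint64_t x)
    {
            /* linear-map addresses translate via PAGE_OFFSET; anything
             * below it is treated as a kernel-image address */
            return x >= PAGE_OFFSET_M ? x - PAGE_OFFSET_M + phys_offset
                                      : x - KIMAGE_VADDR_M + phys_offset;
    }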
Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit ab893fb9f1b17f02139bce547bb4b69e96b9ae16) Signed-off-by: Jeff Vander Stoep Change-Id: I31427bd2b948a22bb8ce1d22109682fc66efb98d --- arch/arm64/include/asm/memory.h | 10 ++++++++-- arch/arm64/kernel/head.S | 2 +- arch/arm64/kernel/vmlinux.lds.S | 4 ++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index c65aad7b13dc..aebc739f5a11 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -51,7 +51,8 @@ #define VA_BITS (CONFIG_ARM64_VA_BITS) #define VA_START (UL(0xffffffffffffffff) << VA_BITS) #define PAGE_OFFSET (UL(0xffffffffffffffff) << (VA_BITS - 1)) -#define MODULES_END (PAGE_OFFSET) +#define KIMAGE_VADDR (PAGE_OFFSET) +#define MODULES_END (KIMAGE_VADDR) #define MODULES_VADDR (MODULES_END - SZ_64M) #define PCI_IO_END (MODULES_VADDR - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) @@ -75,8 +76,13 @@ * private definitions which should NOT be used outside memory.h * files. Use virt_to_phys/phys_to_virt/__pa/__va instead. */ -#define __virt_to_phys(x) (((phys_addr_t)(x) - PAGE_OFFSET + PHYS_OFFSET)) +#define __virt_to_phys(x) ({ \ + phys_addr_t __x = (phys_addr_t)(x); \ + __x >= PAGE_OFFSET ? (__x - PAGE_OFFSET + PHYS_OFFSET) : \ + (__x - KIMAGE_VADDR + PHYS_OFFSET); }) + #define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET)) +#define __phys_to_kimg(x) ((unsigned long)((x) - PHYS_OFFSET + KIMAGE_VADDR)) /* * Convert a page to/from a physical address diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 53b9f9f128c2..04d38a058b19 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -389,7 +389,7 @@ __create_page_tables: * Map the kernel image (starting with PHYS_OFFSET). */ mov x0, x26 // swapper_pg_dir - mov x5, #PAGE_OFFSET + ldr x5, =KIMAGE_VADDR create_pgd_entry x0, x5, x3, x6 ldr x6, =KERNEL_END // __va(KERNEL_END) mov x3, x24 // phys offset diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 763e5aab5752..7703feba23d6 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -89,7 +89,7 @@ SECTIONS *(.discard.*) } - . = PAGE_OFFSET + TEXT_OFFSET; + . = KIMAGE_VADDR + TEXT_OFFSET; .head.text : { _text = .; @@ -188,4 +188,4 @@ ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, /* * If padding is applied before .head.text, virt<->phys conversions will fail. */ -ASSERT(_text == (PAGE_OFFSET + TEXT_OFFSET), "HEAD is misaligned") +ASSERT(_text == (KIMAGE_VADDR + TEXT_OFFSET), "HEAD is misaligned") From 58539890b74ae227da611e91965c2e5ee2f57645 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:37 +0100 Subject: [PATCH 0420/3220] UPSTREAM: arm64: pgtable: implement static [pte|pmd|pud]_offset variants The page table accessors pte_offset(), pud_offset() and pmd_offset() rely on __va translations, so they can only be used after the linear mapping has been installed. For the early fixmap and kasan init routines, whose page tables are allocated statically in the kernel image, these functions will return bogus values. So implement pte_offset_kimg(), pmd_offset_kimg() and pud_offset_kimg(), which can be used instead before any page tables have been allocated dynamically. 
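The difference between the two translations, shown side by side in a stand-alone model (the _M constants are illustrative stand-ins, not kernel values):

    #include <stdint.h>

    #define PHYS_OFFSET_M  0x80000000ULL           /* illustrative RAM base */
    #define PAGE_OFFSET_M  0xffffffc000000000ULL
    #define KIMAGE_VADDR_M 0xffffffc000000000ULL

    /* __va(): only meaningful once the linear mapping is installed */
    static uint64_t va_model(uint64_t phys)
    { return phys - PHYS_OFFSET_M + PAGE_OFFSET_M; }

    /* the _kimg variants: valid for tables that live inside the image */
    static uint64_t kimg_model(uint64_t phys)
    { return phys - PHYS_OFFSET_M + KIMAGE_VADDR_M; }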
Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 6533945a32c762c5db70d7a3ec251a040b2d9661) Signed-off-by: Jeff Vander Stoep Change-Id: Ibea400f0938db568524fb83eb2d22d8658bbb56b --- arch/arm64/include/asm/pgtable.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 6a58dd48663f..f3e887d5376c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -453,6 +453,9 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) +/* use ONLY for statically allocated translation tables */ +#define pte_offset_kimg(dir,addr) ((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr)))) + /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. @@ -496,6 +499,9 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK)) +/* use ONLY for statically allocated translation tables */ +#define pmd_offset_kimg(dir,addr) ((pmd_t *)__phys_to_kimg(pmd_offset_phys((dir), (addr)))) + #else #define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) @@ -505,6 +511,8 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pmd_set_fixmap_offset(pudp, addr) ((pmd_t *)pudp) #define pmd_clear_fixmap() +#define pmd_offset_kimg(dir,addr) ((pmd_t *)dir) + #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -543,6 +551,9 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) #define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK)) +/* use ONLY for statically allocated translation tables */ +#define pud_offset_kimg(dir,addr) ((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr)))) + #else #define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) @@ -552,6 +563,8 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) #define pud_set_fixmap_offset(pgdp, addr) ((pud_t *)pgdp) #define pud_clear_fixmap() +#define pud_offset_kimg(dir,addr) ((pud_t *)dir) + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) From 3b177382c499864e0fd6a6a11236a616a7105be5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:38 +0100 Subject: [PATCH 0421/3220] UPSTREAM: arm64: decouple early fixmap init from linear mapping Since the early fixmap page tables are populated using pages that are part of the static footprint of the kernel, they are covered by the initial kernel mapping, and we can refer to them without using __va/__pa translations, which are tied to the linear mapping. Since the fixmap page tables are disjoint from the kernel mapping up to the top level pgd entry, we can refer to bm_pte[] directly, and there is no need to walk the page tables and perform __pa()/__va() translations at each step. 
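Referring to bm_pte[] directly works because locating a PTE slot is pure index arithmetic; modeled stand-alone for 4K pages (the _m names are stand-ins):

    #define PAGE_SHIFT_M   12
    #define PTRS_PER_PTE_M 512              /* PTEs per table, 4K granule */

    /* slot of a virtual address within one PTE table */
    #define pte_index_m(addr) (((addr) >> PAGE_SHIFT_M) & (PTRS_PER_PTE_M - 1))

    /* fixmap_pte(addr) then reduces to &bm_pte[pte_index(addr)], with no
     * __pa()/__va() round trip through a possibly-absent linear map. */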
Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 157962f5a8f236cab898b68bdaa69ce68922f0bf) Signed-off-by: Jeff Vander Stoep Change-Id: I49221a199962aec6d4f3712bfb3dd041d64ba99b --- arch/arm64/mm/mmu.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1f4f4d21457f..84ef01d7aaf8 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -614,7 +614,7 @@ static inline pud_t * fixmap_pud(unsigned long addr) BUG_ON(pgd_none(*pgd) || pgd_bad(*pgd)); - return pud_offset(pgd, addr); + return pud_offset_kimg(pgd, addr); } static inline pmd_t * fixmap_pmd(unsigned long addr) @@ -623,16 +623,12 @@ static inline pmd_t * fixmap_pmd(unsigned long addr) BUG_ON(pud_none(*pud) || pud_bad(*pud)); - return pmd_offset(pud, addr); + return pmd_offset_kimg(pud, addr); } static inline pte_t * fixmap_pte(unsigned long addr) { - pmd_t *pmd = fixmap_pmd(addr); - - BUG_ON(pmd_none(*pmd) || pmd_bad(*pmd)); - - return pte_offset_kernel(pmd, addr); + return &bm_pte[pte_index(addr)]; } void __init early_fixmap_init(void) @@ -644,14 +640,14 @@ void __init early_fixmap_init(void) pgd = pgd_offset_k(addr); pgd_populate(&init_mm, pgd, bm_pud); - pud = pud_offset(pgd, addr); + pud = fixmap_pud(addr); pud_populate(&init_mm, pud, bm_pmd); - pmd = pmd_offset(pud, addr); + pmd = fixmap_pmd(addr); pmd_populate_kernel(&init_mm, pmd, bm_pte); /* * The boot-ioremap range spans multiple pmds, for which - * we are not preparted: + * we are not prepared: */ BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); From 1ca6787a7310da94e9bf0846b0a2e1a9e9a91291 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:39 +0100 Subject: [PATCH 0422/3220] BACKPORT: arm64: kvm: deal with kernel symbols outside of linear mapping KVM on arm64 uses a fixed offset between the linear mapping at EL1 and the HYP mapping at EL2. Before we can move the kernel virtual mapping out of the linear mapping, we have to make sure that references to kernel symbols that are accessed via the HYP mapping are translated to their linear equivalent. 
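The rebasing that kvm_ksym_ref() performs is a single addition; a sketch with the shift passed in explicitly (the function name is a model, not the kernel API):

    #include <stdint.h>

    /* rebase a kernel-image address into its linear-map alias;
     * shift == PAGE_OFFSET - KIMAGE_VADDR, as in the patch */
    static void *ksym_ref_model(void *sym, int64_t shift)
    {
            return (char *)sym + shift;
    }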
Reviewed-by: Mark Rutland Acked-by: Marc Zyngier Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a0bf9776cd0be4490d4675d4108e13379849fc7f) Signed-off-by: Jeff Vander Stoep Change-Id: I316f029d22a16773c168a151dba59bed7921fa7e --- arch/arm/include/asm/kvm_asm.h | 2 ++ arch/arm/kvm/arm.c | 5 +++-- arch/arm64/include/asm/kvm_asm.h | 17 +++++++++++++++++ arch/arm64/include/asm/kvm_host.h | 8 +++++--- arch/arm64/kvm/hyp.S | 6 +++--- 5 files changed, 30 insertions(+), 8 deletions(-) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 194c91b610ff..c35c349da069 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -79,6 +79,8 @@ #define rr_lo_hi(a1, a2) a1, a2 #endif +#define kvm_ksym_ref(kva) (kva) + #ifndef __ASSEMBLY__ struct kvm; struct kvm_vcpu; diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index e06fd299de08..70e6d557c75f 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -969,7 +969,7 @@ static void cpu_init_hyp_mode(void *dummy) pgd_ptr = kvm_mmu_get_httbr(); stack_page = __this_cpu_read(kvm_arm_hyp_stack_page); hyp_stack_ptr = stack_page + PAGE_SIZE; - vector_ptr = (unsigned long)__kvm_hyp_vector; + vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector); __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr); @@ -1061,7 +1061,8 @@ static int init_hyp_mode(void) /* * Map the Hyp-code called directly from the host */ - err = create_hyp_mappings(__kvm_hyp_code_start, __kvm_hyp_code_end); + err = create_hyp_mappings(kvm_ksym_ref(__kvm_hyp_code_start), + kvm_ksym_ref(__kvm_hyp_code_end)); if (err) { kvm_err("Cannot map world-switch code\n"); goto out_free_mappings; diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 5e377101f919..e95c39543629 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -102,7 +102,24 @@ #define KVM_ARM64_DEBUG_DIRTY_SHIFT 0 #define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT) +#define kvm_ksym_ref(sym) ((void *)&sym + kvm_ksym_shift) + #ifndef __ASSEMBLY__ +#if __GNUC__ > 4 +#define kvm_ksym_shift (PAGE_OFFSET - KIMAGE_VADDR) +#else +/* + * GCC versions 4.9 and older will fold the constant below into the addend of + * the reference to 'sym' above if kvm_ksym_shift is declared static or if the + * constant is used directly. However, since we use the small code model for + * the core kernel, the reference to 'sym' will be emitted as a adrp/add pair, + * with a +/- 4 GB range, resulting in linker relocation errors if the shift + * is sufficiently large. So prevent the compiler from folding the shift into + * the addend, by making the shift a variable with external linkage. 
+ */ +__weak u64 kvm_ksym_shift = PAGE_OFFSET - KIMAGE_VADDR; +#endif + struct kvm; struct kvm_vcpu; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index a35ce7266aac..90c6368ad7c8 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -222,7 +222,7 @@ static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, struct kvm_vcpu *kvm_arm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); -u64 kvm_call_hyp(void *hypfn, ...); +u64 __kvm_call_hyp(void *hypfn, ...); void force_vm_exit(const cpumask_t *mask); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); @@ -243,8 +243,8 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, * Call initialization code, and switch to the full blown * HYP code. */ - kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr, - hyp_stack_ptr, vector_ptr); + __kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr, + hyp_stack_ptr, vector_ptr); } static inline void kvm_arch_hardware_disable(void) {} @@ -258,4 +258,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); +#define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__) + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 86c289832272..309e3479dc2c 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -923,7 +923,7 @@ __hyp_panic_str: .align 2 /* - * u64 kvm_call_hyp(void *hypfn, ...); + * u64 __kvm_call_hyp(void *hypfn, ...); * * This is not really a variadic function in the classic C-way and care must * be taken when calling this to ensure parameters are passed in registers @@ -940,10 +940,10 @@ __hyp_panic_str: * used to implement __hyp_get_vectors in the same way as in * arch/arm64/kernel/hyp_stub.S. */ -ENTRY(kvm_call_hyp) hvc #0 ret -ENDPROC(kvm_call_hyp) +ENTRY(__kvm_call_hyp) hvc #0 ret +ENDPROC(__kvm_call_hyp) .macro invalid_vector label, target .align 2 From c3a32b2934bd3190c651fb53ad734b296a45f15e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:40 +0100 Subject: [PATCH 0423/3220] UPSTREAM: arm64: move kernel image to base of vmalloc area This moves the module area to right before the vmalloc area, and moves the kernel image to the base of the vmalloc area. This is an intermediate step towards implementing KASLR, which allows the kernel image to be located anywhere in the vmalloc area. Since other subsystems such as hibernate may still need to refer to the kernel text or data segments via their linear addresses, both are mapped in the linear region as well. The linear alias of the text region is mapped read-only/non-executable to prevent inadvertent modification or execution.
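[Editor's note: not part of the patch — a quick stand-alone calculation of the layout the new macro definitions produce, assuming VA_BITS=48; the WITH_KASAN flag is an invention of this sketch (build with -DWITH_KASAN for the KASAN variant). The program only mirrors the arithmetic of the definitions in the diff below.]

  #include <stdio.h>

  #define VA_BITS           48
  #define SZ_64M            0x04000000ULL
  #define VA_START          (~0ULL << VA_BITS)
  #define PAGE_OFFSET       (~0ULL << (VA_BITS - 1))
  #ifdef WITH_KASAN
  #define KASAN_SHADOW_SIZE (1ULL << (VA_BITS - 3))
  #else
  #define KASAN_SHADOW_SIZE 0ULL
  #endif
  #define MODULES_VADDR     (VA_START + KASAN_SHADOW_SIZE)
  #define MODULES_END       (MODULES_VADDR + SZ_64M)
  #define KIMAGE_VADDR      (MODULES_END)
  #define VMALLOC_START     (MODULES_END)

  int main(void)
  {
          printf("VA_START      = 0x%016llx\n", VA_START);
          printf("MODULES_VADDR = 0x%016llx\n", MODULES_VADDR);
          printf("MODULES_END   = 0x%016llx\n", MODULES_END);
          printf("KIMAGE_VADDR  = 0x%016llx\n", KIMAGE_VADDR);
          printf("VMALLOC_START = 0x%016llx\n", VMALLOC_START);
          printf("PAGE_OFFSET   = 0x%016llx\n", PAGE_OFFSET);
          return 0;
  }

With 4 KB pages and no KASAN this prints a 64 MB module window at the very bottom of the kernel VA space, with the kernel image and the vmalloc area starting at its end and the linear mapping at 0xffff800000000000, matching the layout the message describes.
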
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit f9040773b7bbbd9e98eb6184a263512a7cfc133f) Signed-off-by: Jeff Vander Stoep Change-Id: I698faed47bb7cfc256a1b5b5407a7c586bdc63b3 --- arch/arm64/include/asm/kasan.h | 2 +- arch/arm64/include/asm/memory.h | 21 ++++-- arch/arm64/include/asm/pgtable.h | 10 +-- arch/arm64/mm/dump.c | 12 ++-- arch/arm64/mm/init.c | 23 +++---- arch/arm64/mm/kasan_init.c | 27 +++++++- arch/arm64/mm/mmu.c | 110 +++++++++++++++++++++---------- 7 files changed, 137 insertions(+), 68 deletions(-) diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h index de0d21211c34..71ad0f93eb71 100644 --- a/arch/arm64/include/asm/kasan.h +++ b/arch/arm64/include/asm/kasan.h @@ -14,7 +14,7 @@ * KASAN_SHADOW_END: KASAN_SHADOW_START + 1/8 of kernel virtual addresses. */ #define KASAN_SHADOW_START (VA_START) -#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1UL << (VA_BITS - 3))) +#define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) /* * This value is used to map an address to the corresponding shadow diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index aebc739f5a11..4388651d1f0d 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -45,16 +45,15 @@ * VA_START - the first kernel virtual address. * TASK_SIZE - the maximum size of a user space task. * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area. - * The module space lives between the addresses given by TASK_SIZE - * and PAGE_OFFSET - it must be within 128MB of the kernel text. */ #define VA_BITS (CONFIG_ARM64_VA_BITS) #define VA_START (UL(0xffffffffffffffff) << VA_BITS) #define PAGE_OFFSET (UL(0xffffffffffffffff) << (VA_BITS - 1)) -#define KIMAGE_VADDR (PAGE_OFFSET) -#define MODULES_END (KIMAGE_VADDR) -#define MODULES_VADDR (MODULES_END - SZ_64M) -#define PCI_IO_END (MODULES_VADDR - SZ_2M) +#define KIMAGE_VADDR (MODULES_END) +#define MODULES_END (MODULES_VADDR + MODULES_VSIZE) +#define MODULES_VADDR (VA_START + KASAN_SHADOW_SIZE) +#define MODULES_VSIZE (SZ_64M) +#define PCI_IO_END (PAGE_OFFSET - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) #define FIXADDR_TOP (PCI_IO_START - SZ_2M) #define TASK_SIZE_64 (UL(1) << VA_BITS) @@ -71,6 +70,16 @@ #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 4)) +/* + * The size of the KASAN shadow region. This should be 1/8th of the + * size of the entire kernel virtual address space. + */ +#ifdef CONFIG_KASAN +#define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - 3)) +#else +#define KASAN_SHADOW_SIZE (0) +#endif + /* * Physical vs virtual RAM address space conversion. These are * private definitions which should NOT be used outside memory.h diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f3e887d5376c..9bf72b72b76d 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -36,19 +36,13 @@ * * VMEMAP_SIZE: allows the whole VA space to be covered by a struct page array * (rounded up to PUD_SIZE). 
- * VMALLOC_START: beginning of the kernel VA space + * VMALLOC_START: beginning of the kernel vmalloc space * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space, * fixed mappings and modules */ #define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE) -#ifndef CONFIG_KASAN -#define VMALLOC_START (VA_START) -#else -#include -#define VMALLOC_START (KASAN_SHADOW_END + SZ_64K) -#endif - +#define VMALLOC_START (MODULES_END) #define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) #define vmemmap ((struct page *)(VMALLOC_END + SZ_64K)) diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 0841b2bf0e6a..6be918478f85 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -35,7 +35,9 @@ struct addr_marker { }; enum address_markers_idx { - VMALLOC_START_NR = 0, + MODULES_START_NR = 0, + MODULES_END_NR, + VMALLOC_START_NR, VMALLOC_END_NR, #ifdef CONFIG_SPARSEMEM_VMEMMAP VMEMMAP_START_NR, @@ -45,12 +47,12 @@ enum address_markers_idx { FIXADDR_END_NR, PCI_START_NR, PCI_END_NR, - MODULES_START_NR, - MODULES_END_NR, KERNEL_SPACE_NR, }; static struct addr_marker address_markers[] = { + { MODULES_VADDR, "Modules start" }, + { MODULES_END, "Modules end" }, { VMALLOC_START, "vmalloc() Area" }, { VMALLOC_END, "vmalloc() End" }, #ifdef CONFIG_SPARSEMEM_VMEMMAP @@ -61,9 +63,7 @@ static struct addr_marker address_markers[] = { { FIXADDR_TOP, "Fixmap end" }, { PCI_IO_START, "PCI I/O start" }, { PCI_IO_END, "PCI I/O end" }, - { MODULES_VADDR, "Modules start" }, - { MODULES_END, "Modules end" }, - { PAGE_OFFSET, "Kernel Mapping" }, + { PAGE_OFFSET, "Linear Mapping" }, { -1, NULL }, }; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index f13078bbf66e..46ae88e0744e 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -304,22 +305,26 @@ void __init mem_init(void) #ifdef CONFIG_KASAN " kasan : 0x%16lx - 0x%16lx (%6ld GB)\n" #endif + " modules : 0x%16lx - 0x%16lx (%6ld MB)\n" " vmalloc : 0x%16lx - 0x%16lx (%6ld GB)\n" + " .init : 0x%p" " - 0x%p" " (%6ld KB)\n" + " .text : 0x%p" " - 0x%p" " (%6ld KB)\n" + " .data : 0x%p" " - 0x%p" " (%6ld KB)\n" #ifdef CONFIG_SPARSEMEM_VMEMMAP " vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n" " 0x%16lx - 0x%16lx (%6ld MB actual)\n" #endif " fixed : 0x%16lx - 0x%16lx (%6ld KB)\n" " PCI I/O : 0x%16lx - 0x%16lx (%6ld MB)\n" - " modules : 0x%16lx - 0x%16lx (%6ld MB)\n" - " memory : 0x%16lx - 0x%16lx (%6ld MB)\n" - " .init : 0x%p" " - 0x%p" " (%6ld KB)\n" - " .text : 0x%p" " - 0x%p" " (%6ld KB)\n" - " .data : 0x%p" " - 0x%p" " (%6ld KB)\n", + " memory : 0x%16lx - 0x%16lx (%6ld MB)\n", #ifdef CONFIG_KASAN MLG(KASAN_SHADOW_START, KASAN_SHADOW_END), #endif + MLM(MODULES_VADDR, MODULES_END), MLG(VMALLOC_START, VMALLOC_END), + MLK_ROUNDUP(__init_begin, __init_end), + MLK_ROUNDUP(_text, _etext), + MLK_ROUNDUP(_sdata, _edata), #ifdef CONFIG_SPARSEMEM_VMEMMAP MLG((unsigned long)vmemmap, (unsigned long)vmemmap + VMEMMAP_SIZE), @@ -328,11 +333,7 @@ void __init mem_init(void) #endif MLK(FIXADDR_START, FIXADDR_TOP), MLM(PCI_IO_START, PCI_IO_END), - MLM(MODULES_VADDR, MODULES_END), - MLM(PAGE_OFFSET, (unsigned long)high_memory), - MLK_ROUNDUP(__init_begin, __init_end), - MLK_ROUNDUP(_text, _etext), - MLK_ROUNDUP(_sdata, _edata)); + MLM(PAGE_OFFSET, (unsigned long)high_memory)); #undef MLK #undef MLM @@ -360,8 +361,8 @@ void __init mem_init(void) void free_initmem(void) { - fixup_init(); free_initmem_default(0); + 
fixup_init(); } #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index cc569a38bc76..7f10cc91fa8a 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -17,9 +17,11 @@ #include #include +#include #include #include #include +#include #include static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE); @@ -33,7 +35,7 @@ static void __init kasan_early_pte_populate(pmd_t *pmd, unsigned long addr, if (pmd_none(*pmd)) pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); - pte = pte_offset_kernel(pmd, addr); + pte = pte_offset_kimg(pmd, addr); do { next = addr + PAGE_SIZE; set_pte(pte, pfn_pte(virt_to_pfn(kasan_zero_page), @@ -51,7 +53,7 @@ static void __init kasan_early_pmd_populate(pud_t *pud, if (pud_none(*pud)) pud_populate(&init_mm, pud, kasan_zero_pmd); - pmd = pmd_offset(pud, addr); + pmd = pmd_offset_kimg(pud, addr); do { next = pmd_addr_end(addr, end); kasan_early_pte_populate(pmd, addr, next); @@ -68,7 +70,7 @@ static void __init kasan_early_pud_populate(pgd_t *pgd, if (pgd_none(*pgd)) pgd_populate(&init_mm, pgd, kasan_zero_pud); - pud = pud_offset(pgd, addr); + pud = pud_offset_kimg(pgd, addr); do { next = pud_addr_end(addr, end); kasan_early_pmd_populate(pud, addr, next); @@ -126,9 +128,13 @@ static void __init clear_pgds(unsigned long start, void __init kasan_init(void) { + u64 kimg_shadow_start, kimg_shadow_end; struct memblock_region *reg; int i; + kimg_shadow_start = (u64)kasan_mem_to_shadow(_text); + kimg_shadow_end = (u64)kasan_mem_to_shadow(_end); + /* * We are going to perform proper setup of shadow memory. * At first we should unmap early shadow (clear_pgds() call bellow). @@ -142,8 +148,23 @@ void __init kasan_init(void) clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); + vmemmap_populate(kimg_shadow_start, kimg_shadow_end, NUMA_NO_NODE); + + /* + * vmemmap_populate() has populated the shadow region that covers the + * kernel image with SWAPPER_BLOCK_SIZE mappings, so we have to round + * the start and end addresses to SWAPPER_BLOCK_SIZE as well, to prevent + * kasan_populate_zero_shadow() from replacing the PMD block mappings + * with PMD table mappings at the edges of the shadow region for the + * kernel image. 
+ */ + if (ARM64_SWAPPER_USES_SECTION_MAPS) + kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); + kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, kasan_mem_to_shadow((void *)MODULES_VADDR)); + kasan_populate_zero_shadow((void *)kimg_shadow_end, + kasan_mem_to_shadow((void *)PAGE_OFFSET)); for_each_memblock(memory, reg) { void *start = (void *)__phys_to_virt(reg->base); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 84ef01d7aaf8..de7a8627928a 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -53,6 +53,10 @@ u64 idmap_t0sz = TCR_T0SZ(VA_BITS); unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); +static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss; +static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused; +static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused; + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { @@ -380,16 +384,15 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) { - unsigned long kernel_start = __pa(_stext); - unsigned long kernel_end = __pa(_end); + unsigned long kernel_end = __pa(_etext); /* - * The kernel itself is mapped at page granularity. Map all other - * memory, making sure we don't overwrite the existing kernel mappings. + * Take care not to create a writable alias for the + * read-only text and rodata sections of the kernel image. */ - /* No overlap with the kernel. */ + /* No overlap with the kernel text */ if (end < kernel_start || start >= kernel_end) { __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, PAGE_KERNEL, @@ -398,8 +401,8 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end } /* - * This block overlaps the kernel mapping. Map the portion(s) which - * don't overlap. + * This block overlaps the kernel text mapping. + * Map the portion(s) which don't overlap. */ if (start < kernel_start) __create_pgd_mapping(pgd, start, @@ -411,6 +414,16 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end __phys_to_virt(kernel_end), end - kernel_end, PAGE_KERNEL, early_pgtable_alloc); + + /* + * Map the linear alias of the [_stext, _etext) interval as + * read-only/non-executable. This makes the contents of the + * region accessible to subsystems such as hibernate, but + * protects it from inadvertent modification or execution. + */ + __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start), + kernel_end - kernel_start, PAGE_KERNEL_RO, + early_pgtable_alloc); } static void __init map_mem(pgd_t *pgd) @@ -429,25 +442,28 @@ static void __init map_mem(pgd_t *pgd) } } -#ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void) { + if (!IS_ENABLED(CONFIG_DEBUG_RODATA)) + return; + create_mapping_late(__pa(_stext), (unsigned long)_stext, (unsigned long)__init_begin - (unsigned long)_stext, PAGE_KERNEL_ROX); - } -#endif void fixup_init(void) { - create_mapping_late(__pa(__init_begin), (unsigned long)__init_begin, - (unsigned long)__init_end - (unsigned long)__init_begin, - PAGE_KERNEL); + /* + * Unmap the __init region but leave the VM area in place. This + * prevents the region from being reused for kernel modules, which + * is not supported by kallsyms. 
+ */ + unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin)); } static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, - pgprot_t prot) + pgprot_t prot, struct vm_struct *vma) { phys_addr_t pa_start = __pa(va_start); unsigned long size = va_end - va_start; @@ -457,6 +473,14 @@ static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, early_pgtable_alloc); + + vma->addr = va_start; + vma->phys_addr = pa_start; + vma->size = size; + vma->flags = VM_MAP; + vma->caller = __builtin_return_address(0); + + vm_area_add_early(vma); } /* @@ -464,17 +488,35 @@ static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, */ static void __init map_kernel(pgd_t *pgd) { + static struct vm_struct vmlinux_text, vmlinux_init, vmlinux_data; - map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC); - map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC); - map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL); + map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC, &vmlinux_text); + map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, + &vmlinux_init); + map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data); - /* - * The fixmap falls in a separate pgd to the kernel, and doesn't live - * in the carveout for the swapper_pg_dir. We can simply re-use the - * existing dir for the fixmap. - */ - set_pgd(pgd_offset_raw(pgd, FIXADDR_START), *pgd_offset_k(FIXADDR_START)); + if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) { + /* + * The fixmap falls in a separate pgd to the kernel, and doesn't + * live in the carveout for the swapper_pg_dir. We can simply + * re-use the existing dir for the fixmap. + */ + set_pgd(pgd_offset_raw(pgd, FIXADDR_START), + *pgd_offset_k(FIXADDR_START)); + } else if (CONFIG_PGTABLE_LEVELS > 3) { + /* + * The fixmap shares its top level pgd entry with the kernel + * mapping. This can really only occur when we are running + * with 16k/4 levels, so we can simply reuse the pud level + * entry instead. + */ + BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); + set_pud(pud_set_fixmap_offset(pgd, FIXADDR_START), + __pud(__pa(bm_pmd) | PUD_TYPE_TABLE)); + pud_clear_fixmap(); + } else { + BUG(); + } kasan_copy_shadow(pgd); } @@ -600,14 +642,6 @@ void vmemmap_free(unsigned long start, unsigned long end) } #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss; -#if CONFIG_PGTABLE_LEVELS > 2 -static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss; -#endif -#if CONFIG_PGTABLE_LEVELS > 3 -static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss; -#endif - static inline pud_t * fixmap_pud(unsigned long addr) { pgd_t *pgd = pgd_offset_k(addr); @@ -639,8 +673,18 @@ void __init early_fixmap_init(void) unsigned long addr = FIXADDR_START; pgd = pgd_offset_k(addr); - pgd_populate(&init_mm, pgd, bm_pud); - pud = fixmap_pud(addr); + if (CONFIG_PGTABLE_LEVELS > 3 && !pgd_none(*pgd)) { + /* + * We only end up here if the kernel mapping and the fixmap + * share the top level pgd entry, which should only happen on + * 16k/4 levels configurations. 
+ */ + BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); + pud = pud_offset_kimg(pgd, addr); + } else { + pgd_populate(&init_mm, pgd, bm_pud); + pud = fixmap_pud(addr); + } pud_populate(&init_mm, pud, bm_pmd); pmd = fixmap_pmd(addr); pmd_populate_kernel(&init_mm, pmd, bm_pte); From 23b3715e4cc4262ba8fedabfe67e898549696126 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:41 +0100 Subject: [PATCH 0424/3220] UPSTREAM: arm64: defer __va translation of initrd_start and initrd_end Before deferring the assignment of memstart_addr in a subsequent patch, to the moment where all memory has been discovered and possibly clipped based on the size of the linear region and the presence of a mem= command line parameter, we need to ensure that memstart_addr is not used to perform __va translations before it is assigned. One such use is in the generic early DT discovery of the initrd location, which is recorded as a virtual address in the globals initrd_start and initrd_end. So wire up the generic support to declare the initrd addresses, and implement it without __va() translations, and perform the translation after memstart_addr has been assigned. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a89dea585371a9d5d85499db47c93f129be8e0c4) Signed-off-by: Jeff Vander Stoep Change-Id: I7d0b3dd7adcf069d4e7c1f58fd12e59c4cb62017 --- arch/arm64/include/asm/memory.h | 8 ++++++++ arch/arm64/mm/init.c | 13 +++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 4388651d1f0d..18b7e77c7495 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -121,6 +121,14 @@ #define IOREMAP_MAX_ORDER (PMD_SHIFT) #endif +#ifdef CONFIG_BLK_DEV_INITRD +#define __early_init_dt_declare_initrd(__start, __end) \ + do { \ + initrd_start = (__start); \ + initrd_end = (__end); \ + } while (0) +#endif + #ifndef __ASSEMBLY__ extern phys_addr_t memstart_addr; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 46ae88e0744e..16e18d35eb04 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -59,8 +59,8 @@ static int __init early_initrd(char *p) if (*endp == ',') { size = memparse(endp + 1, NULL); - initrd_start = (unsigned long)__va(start); - initrd_end = (unsigned long)__va(start + size); + initrd_start = start; + initrd_end = start + size; } return 0; } @@ -170,8 +170,13 @@ void __init arm64_memblock_init(void) */ memblock_reserve(__pa(_text), _end - _text); #ifdef CONFIG_BLK_DEV_INITRD - if (initrd_start) - memblock_reserve(__virt_to_phys(initrd_start), initrd_end - initrd_start); + if (initrd_start) { + memblock_reserve(initrd_start, initrd_end - initrd_start); + + /* the generic initrd code expects virtual addresses */ + initrd_start = __phys_to_virt(initrd_start); + initrd_end = __phys_to_virt(initrd_end); + } #endif early_init_fdt_scan_reserved_mem(); From cb4d47d9a2e863d38c1a2c81b4f2c5181cf1c8df Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:42 +0100 Subject: [PATCH 0425/3220] UPSTREAM: arm64: allow kernel Image to be loaded anywhere in physical memory This relaxes the kernel Image placement requirements, so that it may be placed at any 2 MB aligned offset in physical memory. This is accomplished by ignoring PHYS_OFFSET when installing memblocks, and accounting for the apparent virtual offset of the kernel Image. 
As a result, virtual address references below PAGE_OFFSET are correctly mapped onto physical references into the kernel Image regardless of where it sits in memory. Special care needs to be taken for dealing with memory limits passed via mem=, since the generic implementation clips memory top down, which may clip the kernel image itself if it is loaded high up in memory. To deal with this case, we simply add back the memory covering the kernel image, which may result in more memory to be retained than was passed as a mem= parameter. Since mem= should not be considered a production feature, a panic notifier handler is installed that dumps the memory limit at panic time if one was set. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a7f8de168ace487fa7b88cb154e413cf40e87fc6) Signed-off-by: Jeff Vander Stoep Change-Id: I1d28cb66b658ef89f9648918565ddc07df4660f8 --- Documentation/arm64/booting.txt | 20 +++++--- arch/arm64/include/asm/boot.h | 6 +++ arch/arm64/include/asm/kernel-pgtable.h | 12 +++++ arch/arm64/include/asm/kvm_asm.h | 17 +------ arch/arm64/include/asm/memory.h | 18 +++---- arch/arm64/kernel/head.S | 6 ++- arch/arm64/kernel/image.h | 13 +++-- arch/arm64/mm/init.c | 63 ++++++++++++++++++++++++- arch/arm64/mm/mmu.c | 3 ++ 9 files changed, 119 insertions(+), 39 deletions(-) diff --git a/Documentation/arm64/booting.txt b/Documentation/arm64/booting.txt index 701d39d3171a..56d6d8b796db 100644 --- a/Documentation/arm64/booting.txt +++ b/Documentation/arm64/booting.txt @@ -109,7 +109,13 @@ Header notes: 1 - 4K 2 - 16K 3 - 64K - Bits 3-63: Reserved. + Bit 3: Kernel physical placement + 0 - 2MB aligned base should be as close as possible + to the base of DRAM, since memory below it is not + accessible via the linear mapping + 1 - 2MB aligned base may be anywhere in physical + memory + Bits 4-63: Reserved. - When image_size is zero, a bootloader should attempt to keep as much memory as possible free for use by the kernel immediately after the @@ -117,14 +123,14 @@ Header notes: depending on selected features, and is effectively unbound. The Image must be placed text_offset bytes from a 2MB aligned base -address near the start of usable system RAM and called there. Memory -below that base address is currently unusable by Linux, and therefore it -is strongly recommended that this location is the start of system RAM. -The region between the 2 MB aligned base address and the start of the -image has no special significance to the kernel, and may be used for -other purposes. +address anywhere in usable system RAM and called there. The region +between the 2 MB aligned base address and the start of the image has no +special significance to the kernel, and may be used for other purposes. At least image_size bytes from the start of the image must be free for use by the kernel. +NOTE: versions prior to v4.6 cannot make use of memory below the +physical offset of the Image so it is recommended that the Image be +placed as close as possible to the start of system RAM. 
Any memory described to the kernel (even that below the start of the image) which is not marked as reserved from the kernel (e.g., with a diff --git a/arch/arm64/include/asm/boot.h b/arch/arm64/include/asm/boot.h index 81151b67b26b..ebf2481889c3 100644 --- a/arch/arm64/include/asm/boot.h +++ b/arch/arm64/include/asm/boot.h @@ -11,4 +11,10 @@ #define MIN_FDT_ALIGN 8 #define MAX_FDT_SIZE SZ_2M +/* + * arm64 requires the kernel image to be placed + * TEXT_OFFSET bytes beyond a 2 MB aligned base + */ +#define MIN_KIMG_ALIGN SZ_2M + #endif diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index a459714ee29e..5c6375d8528b 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -79,5 +79,17 @@ #define SWAPPER_MM_MMUFLAGS (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS) #endif +/* + * To make optimal use of block mappings when laying out the linear + * mapping, round down the base of physical memory to a size that can + * be mapped efficiently, i.e., either PUD_SIZE (4k granule) or PMD_SIZE + * (64k granule), or a multiple that can be mapped using contiguous bits + * in the page tables: 32 * PMD_SIZE (16k granule) + */ +#ifdef CONFIG_ARM64_64K_PAGES +#define ARM64_MEMSTART_ALIGN SZ_512M +#else +#define ARM64_MEMSTART_ALIGN SZ_1G +#endif #endif /* __ASM_KERNEL_PGTABLE_H */ diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index e95c39543629..419bc6661b5c 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -102,24 +102,9 @@ #define KVM_ARM64_DEBUG_DIRTY_SHIFT 0 #define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT) -#define kvm_ksym_ref(sym) ((void *)&sym + kvm_ksym_shift) +#define kvm_ksym_ref(sym) phys_to_virt((u64)&sym - kimage_voffset) #ifndef __ASSEMBLY__ -#if __GNUC__ > 4 -#define kvm_ksym_shift (PAGE_OFFSET - KIMAGE_VADDR) -#else -/* - * GCC versions 4.9 and older will fold the constant below into the addend of - * the reference to 'sym' above if kvm_ksym_shift is declared static or if the - * constant is used directly. However, since we use the small code model for - * the core kernel, the reference to 'sym' will be emitted as a adrp/add pair, - * with a +/- 4 GB range, resulting in linker relocation errors if the shift - * is sufficiently large. So prevent the compiler from folding the shift into - * the addend, by making the shift a variable with external linkage. - */ -__weak u64 kvm_ksym_shift = PAGE_OFFSET - KIMAGE_VADDR; -#endif - struct kvm; struct kvm_vcpu; diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 18b7e77c7495..3239e4d78e0d 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -24,6 +24,7 @@ #include #include #include +#include #include /* @@ -88,10 +89,10 @@ #define __virt_to_phys(x) ({ \ phys_addr_t __x = (phys_addr_t)(x); \ __x >= PAGE_OFFSET ? (__x - PAGE_OFFSET + PHYS_OFFSET) : \ - (__x - KIMAGE_VADDR + PHYS_OFFSET); }) + (__x - kimage_voffset); }) #define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET)) -#define __phys_to_kimg(x) ((unsigned long)((x) - PHYS_OFFSET + KIMAGE_VADDR)) +#define __phys_to_kimg(x) ((unsigned long)((x) + kimage_voffset)) /* * Convert a page to/from a physical address @@ -133,15 +134,16 @@ extern phys_addr_t memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory.
*/ -#define PHYS_OFFSET ({ memstart_addr; }) +#define PHYS_OFFSET ({ BUG_ON(memstart_addr & 1); memstart_addr; }) + +/* the offset between the kernel virtual and physical mappings */ +extern u64 kimage_voffset; /* - * The maximum physical address that the linear direct mapping - * of system RAM can cover. (PAGE_OFFSET can be interpreted as - * a 2's complement signed quantity and negated to derive the - * maximum size of the linear mapping.) + * Allow all memory at the discovery stage. We will clip it later. */ -#define MAX_MEMBLOCK_ADDR ({ memstart_addr - PAGE_OFFSET - 1; }) +#define MIN_MEMBLOCK_ADDR 0 +#define MAX_MEMBLOCK_ADDR U64_MAX /* * PFNs are used to describe any physical page; this means diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 04d38a058b19..05b98289093e 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -428,7 +428,11 @@ __mmap_switched: and x4, x4, #~(THREAD_SIZE - 1) msr sp_el0, x4 // Save thread_info str_l x21, __fdt_pointer, x5 // Save FDT pointer - str_l x24, memstart_addr, x6 // Save PHYS_OFFSET + + ldr x4, =KIMAGE_VADDR // Save the offset between + sub x4, x4, x24 // the kernel virtual and + str_l x4, kimage_voffset, x5 // physical mappings + mov x29, #0 #ifdef CONFIG_KASAN bl kasan_early_init diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index 999633bd7294..c9c62cab25a4 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -42,15 +42,18 @@ #endif #ifdef CONFIG_CPU_BIG_ENDIAN -#define __HEAD_FLAG_BE 1 +#define __HEAD_FLAG_BE 1 #else -#define __HEAD_FLAG_BE 0 +#define __HEAD_FLAG_BE 0 #endif -#define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2) +#define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2) -#define __HEAD_FLAGS ((__HEAD_FLAG_BE << 0) | \ - (__HEAD_FLAG_PAGE_SIZE << 1)) +#define __HEAD_FLAG_PHYS_BASE 1 + +#define __HEAD_FLAGS ((__HEAD_FLAG_BE << 0) | \ + (__HEAD_FLAG_PAGE_SIZE << 1) | \ + (__HEAD_FLAG_PHYS_BASE << 3)) /* * These will output as part of the Image header, which should be little-endian diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 16e18d35eb04..e333f7f0b4f2 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -35,8 +35,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -46,7 +48,13 @@ #include "mm.h" -phys_addr_t memstart_addr __read_mostly = 0; +/* + * We need to be able to catch inadvertent references to memstart_addr + * that occur (potentially in generic code) before arm64_memblock_init() + * executes, which assigns it its actual value. So use a default value + * that cannot be mistaken for a real physical address. + */ +phys_addr_t memstart_addr __read_mostly = ~0ULL; phys_addr_t arm64_dma_phys_limit __read_mostly; #ifdef CONFIG_BLK_DEV_INITRD @@ -162,7 +170,33 @@ early_param("mem", early_mem); void __init arm64_memblock_init(void) { - memblock_enforce_memory_limit(memory_limit); + const s64 linear_region_size = -(s64)PAGE_OFFSET; + + /* + * Select a suitable value for the base of physical memory. + */ + memstart_addr = round_down(memblock_start_of_DRAM(), + ARM64_MEMSTART_ALIGN); + + /* + * Remove the memory that we will not be able to cover with the + * linear mapping. Take care not to clip the kernel which may be + * high in memory. 
+ */ + memblock_remove(max(memstart_addr + linear_region_size, __pa(_end)), + ULLONG_MAX); + if (memblock_end_of_DRAM() > linear_region_size) + memblock_remove(0, memblock_end_of_DRAM() - linear_region_size); + + /* + * Apply the memory limit if it was set. Since the kernel may be loaded + * high up in memory, add back the kernel region that must be accessible + * via the linear mapping. + */ + if (memory_limit != (phys_addr_t)ULLONG_MAX) { + memblock_enforce_memory_limit(memory_limit); + memblock_add(__pa(_text), (u64)(_end - _text)); + } /* * Register the kernel text, kernel data, initrd, and initial @@ -388,3 +422,28 @@ static int __init keepinitrd_setup(char *__unused) __setup("keepinitrd", keepinitrd_setup); #endif + +/* + * Dump out memory limit information on panic. + */ +static int dump_mem_limit(struct notifier_block *self, unsigned long v, void *p) +{ + if (memory_limit != (phys_addr_t)ULLONG_MAX) { + pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20); + } else { + pr_emerg("Memory Limit: none\n"); + } + return 0; +} + +static struct notifier_block mem_limit_notifier = { + .notifier_call = dump_mem_limit, +}; + +static int __init register_mem_limit_dumper(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &mem_limit_notifier); + return 0; +} +__initcall(register_mem_limit_dumper); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index de7a8627928a..8577437b6c0a 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -46,6 +46,9 @@ u64 idmap_t0sz = TCR_T0SZ(VA_BITS); +u64 kimage_voffset __read_mostly; +EXPORT_SYMBOL(kimage_voffset); + /* * Empty_zero_page is a special page that is used for zero-initialized data * and COW. From 298b730c260f6766a31ac4b3fa203d7d251a9cbd Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 19 Feb 2016 14:28:58 +0000 Subject: [PATCH 0426/3220] UPSTREAM: arm64: User die() instead of panic() in do_page_fault() The former gives better error reporting on unhandled permission faults (introduced by the UAO patches). Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 70c8abc28762d04e36c92e07eee2ce6ab41049cb) Signed-off-by: Jeff Vander Stoep Change-Id: Ia419eccf1554a32fa4131ac15b277d4d2d4eb508 --- arch/arm64/mm/fault.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index a8eafeceb08a..44e56de23f79 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -235,10 +235,10 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (permission_fault(esr) && (addr < USER_DS)) { if (get_fs() == KERNEL_DS) - panic("Accessing user space memory with fs=KERNEL_DS"); + die("Accessing user space memory with fs=KERNEL_DS", regs, esr); if (!search_exception_tables(regs->pc)) - panic("Accessing user space memory outside uaccess.h routines"); + die("Accessing user space memory outside uaccess.h routines", regs, esr); } /* From 477376b452f0d5fc564dfe2d0fc0fe8156051bb1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 22 Feb 2016 18:46:03 +0100 Subject: [PATCH 0427/3220] UPSTREAM: arm64: mm: only perform memstart_addr sanity check if DEBUG_VM Checking whether memstart_addr has been assigned every time it is referenced adds a branch instruction that may hurt performance if the reference in question occurs on a hot path. So only perform the check if CONFIG_DEBUG_VM=y. 
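[Editor's note: illustrative only, not part of the patch. The pattern being relaxed here — an odd sentinel in memstart_addr, checked on every PHYS_OFFSET use — can be modelled in plain C with assert(), which, like VM_BUG_ON() without CONFIG_DEBUG_VM, compiles away when NDEBUG is defined. The statement-expression syntax is the same GNU C extension the kernel macro uses.]

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  /* odd sentinel: no real, 2 MB aligned DRAM base can look like this */
  static uint64_t memstart_addr = ~0ULL;

  #define PHYS_OFFSET ({ assert(!(memstart_addr & 1)); memstart_addr; })

  int main(void)
  {
          memstart_addr = 0x80000000ULL;  /* hypothetical DRAM base */
          printf("PHYS_OFFSET = 0x%llx\n",
                 (unsigned long long)PHYS_OFFSET);
          return 0;
  }
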
Signed-off-by: Ard Biesheuvel [catalin.marinas@arm.com: replaced #ifdef with VM_BUG_ON] Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a92405f082d43267575444a6927085e4c8a69e4e) Signed-off-by: Jeff Vander Stoep Change-Id: Ia5f206d9a2dbbdbfc3f05fe985d4eca309f0d889 --- arch/arm64/include/asm/memory.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 3239e4d78e0d..460d09bf9442 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -132,9 +132,11 @@ #ifndef __ASSEMBLY__ +#include + extern phys_addr_t memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. */ -#define PHYS_OFFSET ({ BUG_ON(memstart_addr & 1); memstart_addr; }) +#define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; }) /* the offset between the kernel virtual and physical mappings */ extern u64 kimage_voffset; From b4fd0619fedde9b2849e844db1bde3929c336a82 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 22 Feb 2016 18:46:04 +0100 Subject: [PATCH 0428/3220] UPSTREAM: arm64: mm: use bit ops rather than arithmetic in pa/va translations Since PAGE_OFFSET is chosen such that it cuts the kernel VA space right in half, and since the size of the kernel VA space itself is always a power of 2, we can treat PAGE_OFFSET as a bitmask and replace the additions/subtractions with 'or' and 'and-not' operations. For the comparison against PAGE_OFFSET, a mov/cmp/branch sequence ends up getting replaced with a single tbz instruction. For the additions and subtractions, we save a mov instruction since the mask is folded into the instruction's immediate field. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 8439e62a15614e8fcd43835d57b7245cd9870dc5) Signed-off-by: Jeff Vander Stoep Change-Id: I1ea4ef654dd7b7693f8713dab28ca0739b8a2c62 --- arch/arm64/include/asm/memory.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 460d09bf9442..eb798156cf56 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -88,10 +88,10 @@ */ #define __virt_to_phys(x) ({ \ phys_addr_t __x = (phys_addr_t)(x); \ - __x >= PAGE_OFFSET ? (__x - PAGE_OFFSET + PHYS_OFFSET) : \ - (__x - kimage_voffset); }) + __x & BIT(VA_BITS - 1) ? (__x & ~PAGE_OFFSET) + PHYS_OFFSET : \ + (__x - kimage_voffset); }) -#define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET)) +#define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET) | PAGE_OFFSET) #define __phys_to_kimg(x) ((unsigned long)((x) + kimage_voffset)) /* @@ -132,6 +132,7 @@ #ifndef __ASSEMBLY__ +#include #include extern phys_addr_t memstart_addr; From 0983513a290d07016a2b9c097157bec9685800ce Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 23 Feb 2016 08:56:45 +0100 Subject: [PATCH 0429/3220] UPSTREAM: arm64: move brk immediate argument definitions to separate header Instead of reversing the header dependency between asm/bug.h and asm/debug-monitors.h, split off the brk instruction immediate value defines into a new header asm/brk-imm.h, and include it from both. This solves the circular dependency issue that prevents BUG() from being used in some header files, and keeps the definitions together. 
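[Editor's note: sketch, not part of the patch. The immediates collected in the new asm/brk-imm.h occupy bits [20:5] of an AArch64 BRK instruction, as the comment retained in asm/debug-monitors.h states; the base opcode 0xd4200000 used below comes from the architecture manual rather than from this patch.]

  #include <stdint.h>
  #include <stdio.h>

  #define FAULT_BRK_IMM             0x100
  #define KGDB_DYN_DBG_BRK_IMM      0x400
  #define KGDB_COMPILED_DBG_BRK_IMM 0x401
  #define BUG_BRK_IMM               0x800

  /* BRK #imm16: bits [31:21] = 0b11010100001, [20:5] = imm16, [4:0] = 0 */
  static uint32_t brk_insn(uint16_t imm16)
  {
          return 0xd4200000u | ((uint32_t)imm16 << 5);
  }

  int main(void)
  {
          printf("BRK #0x%03x -> 0x%08x\n", BUG_BRK_IMM, brk_insn(BUG_BRK_IMM));
          printf("BRK #0x%03x -> 0x%08x\n", KGDB_DYN_DBG_BRK_IMM,
                 brk_insn(KGDB_DYN_DBG_BRK_IMM));
          return 0;
  }

(BRK #0x800, the BUG() trap, encodes as 0xd4210000.)
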
Signed-off-by: Ard Biesheuvel Acked-by: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit f98deee9a9f8c47d05a0f64d86440882dca772ff) Signed-off-by: Jeff Vander Stoep Change-Id: Id4827af98ab3d413828c589bc379acecabeff108 --- arch/arm64/include/asm/brk-imm.h | 25 +++++++++++++++++++++++++ arch/arm64/include/asm/bug.h | 2 +- arch/arm64/include/asm/debug-monitors.h | 14 +------------- 3 files changed, 27 insertions(+), 14 deletions(-) create mode 100644 arch/arm64/include/asm/brk-imm.h diff --git a/arch/arm64/include/asm/brk-imm.h b/arch/arm64/include/asm/brk-imm.h new file mode 100644 index 000000000000..ed693c5bcec0 --- /dev/null +++ b/arch/arm64/include/asm/brk-imm.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __ASM_BRK_IMM_H +#define __ASM_BRK_IMM_H + +/* + * #imm16 values used for BRK instruction generation + * Allowed values for kgdb are 0x400 - 0x7ff + * 0x100: for triggering a fault on purpose (reserved) + * 0x400: for dynamic BRK instruction + * 0x401: for compile time BRK instruction + * 0x800: kernel-mode BUG() and WARN() traps + */ +#define FAULT_BRK_IMM 0x100 +#define KGDB_DYN_DBG_BRK_IMM 0x400 +#define KGDB_COMPILED_DBG_BRK_IMM 0x401 +#define BUG_BRK_IMM 0x800 + +#endif diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index 679d49221998..561190d15881 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -18,7 +18,7 @@ #ifndef _ARCH_ARM64_ASM_BUG_H #define _ARCH_ARM64_ASM_BUG_H -#define BUG_BRK_IMM 0x800 +#include #ifdef CONFIG_GENERIC_BUG #define HAVE_ARCH_BUG diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index e893a1fca9c2..2fcb9b7c876c 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include @@ -47,18 +47,6 @@ */ #define BREAK_INSTR_SIZE AARCH64_INSN_SIZE -/* - * #imm16 values used for BRK instruction generation - * Allowed values for kgbd are 0x400 - 0x7ff - * 0x100: for triggering a fault on purpose (reserved) - * 0x400: for dynamic BRK instruction - * 0x401: for compile time BRK instruction - * 0x800: kernel-mode BUG() and WARN() traps - */ -#define FAULT_BRK_IMM 0x100 -#define KGDB_DYN_DBG_BRK_IMM 0x400 -#define KGDB_COMPILED_DBG_BRK_IMM 0x401 - /* * BRK instruction encoding * The #imm16 value should be placed at bits[20:5] within BRK ins From 4166edc437c98ef74d6a1cf3a2b2fabeabca2b18 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 24 Nov 2015 12:37:35 +0100 Subject: [PATCH 0430/3220] UPSTREAM: arm64: add support for module PLTs This adds support for emitting PLTs at module load time for relative branches that are out of range. This is a prerequisite for KASLR, which may place the kernel and the modules anywhere in the vmalloc area, making it more likely that branch target offsets exceed the maximum range of +/- 128 MB. In this version, I removed the distinction between relocations against .init executable sections and ordinary executable sections. 
The reason is that it is hardly worth the trouble, given that .init.text usually does not contain that many far branches, and this version now only reserves PLT entry space for jump and call relocations against undefined symbols (since symbols defined in the same module can be assumed to be within +/- 128 MB)

For example, the mac80211.ko module (which is fairly sizable at ~400 KB) built with -mcmodel=large gives the following relocation counts:

                 relocs    branches   unique     !local
  .text           3925      3347       518        219
  .init.text        11         8         7          1
  .exit.text         4         4         4          1
  .text.unlikely    81        67        36         17

('unique' means branches to unique type/symbol/addend combos, of which !local is the subset referring to undefined symbols) IOW, we are only emitting a single PLT entry for the .init sections, and we are better off just adding it to the core PLT section instead. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit fd045f6cd98ec4953147b318418bd45e441e52a3) Signed-off-by: Jeff Vander Stoep Change-Id: I1b46bb817e7d16a1b9a394b100c9e5de46c0837c --- arch/arm64/Kconfig | 9 ++ arch/arm64/Makefile | 6 +- arch/arm64/include/asm/module.h | 11 ++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/module-plts.c | 201 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/module.c | 22 ++++ arch/arm64/kernel/module.lds | 3 + 7 files changed, 252 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/kernel/module-plts.c create mode 100644 arch/arm64/kernel/module.lds diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 14866e975277..65a8e7c9fdd1 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -399,6 +399,7 @@ config ARM64_ERRATUM_843419 bool "Cortex-A53: 843419: A load or store might access an incorrect address" depends on MODULES default y + select ARM64_MODULE_CMODEL_LARGE help This option builds kernel modules using the large memory model in order to avoid the use of the ADRP instruction, which can cause @@ -762,6 +763,14 @@ config ARM64_UAO regular load/store instructions if the cpu does not implement the feature.
+config ARM64_MODULE_CMODEL_LARGE + bool + +config ARM64_MODULE_PLTS + bool + select ARM64_MODULE_CMODEL_LARGE + select HAVE_MOD_ARCH_SPECIFIC + endmenu menu "Boot options" diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 693a4dc6220c..763509ff6f24 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -43,10 +43,14 @@ endif CHECKFLAGS += -D__aarch64__ -ifeq ($(CONFIG_ARM64_ERRATUM_843419), y) +ifeq ($(CONFIG_ARM64_MODULE_CMODEL_LARGE), y) KBUILD_CFLAGS_MODULE += -mcmodel=large endif +ifeq ($(CONFIG_ARM64_MODULE_PLTS),y) +KBUILD_LDFLAGS_MODULE += -T $(srctree)/arch/arm64/kernel/module.lds +endif + # Default value head-y := arch/arm64/kernel/head.o diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index e80e232b730e..8652fb613304 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -20,4 +20,15 @@ #define MODULE_ARCH_VERMAGIC "aarch64" +#ifdef CONFIG_ARM64_MODULE_PLTS +struct mod_arch_specific { + struct elf64_shdr *plt; + int plt_num_entries; + int plt_max_entries; +}; +#endif + +u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, + Elf64_Sym *sym); + #endif /* __ASM_MODULE_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index c4e2f70c0aa0..8d971f9c6ed5 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -30,6 +30,7 @@ arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \ ../../arm/kernel/opcodes.o arm64-obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o arm64-obj-$(CONFIG_MODULES) += arm64ksyms.o module.o +arm64-obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o arm64-obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o arm64-obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o arm64-obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c new file mode 100644 index 000000000000..1ce90d8450ae --- /dev/null +++ b/arch/arm64/kernel/module-plts.c @@ -0,0 +1,201 @@ +/* + * Copyright (C) 2014-2016 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +struct plt_entry { + /* + * A program that conforms to the AArch64 Procedure Call Standard + * (AAPCS64) must assume that a veneer that alters IP0 (x16) and/or + * IP1 (x17) may be inserted at any branch instruction that is + * exposed to a relocation that supports long branches. Since that + * is exactly what we are dealing with here, we are free to use x16 + * as a scratch register in the PLT veneers. + */ + __le32 mov0; /* movn x16, #0x.... */ + __le32 mov1; /* movk x16, #0x...., lsl #16 */ + __le32 mov2; /* movk x16, #0x...., lsl #32 */ + __le32 br; /* br x16 */ +}; + +u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, + Elf64_Sym *sym) +{ + struct plt_entry *plt = (struct plt_entry *)mod->arch.plt->sh_addr; + int i = mod->arch.plt_num_entries; + u64 val = sym->st_value + rela->r_addend; + + /* + * We only emit PLT entries against undefined (SHN_UNDEF) symbols, + * which are listed in the ELF symtab section, but without a type + * or a size. 
+ * So, similar to how the module loader uses the Elf64_Sym::st_value + * field to store the resolved addresses of undefined symbols, let's + * borrow the Elf64_Sym::st_size field (whose value is never used by + * the module loader, even for symbols that are defined) to record + * the address of a symbol's associated PLT entry as we emit it for a + * zero addend relocation (which is the only kind we have to deal with + * in practice). This allows us to find duplicates without having to + * go through the table every time. + */ + if (rela->r_addend == 0 && sym->st_size != 0) { + BUG_ON(sym->st_size < (u64)plt || sym->st_size >= (u64)&plt[i]); + return sym->st_size; + } + + mod->arch.plt_num_entries++; + BUG_ON(mod->arch.plt_num_entries > mod->arch.plt_max_entries); + + /* + * MOVK/MOVN/MOVZ opcode: + * +--------+------------+--------+-----------+-------------+---------+ + * | sf[31] | opc[30:29] | 100101 | hw[22:21] | imm16[20:5] | Rd[4:0] | + * +--------+------------+--------+-----------+-------------+---------+ + * + * Rd := 0x10 (x16) + * hw := 0b00 (no shift), 0b01 (lsl #16), 0b10 (lsl #32) + * opc := 0b11 (MOVK), 0b00 (MOVN), 0b10 (MOVZ) + * sf := 1 (64-bit variant) + */ + plt[i] = (struct plt_entry){ + cpu_to_le32(0x92800010 | (((~val ) & 0xffff)) << 5), + cpu_to_le32(0xf2a00010 | ((( val >> 16) & 0xffff)) << 5), + cpu_to_le32(0xf2c00010 | ((( val >> 32) & 0xffff)) << 5), + cpu_to_le32(0xd61f0200) + }; + + if (rela->r_addend == 0) + sym->st_size = (u64)&plt[i]; + + return (u64)&plt[i]; +} + +#define cmp_3way(a,b) ((a) < (b) ? -1 : (a) > (b)) + +static int cmp_rela(const void *a, const void *b) +{ + const Elf64_Rela *x = a, *y = b; + int i; + + /* sort by type, symbol index and addend */ + i = cmp_3way(ELF64_R_TYPE(x->r_info), ELF64_R_TYPE(y->r_info)); + if (i == 0) + i = cmp_3way(ELF64_R_SYM(x->r_info), ELF64_R_SYM(y->r_info)); + if (i == 0) + i = cmp_3way(x->r_addend, y->r_addend); + return i; +} + +static bool duplicate_rel(const Elf64_Rela *rela, int num) +{ + /* + * Entries are sorted by type, symbol index and addend. That means + * that, if a duplicate entry exists, it must be in the preceding + * slot. + */ + return num > 0 && cmp_rela(rela + num, rela + num - 1) == 0; +} + +static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num) +{ + unsigned int ret = 0; + Elf64_Sym *s; + int i; + + for (i = 0; i < num; i++) { + switch (ELF64_R_TYPE(rela[i].r_info)) { + case R_AARCH64_JUMP26: + case R_AARCH64_CALL26: + /* + * We only have to consider branch targets that resolve + * to undefined symbols. This is not simply a heuristic, + * it is a fundamental limitation, since the PLT itself + * is part of the module, and needs to be within 128 MB + * as well, so modules can never grow beyond that limit. + */ + s = syms + ELF64_R_SYM(rela[i].r_info); + if (s->st_shndx != SHN_UNDEF) + break; + + /* + * Jump relocations with non-zero addends against + * undefined symbols are supported by the ELF spec, but + * do not occur in practice (e.g., 'jump n bytes past + * the entry point of undefined function symbol f'). + * So we need to support them, but there is no need to + * take them into consideration when trying to optimize + * this code. So let's only check for duplicates when + * the addend is zero: this allows us to record the PLT + * entry address in the symbol table itself, rather than + * having to search the list for duplicates each time we + * emit one. 
+ */ + if (rela[i].r_addend != 0 || !duplicate_rel(rela, i)) + ret++; + break; + } + } + return ret; +} + +int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) +{ + unsigned long plt_max_entries = 0; + Elf64_Sym *syms = NULL; + int i; + + /* + * Find the empty .plt section so we can expand it to store the PLT + * entries. Record the symtab address as well. + */ + for (i = 0; i < ehdr->e_shnum; i++) { + if (strcmp(".plt", secstrings + sechdrs[i].sh_name) == 0) + mod->arch.plt = sechdrs + i; + else if (sechdrs[i].sh_type == SHT_SYMTAB) + syms = (Elf64_Sym *)sechdrs[i].sh_addr; + } + + if (!mod->arch.plt) { + pr_err("%s: module PLT section missing\n", mod->name); + return -ENOEXEC; + } + if (!syms) { + pr_err("%s: module symtab section missing\n", mod->name); + return -ENOEXEC; + } + + for (i = 0; i < ehdr->e_shnum; i++) { + Elf64_Rela *rels = (void *)ehdr + sechdrs[i].sh_offset; + int numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela); + Elf64_Shdr *dstsec = sechdrs + sechdrs[i].sh_info; + + if (sechdrs[i].sh_type != SHT_RELA) + continue; + + /* ignore relocations that operate on non-exec sections */ + if (!(dstsec->sh_flags & SHF_EXECINSTR)) + continue; + + /* sort by type, symbol index and addend */ + sort(rels, numrels, sizeof(Elf64_Rela), cmp_rela, NULL); + + plt_max_entries += count_plts(syms, rels, numrels); + } + + mod->arch.plt->sh_type = SHT_NOBITS; + mod->arch.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + mod->arch.plt->sh_addralign = L1_CACHE_BYTES; + mod->arch.plt->sh_size = plt_max_entries * sizeof(struct plt_entry); + mod->arch.plt_num_entries = 0; + mod->arch.plt_max_entries = plt_max_entries; + return 0; +} diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 93e970231ca9..a9dde97f5ca5 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -38,6 +38,21 @@ void *module_alloc(unsigned long size) GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); + if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && + !IS_ENABLED(CONFIG_KASAN)) + /* + * KASAN can only deal with module allocations being served + * from the reserved module region, since the remainder of + * the vmalloc region is already backed by zero shadow pages, + * and punching holes into it is non-trivial. Since the module + * region is not randomized when KASAN is enabled, it is even + * less likely that the module region gets exhausted, so we + * can simply omit this fallback in that case. 
+ */ + p = __vmalloc_node_range(size, MODULE_ALIGN, VMALLOC_START, + VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, + NUMA_NO_NODE, __builtin_return_address(0)); + if (p && (kasan_module_alloc(p, size) < 0)) { vfree(p); return NULL; @@ -361,6 +376,13 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_CALL26: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26, AARCH64_INSN_IMM_26); + + if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && + ovf == -ERANGE) { + val = module_emit_plt_entry(me, &rel[i], sym); + ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, + 26, AARCH64_INSN_IMM_26); + } break; default: diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/kernel/module.lds new file mode 100644 index 000000000000..8949f6c6f729 --- /dev/null +++ b/arch/arm64/kernel/module.lds @@ -0,0 +1,3 @@ +SECTIONS { + .plt (NOLOAD) : { BYTE(0) } +} From ee1ef5eefa116dd23b1311c019e6d4d115bb1f4d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 26 Dec 2015 13:48:02 +0100 Subject: [PATCH 0431/3220] UPSTREAM: arm64: avoid R_AARCH64_ABS64 relocations for Image header fields Unfortunately, the current way of using the linker to emit build time constants into the Image header will no longer work once we switch to the use of PIE executables. The reason is that such constants are emitted into the binary using R_AARCH64_ABS64 relocations, which are resolved at runtime, not at build time, and the places targeted by those relocations will contain zeroes before that. So refactor the endian swapping linker script constant generation code so that it emits the upper and lower 32-bit words separately. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 6ad1fe5d9077a1ab40bf74b61994d2e770b00b14) Signed-off-by: Jeff Vander Stoep Change-Id: Iaa809a0b5fcf628e1e49cd6aaa0f31f31ce95c23 --- arch/arm64/include/asm/assembler.h | 11 ++++++++++ arch/arm64/kernel/head.S | 6 +++--- arch/arm64/kernel/image.h | 32 ++++++++++++++++++------------ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index bb7b72734c24..ba5aff6c830e 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -215,4 +215,15 @@ lr .req x30 // link register .size __pi_##x, . - x; \ ENDPROC(x) + /* + * Emit a 64-bit absolute little endian symbol reference in a way that + * ensures that it will be resolved at build time, even when building a + * PIE binary. This requires cooperation from the linker script, which + * must emit the lo32/hi32 halves individually. 
+ */ + .macro le64sym, sym + .long \sym\()_lo32 + .long \sym\()_hi32 + .endm + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 05b98289093e..f076debf392d 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -83,9 +83,9 @@ efi_head: b stext // branch to kernel start, magic .long 0 // reserved #endif - .quad _kernel_offset_le // Image load offset from start of RAM, little-endian - .quad _kernel_size_le // Effective size of kernel image, little-endian - .quad _kernel_flags_le // Informative flags, little-endian + le64sym _kernel_offset_le // Image load offset from start of RAM, little-endian + le64sym _kernel_size_le // Effective size of kernel image, little-endian + le64sym _kernel_flags_le // Informative flags, little-endian .quad 0 // reserved .quad 0 // reserved .quad 0 // reserved diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index c9c62cab25a4..db1bf57948f1 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -26,21 +26,27 @@ * There aren't any ELF relocations we can use to endian-swap values known only * at link time (e.g. the subtraction of two symbol addresses), so we must get * the linker to endian-swap certain values before emitting them. + * + * Note that, in order for this to work when building the ELF64 PIE executable + * (for KASLR), these values should not be referenced via R_AARCH64_ABS64 + * relocations, since these are fixed up at runtime rather than at build time + * when PIE is in effect. So we need to split them up in 32-bit high and low + * words. */ #ifdef CONFIG_CPU_BIG_ENDIAN -#define DATA_LE64(data) \ - ((((data) & 0x00000000000000ff) << 56) | \ - (((data) & 0x000000000000ff00) << 40) | \ - (((data) & 0x0000000000ff0000) << 24) | \ - (((data) & 0x00000000ff000000) << 8) | \ - (((data) & 0x000000ff00000000) >> 8) | \ - (((data) & 0x0000ff0000000000) >> 24) | \ - (((data) & 0x00ff000000000000) >> 40) | \ - (((data) & 0xff00000000000000) >> 56)) +#define DATA_LE32(data) \ + ((((data) & 0x000000ff) << 24) | \ + (((data) & 0x0000ff00) << 8) | \ + (((data) & 0x00ff0000) >> 8) | \ + (((data) & 0xff000000) >> 24)) #else -#define DATA_LE64(data) ((data) & 0xffffffffffffffff) +#define DATA_LE32(data) ((data) & 0xffffffff) #endif +#define DEFINE_IMAGE_LE64(sym, data) \ + sym##_lo32 = DATA_LE32((data) & 0xffffffff); \ + sym##_hi32 = DATA_LE32((data) >> 32) + #ifdef CONFIG_CPU_BIG_ENDIAN #define __HEAD_FLAG_BE 1 #else @@ -61,9 +67,9 @@ * endian swapped in head.S, all are done here for consistency. */ #define HEAD_SYMBOLS \ - _kernel_size_le = DATA_LE64(_end - _text); \ - _kernel_offset_le = DATA_LE64(TEXT_OFFSET); \ - _kernel_flags_le = DATA_LE64(__HEAD_FLAGS); + DEFINE_IMAGE_LE64(_kernel_size_le, _end - _text); \ + DEFINE_IMAGE_LE64(_kernel_offset_le, TEXT_OFFSET); \ + DEFINE_IMAGE_LE64(_kernel_flags_le, __HEAD_FLAGS); #ifdef CONFIG_EFI From 68c7746e3ca999b2d3c75af8ac714ad28c1a4288 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 26 Dec 2015 12:46:40 +0100 Subject: [PATCH 0432/3220] UPSTREAM: arm64: avoid dynamic relocations in early boot code Before implementing KASLR for arm64 by building a self-relocating PIE executable, we have to ensure that values we use before the relocation routine is executed are not subject to dynamic relocation themselves. This applies not only to virtual addresses, but also to values that are supplied by the linker at build time and relocated using R_AARCH64_ABS64 relocations. 
So instead, use assemble time constants, or force the use of static relocations by folding the constants into the instructions. Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 2bf31a4a05f5b00f37d65ba029d36a0230286cb7) Signed-off-by: Jeff Vander Stoep Change-Id: Icce0176591e3c0ae444e1ea54258efe677933c5b --- arch/arm64/kernel/efi-entry.S | 2 +- arch/arm64/kernel/head.S | 39 +++++++++++++++++++++++------------ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index a773db92908b..f82036e02485 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -61,7 +61,7 @@ ENTRY(entry) */ mov x20, x0 // DTB address ldr x0, [sp, #16] // relocated _text address - ldr x21, =stext_offset + movz x21, #:abs_g0:stext_offset add x21, x0, x21 /* diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index f076debf392d..4cad8f9f2268 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -67,12 +67,11 @@ * in the entry routines. */ __HEAD - +_head: /* * DO NOT MODIFY. Image header expected by Linux boot-loaders. */ #ifdef CONFIG_EFI -efi_head: /* * This add instruction has no meaningful effect except that * its opcode forms the magic "MZ" signature required by UEFI. @@ -94,14 +93,14 @@ efi_head: .byte 0x4d .byte 0x64 #ifdef CONFIG_EFI - .long pe_header - efi_head // Offset to the PE header. + .long pe_header - _head // Offset to the PE header. #else .word 0 // reserved #endif #ifdef CONFIG_EFI .globl __efistub_stext_offset - .set __efistub_stext_offset, stext - efi_head + .set __efistub_stext_offset, stext - _head .align 3 pe_header: .ascii "PE" @@ -124,7 +123,7 @@ optional_header: .long _end - stext // SizeOfCode .long 0 // SizeOfInitializedData .long 0 // SizeOfUninitializedData - .long __efistub_entry - efi_head // AddressOfEntryPoint + .long __efistub_entry - _head // AddressOfEntryPoint .long __efistub_stext_offset // BaseOfCode extra_header_fields: @@ -139,7 +138,7 @@ extra_header_fields: .short 0 // MinorSubsystemVersion .long 0 // Win32VersionValue - .long _end - efi_head // SizeOfImage + .long _end - _head // SizeOfImage // Everything before the kernel image is considered part of the header .long __efistub_stext_offset // SizeOfHeaders @@ -219,11 +218,13 @@ ENTRY(stext) * On return, the CPU will be ready for the MMU to be turned on and * the TCR will have been set. */ - ldr x27, =__mmap_switched // address to jump to after + ldr x27, 0f // address to jump to after // MMU has been enabled adr_l lr, __enable_mmu // return (PIC) address b __cpu_setup // initialise processor ENDPROC(stext) + .align 3 +0: .quad __mmap_switched - (_head - TEXT_OFFSET) + KIMAGE_VADDR /* * Preserve the arguments passed by the bootloader in x0 .. 
x3 @@ -391,7 +392,8 @@ __create_page_tables: mov x0, x26 // swapper_pg_dir ldr x5, =KIMAGE_VADDR create_pgd_entry x0, x5, x3, x6 - ldr x6, =KERNEL_END // __va(KERNEL_END) + ldr w6, kernel_img_size + add x6, x6, x5 mov x3, x24 // phys offset create_block_map x0, x7, x3, x5, x6 @@ -408,6 +410,9 @@ __create_page_tables: mov lr, x27 ret ENDPROC(__create_page_tables) + +kernel_img_size: + .long _end - (_head - TEXT_OFFSET) .ltorg /* @@ -415,6 +420,10 @@ ENDPROC(__create_page_tables) */ .set initial_sp, init_thread_union + THREAD_START_SP __mmap_switched: + adr_l x8, vectors // load VBAR_EL1 with virtual + msr vbar_el1, x8 // vector table address + isb + // Clear BSS adr_l x0, __bss_start mov x1, xzr @@ -610,13 +619,19 @@ ENTRY(secondary_startup) adrp x26, swapper_pg_dir bl __cpu_setup // initialise processor - ldr x21, =secondary_data - ldr x27, =__secondary_switched // address to jump to after enabling the MMU + ldr x8, =KIMAGE_VADDR + ldr w9, 0f + sub x27, x8, w9, sxtw // address to jump to after enabling the MMU b __enable_mmu ENDPROC(secondary_startup) +0: .long (_text - TEXT_OFFSET) - __secondary_switched ENTRY(__secondary_switched) - ldr x0, [x21] // get secondary_data.stack + adr_l x5, vectors + msr vbar_el1, x5 + isb + + ldr_l x0, secondary_data // get secondary_data.stack mov sp, x0 and x0, x0, #~(THREAD_SIZE - 1) msr sp_el0, x0 // save thread_info @@ -641,8 +656,6 @@ __enable_mmu: ubfx x2, x1, #ID_AA64MMFR0_TGRAN_SHIFT, 4 cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED b.ne __no_granule_support - ldr x5, =vectors - msr vbar_el1, x5 msr ttbr0_el1, x25 // load TTBR0 msr ttbr1_el1, x26 // load TTBR1 isb From d78098f62acbb27b7b222b8e3a517581d0842a9f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jan 2016 17:08:26 +0100 Subject: [PATCH 0433/3220] UPSTREAM: arm64: make asm/elf.h available to asm files This reshuffles some code in asm/elf.h and puts a #ifndef __ASSEMBLY__ around its C definitions so that the CPP defines can be used in asm source files as well. Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 4a2e034e5cdadde4c712f79bdd57d1455c76a3db) Signed-off-by: Jeff Vander Stoep Change-Id: Ic499e950d2ef297d10848862a6dfa07b90887f4c --- arch/arm64/include/asm/elf.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index faad6df49e5b..435f55952e1f 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -24,15 +24,6 @@ #include #include -typedef unsigned long elf_greg_t; - -#define ELF_NGREG (sizeof(struct user_pt_regs) / sizeof(elf_greg_t)) -#define ELF_CORE_COPY_REGS(dest, regs) \ - *(struct user_pt_regs *)&(dest) = (regs)->user_regs; - -typedef elf_greg_t elf_gregset_t[ELF_NGREG]; -typedef struct user_fpsimd_state elf_fpregset_t; - /* * AArch64 static relocation types. */ @@ -127,6 +118,17 @@ typedef struct user_fpsimd_state elf_fpregset_t; */ #define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) +#ifndef __ASSEMBLY__ + +typedef unsigned long elf_greg_t; + +#define ELF_NGREG (sizeof(struct user_pt_regs) / sizeof(elf_greg_t)) +#define ELF_CORE_COPY_REGS(dest, regs) \ + *(struct user_pt_regs *)&(dest) = (regs)->user_regs; + +typedef elf_greg_t elf_gregset_t[ELF_NGREG]; +typedef struct user_fpsimd_state elf_fpregset_t; + /* * When the program starts, a1 contains a pointer to a function to be * registered with atexit, as per the SVR4 ABI. 
A value of 0 means we have no @@ -186,4 +188,6 @@ extern int aarch32_setup_vectors_page(struct linux_binprm *bprm, #endif /* CONFIG_COMPAT */ +#endif /* !__ASSEMBLY__ */ + #endif From e0357208fbf9e2c9c33af1900c52c2cbaefbe910 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 2 Feb 2016 15:53:59 +0000 Subject: [PATCH 0434/3220] UPSTREAM: arm64: futex.h: Add missing PAN toggling futex.h's futex_atomic_cmpxchg_inatomic() does not use the __futex_atomic_op() macro and needs its own PAN toggling. This was missed when the feature was implemented. Fixes: 338d4f49d6f ("arm64: kernel: Add support for Privileged Access Never") Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 811d61e384e24759372bb3f01772f3744b0a8327) Signed-off-by: Jeff Vander Stoep Change-Id: I6e7b338a1af17b784d4196101422c3acee3b88ed --- arch/arm64/include/asm/futex.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 007a69fc4f40..5f3ab8c1db55 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -121,6 +121,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, return -EFAULT; asm volatile("// futex_atomic_cmpxchg_inatomic\n" +ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" " sub %w3, %w1, %w4\n" @@ -137,6 +138,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, " .align 3\n" " .quad 1b, 4b, 2b, 4b\n" " .popsection\n" +ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) : "memory"); From eebed210c5e62cfe0f9f8de4173099f2d32faae9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 10 Jan 2016 11:42:28 +0100 Subject: [PATCH 0435/3220] UPSTREAM: scripts/sortextable: add support for ET_DYN binaries Add support to scripts/sortextable for handling relocatable (PIE) executables, whose ELF type is ET_DYN, not ET_EXEC. Other than adding support for the new type, no changes are needed. 
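For illustration, the acceptance test described above reduces to inspecting e_type in the ELF header. A minimal user-space sketch follows, assuming a little-endian host (the real tool reads the field through its endian-aware r2() accessor); the helper name is hypothetical:

#include <elf.h>
#include <stdbool.h>
#include <string.h>

/* Accept fixed-address (ET_EXEC) and relocatable PIE (ET_DYN) images alike. */
static bool is_sortable_elf(const unsigned char *image)
{
        const Elf64_Ehdr *ehdr = (const Elf64_Ehdr *)image;

        if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
                return false;
        if (ehdr->e_ident[EI_VERSION] != EV_CURRENT)
                return false;
        return ehdr->e_type == ET_EXEC || ehdr->e_type == ET_DYN;
}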
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 7b957b6e603623ef8b2e8222fa94b976df613fa2) Signed-off-by: Jeff Vander Stoep Change-Id: If55296ef4934b99c38ceb5acbd7c4a7fb23f24c1 --- scripts/sortextable.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/sortextable.c b/scripts/sortextable.c index c2423d913b46..ecefa0a634f8 100644 --- a/scripts/sortextable.c +++ b/scripts/sortextable.c @@ -266,9 +266,9 @@ do_file(char const *const fname) break; } /* end switch */ if (memcmp(ELFMAG, ehdr->e_ident, SELFMAG) != 0 - || r2(&ehdr->e_type) != ET_EXEC + || (r2(&ehdr->e_type) != ET_EXEC && r2(&ehdr->e_type) != ET_DYN) || ehdr->e_ident[EI_VERSION] != EV_CURRENT) { - fprintf(stderr, "unrecognized ET_EXEC file %s\n", fname); + fprintf(stderr, "unrecognized ET_EXEC/ET_DYN file %s\n", fname); fail_file(); } @@ -304,7 +304,7 @@ do_file(char const *const fname) if (r2(&ehdr->e_ehsize) != sizeof(Elf32_Ehdr) || r2(&ehdr->e_shentsize) != sizeof(Elf32_Shdr)) { fprintf(stderr, - "unrecognized ET_EXEC file: %s\n", fname); + "unrecognized ET_EXEC/ET_DYN file: %s\n", fname); fail_file(); } do32(ehdr, fname, custom_sort); @@ -314,7 +314,7 @@ do_file(char const *const fname) if (r2(&ghdr->e_ehsize) != sizeof(Elf64_Ehdr) || r2(&ghdr->e_shentsize) != sizeof(Elf64_Shdr)) { fprintf(stderr, - "unrecognized ET_EXEC file: %s\n", fname); + "unrecognized ET_EXEC/ET_DYN file: %s\n", fname); fail_file(); } do64(ghdr, fname, custom_sort); From 27375cf76085992a805dc5ad1db397e7227717c9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 1 Jan 2016 12:39:09 +0100 Subject: [PATCH 0436/3220] UPSTREAM: extable: add support for relative extables to search and sort routines This adds support to the generic search_extable() and sort_extable() implementations for dealing with exception table entries whose fields contain relative offsets rather than absolute addresses. Acked-by: Helge Deller Acked-by: Heiko Carstens Acked-by: H. Peter Anvin Acked-by: Tony Luck Acked-by: Will Deacon Acked-by: Richard Henderson Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a272858a3c1ecd4a935ba23c66668f81214bd110) Signed-off-by: Jeff Vander Stoep Change-Id: I9d144d351d547c49bf3203a69dfff3cb71a51177 --- lib/extable.c | 50 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/lib/extable.c b/lib/extable.c index 4cac81ec225e..0be02ad561e9 100644 --- a/lib/extable.c +++ b/lib/extable.c @@ -14,7 +14,37 @@ #include #include +#ifndef ARCH_HAS_RELATIVE_EXTABLE +#define ex_to_insn(x) ((x)->insn) +#else +static inline unsigned long ex_to_insn(const struct exception_table_entry *x) +{ + return (unsigned long)&x->insn + x->insn; +} +#endif + #ifndef ARCH_HAS_SORT_EXTABLE +#ifndef ARCH_HAS_RELATIVE_EXTABLE +#define swap_ex NULL +#else +static void swap_ex(void *a, void *b, int size) +{ + struct exception_table_entry *x = a, *y = b, tmp; + int delta = b - a; + + tmp = *x; + x->insn = y->insn + delta; + y->insn = tmp.insn - delta; + +#ifdef swap_ex_entry_fixup + swap_ex_entry_fixup(x, y, tmp, delta); +#else + x->fixup = y->fixup + delta; + y->fixup = tmp.fixup - delta; +#endif +} +#endif /* ARCH_HAS_RELATIVE_EXTABLE */ + /* * The exception table needs to be sorted so that the binary * search that we use to find entries in it works properly. 
@@ -26,9 +56,9 @@ static int cmp_ex(const void *a, const void *b) const struct exception_table_entry *x = a, *y = b; /* avoid overflow */ - if (x->insn > y->insn) + if (ex_to_insn(x) > ex_to_insn(y)) return 1; - if (x->insn < y->insn) + if (ex_to_insn(x) < ex_to_insn(y)) return -1; return 0; } @@ -37,7 +67,7 @@ void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish) { sort(start, finish - start, sizeof(struct exception_table_entry), - cmp_ex, NULL); + cmp_ex, swap_ex); } #ifdef CONFIG_MODULES @@ -48,13 +78,15 @@ void sort_extable(struct exception_table_entry *start, void trim_init_extable(struct module *m) { /*trim the beginning*/ - while (m->num_exentries && within_module_init(m->extable[0].insn, m)) { + while (m->num_exentries && + within_module_init(ex_to_insn(&m->extable[0]), m)) { m->extable++; m->num_exentries--; } /*trim the end*/ while (m->num_exentries && - within_module_init(m->extable[m->num_exentries-1].insn, m)) + within_module_init(ex_to_insn(&m->extable[m->num_exentries - 1]), + m)) m->num_exentries--; } #endif /* CONFIG_MODULES */ @@ -81,13 +113,13 @@ search_extable(const struct exception_table_entry *first, * careful, the distance between value and insn * can be larger than MAX_LONG: */ - if (mid->insn < value) + if (ex_to_insn(mid) < value) first = mid + 1; - else if (mid->insn > value) + else if (ex_to_insn(mid) > value) last = mid - 1; else return mid; - } - return NULL; + } + return NULL; } #endif From 161994d19e72907ec95e545d1129675c9c966f1a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 1 Jan 2016 15:02:12 +0100 Subject: [PATCH 0437/3220] UPSTREAM: arm64: switch to relative exception tables Instead of using absolute addresses for both the exception location and the fixup, use offsets relative to the exception table entry values. Not only does this cut the size of the exception table in half, it is also a prerequisite for KASLR, since absolute exception table entries are subject to dynamic relocation, which is incompatible with the sorting of the exception table that occurs at build time. This patch also introduces the _ASM_EXTABLE preprocessor macro (which exists on x86 as well) and its _asm_extable assembly counterpart, as shorthands to emit exception table entries. 
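The relative scheme is compact enough to state in a few lines of C. This is an illustrative sketch only: the struct layout mirrors the patch, but the two helpers are named for exposition rather than taken from the kernel:

struct exception_table_entry {
        int insn, fixup;        /* self-relative offsets, not absolute addresses */
};

/* Absolute address of the instruction that is allowed to fault. */
static inline unsigned long ex_insn_addr(const struct exception_table_entry *e)
{
        return (unsigned long)&e->insn + e->insn;
}

/* Absolute address at which execution resumes after the fault. */
static inline unsigned long ex_fixup_addr(const struct exception_table_entry *e)
{
        return (unsigned long)&e->fixup + e->fixup;
}

Each 32-bit field only needs to span the distance between the entry and its target, which is why the table shrinks to half its former size.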
Acked-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 6c94f27ac847ff8ef15b3da5b200574923bd6287) Signed-off-by: Jeff Vander Stoep Change-Id: Icedda8ee8c32843c439765783816d7d71ca0073a --- arch/arm64/include/asm/alternative.h | 19 +++++----------- arch/arm64/include/asm/assembler.h | 15 +++++++++---- arch/arm64/include/asm/futex.h | 12 ++++------ arch/arm64/include/asm/uaccess.h | 30 +++++++++++++------------ arch/arm64/include/asm/word-at-a-time.h | 7 +++--- arch/arm64/kernel/armv8_deprecated.c | 7 ++---- arch/arm64/mm/extable.c | 2 +- scripts/sortextable.c | 2 +- 8 files changed, 43 insertions(+), 51 deletions(-) diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index a9fc24ec1aa9..beccbdefa106 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -157,11 +157,8 @@ void apply_alternatives(void *start, size_t length); add \addr, \addr, \post_inc; alternative_endif - .section __ex_table,"a"; - .align 3; - .quad 8888b,\l; - .quad 8889b,\l; - .previous; + _asm_extable 8888b,\l; + _asm_extable 8889b,\l; .endm .macro uao_stp l, reg1, reg2, addr, post_inc @@ -175,11 +172,8 @@ void apply_alternatives(void *start, size_t length); add \addr, \addr, \post_inc; alternative_endif - .section __ex_table,"a"; - .align 3; - .quad 8888b,\l; - .quad 8889b,\l; - .previous + _asm_extable 8888b,\l; + _asm_extable 8889b,\l; .endm .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc @@ -191,10 +185,7 @@ void apply_alternatives(void *start, size_t length); add \addr, \addr, \post_inc; alternative_endif - .section __ex_table,"a"; - .align 3; - .quad 8888b,\l; - .previous + _asm_extable 8888b,\l; .endm #else .macro uao_ldp l, reg1, reg2, addr, post_inc diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index ba5aff6c830e..70f7b9e04598 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -94,12 +94,19 @@ dmb \opt .endm +/* + * Emit an entry into the exception table + */ + .macro _asm_extable, from, to + .pushsection __ex_table, "a" + .align 3 + .long (\from - .), (\to - .) + .popsection + .endm + #define USER(l, x...) \ 9999: x; \ - .section __ex_table,"a"; \ - .align 3; \ - .quad 9999b,l; \ - .previous + _asm_extable 9999b, l /* * Register aliases. 
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 5f3ab8c1db55..f2585cdd32c2 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -42,10 +42,8 @@ "4: mov %w0, %w5\n" \ " b 3b\n" \ " .popsection\n" \ -" .pushsection __ex_table,\"a\"\n" \ -" .align 3\n" \ -" .quad 1b, 4b, 2b, 4b\n" \ -" .popsection\n" \ + _ASM_EXTABLE(1b, 4b) \ + _ASM_EXTABLE(2b, 4b) \ ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) \ : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \ @@ -134,10 +132,8 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) "4: mov %w0, %w6\n" " b 3b\n" " .popsection\n" -" .pushsection __ex_table,\"a\"\n" -" .align 3\n" -" .quad 1b, 4b, 2b, 4b\n" -" .popsection\n" + _ASM_EXTABLE(1b, 4b) + _ASM_EXTABLE(2b, 4b) ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 7b07e73b6316..c3d445b42351 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -36,11 +36,11 @@ #define VERIFY_WRITE 1 /* - * The exception table consists of pairs of addresses: the first is the - * address of an instruction that is allowed to fault, and the second is - * the address at which the program should continue. No registers are - * modified, so it is entirely up to the continuation code to figure out - * what to do. + * The exception table consists of pairs of relative offsets: the first + * is the relative offset to an instruction that is allowed to fault, + * and the second is the relative offset at which the program should + * continue. No registers are modified, so it is entirely up to the + * continuation code to figure out what to do. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. 
This means when everything is well, @@ -50,9 +50,11 @@ struct exception_table_entry { - unsigned long insn, fixup; + int insn, fixup; }; +#define ARCH_HAS_RELATIVE_EXTABLE + extern int fixup_exception(struct pt_regs *regs); #define KERNEL_DS (-1UL) @@ -115,6 +117,12 @@ static inline void set_fs(mm_segment_t fs) #define access_ok(type, addr, size) __range_ok(addr, size) #define user_addr_max get_fs +#define _ASM_EXTABLE(from, to) \ + " .pushsection __ex_table, \"a\"\n" \ + " .align 3\n" \ + " .long (" #from " - .), (" #to " - .)\n" \ + " .popsection\n" + /* * The "__xxx" versions of the user access functions do not verify the address * space - it must have been done previously with a separate "access_ok()" @@ -134,10 +142,7 @@ static inline void set_fs(mm_segment_t fs) " mov %1, #0\n" \ " b 2b\n" \ " .previous\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .quad 1b, 3b\n" \ - " .previous" \ + _ASM_EXTABLE(1b, 3b) \ : "+r" (err), "=&r" (x) \ : "r" (addr), "i" (-EFAULT)) @@ -206,10 +211,7 @@ do { \ "3: mov %w0, %3\n" \ " b 2b\n" \ " .previous\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .quad 1b, 3b\n" \ - " .previous" \ + _ASM_EXTABLE(1b, 3b) \ : "+r" (err) \ : "r" (x), "r" (addr), "i" (-EFAULT)) diff --git a/arch/arm64/include/asm/word-at-a-time.h b/arch/arm64/include/asm/word-at-a-time.h index aab5bf09e9d9..2b79b8a89457 100644 --- a/arch/arm64/include/asm/word-at-a-time.h +++ b/arch/arm64/include/asm/word-at-a-time.h @@ -16,6 +16,8 @@ #ifndef __ASM_WORD_AT_A_TIME_H #define __ASM_WORD_AT_A_TIME_H +#include + #ifndef __AARCH64EB__ #include @@ -81,10 +83,7 @@ static inline unsigned long load_unaligned_zeropad(const void *addr) #endif " b 2b\n" " .popsection\n" - " .pushsection __ex_table,\"a\"\n" - " .align 3\n" - " .quad 1b, 3b\n" - " .popsection" + _ASM_EXTABLE(1b, 3b) : "=&r" (ret), "=&r" (offset) : "r" (addr), "Q" (*(unsigned long *)addr)); diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index 3e01207917b1..c37202c0c838 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -297,11 +297,8 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) "4: mov %w0, %w5\n" \ " b 3b\n" \ " .popsection" \ - " .pushsection __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .quad 0b, 4b\n" \ - " .quad 1b, 4b\n" \ - " .popsection\n" \ + _ASM_EXTABLE(0b, 4b) \ + _ASM_EXTABLE(1b, 4b) \ ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) \ : "=&r" (res), "+r" (data), "=&r" (temp) \ diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c index 79444279ba8c..81acd4706878 100644 --- a/arch/arm64/mm/extable.c +++ b/arch/arm64/mm/extable.c @@ -11,7 +11,7 @@ int fixup_exception(struct pt_regs *regs) fixup = search_exception_tables(instruction_pointer(regs)); if (fixup) - regs->pc = fixup->fixup; + regs->pc = (unsigned long)&fixup->fixup + fixup->fixup; return fixup != NULL; } diff --git a/scripts/sortextable.c b/scripts/sortextable.c index ecefa0a634f8..19d83647846c 100644 --- a/scripts/sortextable.c +++ b/scripts/sortextable.c @@ -282,12 +282,12 @@ do_file(char const *const fname) case EM_386: case EM_X86_64: case EM_S390: + case EM_AARCH64: custom_sort = sort_relative_table; break; case EM_ARCOMPACT: case EM_ARCV2: case EM_ARM: - case EM_AARCH64: case EM_MICROBLAZE: case EM_MIPS: case EM_XTENSA: From 01684359230797235a0bdbd9768241c41f990105 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 26 Jan 2016 09:13:44 +0100 Subject: [PATCH 0438/3220] 
UPSTREAM: arm64: add support for building vmlinux as a relocatable PIE binary This implements CONFIG_RELOCATABLE, which links the final vmlinux image with a dynamic relocation section, allowing the early boot code to perform a relocation to a different virtual address at runtime. This is a prerequisite for KASLR (CONFIG_RANDOMIZE_BASE). Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 1e48ef7fcc374051730381a2a05da77eb4eafdb0) Signed-off-by: Jeff Vander Stoep Change-Id: If02e065722d438f85feb62240fc230e16f58e912 --- arch/arm64/Kconfig | 11 +++++++++++ arch/arm64/Makefile | 4 ++++ arch/arm64/include/asm/elf.h | 2 ++ arch/arm64/kernel/head.S | 32 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/vmlinux.lds.S | 16 ++++++++++++++++ 5 files changed, 65 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 65a8e7c9fdd1..1faf7905f0fa 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -771,6 +771,17 @@ config ARM64_MODULE_PLTS select ARM64_MODULE_CMODEL_LARGE select HAVE_MOD_ARCH_SPECIFIC +config RELOCATABLE + bool + help + This builds the kernel as a Position Independent Executable (PIE), + which retains all relocation metadata required to relocate the + kernel binary at runtime to a different virtual address than the + address it was linked at. + Since AArch64 uses the RELA relocation format, this requires a + relocation pass at runtime even if the kernel is loaded at the + same address it was linked at. + endmenu menu "Boot options" diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 763509ff6f24..47f61e9b8fec 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -15,6 +15,10 @@ CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET) OBJCOPYFLAGS :=-O binary -R .note -R .note.gnu.build-id -R .comment -S GZFLAGS :=-9 +ifneq ($(CONFIG_RELOCATABLE),) +LDFLAGS_vmlinux += -pie +endif + KBUILD_DEFCONFIG := defconfig # Check for binutils support for specific extensions diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 435f55952e1f..24ed037f09fd 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -77,6 +77,8 @@ #define R_AARCH64_MOVW_PREL_G2_NC 292 #define R_AARCH64_MOVW_PREL_G3 293 +#define R_AARCH64_RELATIVE 1027 + /* * These are used to set parameters in the core dumps. */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 4cad8f9f2268..4e69412a7323 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -432,6 +433,37 @@ __mmap_switched: bl __pi_memset dsb ishst // Make zero page visible to PTW +#ifdef CONFIG_RELOCATABLE + + /* + * Iterate over each entry in the relocation table, and apply the + * relocations in place. + */ + adr_l x8, __dynsym_start // start of symbol table + adr_l x9, __reloc_start // start of reloc table + adr_l x10, __reloc_end // end of reloc table + +0: cmp x9, x10 + b.hs 2f + ldp x11, x12, [x9], #24 + ldr x13, [x9, #-8] + cmp w12, #R_AARCH64_RELATIVE + b.ne 1f + str x13, [x11] + b 0b + +1: cmp w12, #R_AARCH64_ABS64 + b.ne 0b + add x12, x12, x12, lsl #1 // symtab offset: 24x top word + add x12, x8, x12, lsr #(32 - 3) // ... 
shifted into bottom word + ldr x15, [x12, #8] // Elf64_Sym::st_value + add x15, x13, x15 + str x15, [x11] + b 0b + +2: +#endif + adr_l sp, initial_sp, x4 mov x4, sp and x4, x4, #~(THREAD_SIZE - 1) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 7703feba23d6..14813a6ca13b 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -87,6 +87,7 @@ SECTIONS EXIT_CALL *(.discard) *(.discard.*) + *(.interp .dynamic) } . = KIMAGE_VADDR + TEXT_OFFSET; @@ -151,6 +152,21 @@ SECTIONS .altinstr_replacement : { *(.altinstr_replacement) } + .rela : ALIGN(8) { + __reloc_start = .; + *(.rela .rela*) + __reloc_end = .; + } + .dynsym : ALIGN(8) { + __dynsym_start = .; + *(.dynsym) + } + .dynstr : { + *(.dynstr) + } + .hash : { + *(.hash) + } . = ALIGN(PAGE_SIZE); __init_end = .; From a91a00b66c794c81d7afd04f9d2a5385952a24e1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 26 Jan 2016 14:12:01 +0100 Subject: [PATCH 0439/3220] BACKPORT: arm64: add support for kernel ASLR This adds support for KASLR, based on entropy provided by the bootloader in the /chosen/kaslr-seed DT property. Depending on the size of the address space (VA_BITS) and the page size, the entropy in the virtual displacement is up to 13 bits (16k/2 levels) and up to 25 bits (all 4 levels), with the sidenote that displacements that result in the kernel image straddling a 1GB/32MB/512MB alignment boundary (for 4KB/16KB/64KB granule kernels, respectively) are not allowed, and will be rounded up to an acceptable value. If CONFIG_RANDOMIZE_MODULE_REGION_FULL is enabled, the module region is randomized independently from the core kernel. This makes it less likely that the location of core kernel data structures can be determined by an adversary, but causes all function calls from modules into the core kernel to be resolved via entries in the module PLTs. If CONFIG_RANDOMIZE_MODULE_REGION_FULL is not enabled, the module region is randomized by choosing a page aligned 128 MB region inside the interval [_etext - 128 MB, _stext + 128 MB). This gives between 10 and 14 bits of entropy (depending on page size), independently of the kernel randomization, but still guarantees that modules are within the range of relative branch and jump instructions (with the caveat that, since the module region is shared with other uses of the vmalloc area, modules may need to be loaded further away if the module region is exhausted). Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit f80fb3a3d50843a401dac4b566b3b131da8077a2) Signed-off-by: Jeff Vander Stoep Change-Id: I3f5fafa4e92e5ff39259d57065541366237eb021 --- arch/arm64/Kconfig | 29 ++++++ arch/arm64/include/asm/memory.h | 5 +- arch/arm64/include/asm/module.h | 6 ++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/head.S | 59 +++++++++-- arch/arm64/kernel/kaslr.c | 173 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/module.c | 3 +- arch/arm64/kernel/setup.c | 29 ++++++ arch/arm64/mm/kasan_init.c | 17 +++- arch/arm64/mm/mmu.c | 29 ++++-- 10 files changed, 329 insertions(+), 22 deletions(-) create mode 100644 arch/arm64/kernel/kaslr.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1faf7905f0fa..a42e3235484b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -782,6 +782,35 @@ config RELOCATABLE relocation pass at runtime even if the kernel is loaded at the same address it was linked at. 
+config RANDOMIZE_BASE + bool "Randomize the address of the kernel image" + select ARM64_MODULE_PLTS + select RELOCATABLE + help + Randomizes the virtual address at which the kernel image is + loaded, as a security feature that deters exploit attempts + relying on knowledge of the location of kernel internals. + + It is the bootloader's job to provide entropy, by passing a + random u64 value in /chosen/kaslr-seed at kernel entry. + + If unsure, say N. + +config RANDOMIZE_MODULE_REGION_FULL + bool "Randomize the module region independently from the core kernel" + depends on RANDOMIZE_BASE + default y + help + Randomizes the location of the module region without considering the + location of the core kernel. This way, it is impossible for modules + to leak information about the location of core kernel data structures + but it does imply that function calls between modules and the core + kernel will need to be resolved via veneers in the module PLT. + + When this option is not set, the module region will be randomized over + a limited range that contains the [_stext, _etext] interval of the + core kernel, so branch relocations are always in range. + endmenu menu "Boot options" diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index eb798156cf56..5f8667a99e41 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -53,7 +53,7 @@ #define KIMAGE_VADDR (MODULES_END) #define MODULES_END (MODULES_VADDR + MODULES_VSIZE) #define MODULES_VADDR (VA_START + KASAN_SHADOW_SIZE) -#define MODULES_VSIZE (SZ_64M) +#define MODULES_VSIZE (SZ_128M) #define PCI_IO_END (PAGE_OFFSET - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) #define FIXADDR_TOP (PCI_IO_START - SZ_2M) @@ -139,6 +139,9 @@ extern phys_addr_t memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. 
*/ #define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; }) +/* the virtual base of the kernel image (minus TEXT_OFFSET) */ +extern u64 kimage_vaddr; + /* the offset between the kernel virtual and physical mappings */ extern u64 kimage_voffset; diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index 8652fb613304..e12af6754634 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -31,4 +31,10 @@ struct mod_arch_specific { u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, Elf64_Sym *sym); +#ifdef CONFIG_RANDOMIZE_BASE +extern u64 module_alloc_base; +#else +#define module_alloc_base ((u64)_etext - MODULES_VSIZE) +#endif + #endif /* __ASM_MODULE_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 8d971f9c6ed5..49a2430b0786 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -43,6 +43,7 @@ arm64-obj-$(CONFIG_PCI) += pci.o arm64-obj-$(CONFIG_ARMV8_DEPRECATED) += armv8_deprecated.o arm64-obj-$(CONFIG_ACPI) += acpi.o arm64-obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o +arm64-obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-y += $(arm64-obj-y) vdso/ obj-m += $(arm64-obj-m) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 4e69412a7323..319f896c6e74 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -210,6 +210,7 @@ section_table: ENTRY(stext) bl preserve_boot_args bl el2_setup // Drop to EL1, w20=cpu_boot_mode + mov x23, xzr // KASLR offset, defaults to 0 adrp x24, __PHYS_OFFSET bl set_cpu_boot_mode_flag bl __create_page_tables // x25=TTBR0, x26=TTBR1 @@ -313,7 +314,7 @@ ENDPROC(preserve_boot_args) __create_page_tables: adrp x25, idmap_pg_dir adrp x26, swapper_pg_dir - mov x27, lr + mov x28, lr /* * Invalidate the idmap and swapper page tables to avoid potential @@ -392,6 +393,7 @@ __create_page_tables: */ mov x0, x26 // swapper_pg_dir ldr x5, =KIMAGE_VADDR + add x5, x5, x23 // add KASLR displacement create_pgd_entry x0, x5, x3, x6 ldr w6, kernel_img_size add x6, x6, x5 @@ -408,8 +410,7 @@ __create_page_tables: dmb sy bl __inval_cache_range - mov lr, x27 - ret + ret x28 ENDPROC(__create_page_tables) kernel_img_size: @@ -421,6 +422,7 @@ kernel_img_size: */ .set initial_sp, init_thread_union + THREAD_START_SP __mmap_switched: + mov x28, lr // preserve LR adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address isb @@ -449,19 +451,26 @@ __mmap_switched: ldr x13, [x9, #-8] cmp w12, #R_AARCH64_RELATIVE b.ne 1f - str x13, [x11] + add x13, x13, x23 // relocate + str x13, [x11, x23] b 0b 1: cmp w12, #R_AARCH64_ABS64 b.ne 0b add x12, x12, x12, lsl #1 // symtab offset: 24x top word add x12, x8, x12, lsr #(32 - 3) // ... shifted into bottom word + ldrsh w14, [x12, #6] // Elf64_Sym::st_shndx ldr x15, [x12, #8] // Elf64_Sym::st_value + cmp w14, #-0xf // SHN_ABS (0xfff1) ? 
+ add x14, x15, x23 // relocate + csel x15, x14, x15, ne add x15, x13, x15 - str x15, [x11] + str x15, [x11, x23] b 0b -2: +2: adr_l x8, kimage_vaddr // make relocated kimage_vaddr + dc cvac, x8 // value visible to secondaries + dsb sy // with MMU off #endif adr_l sp, initial_sp, x4 @@ -470,13 +479,23 @@ __mmap_switched: msr sp_el0, x4 // Save thread_info str_l x21, __fdt_pointer, x5 // Save FDT pointer - ldr x4, =KIMAGE_VADDR // Save the offset between + ldr_l x4, kimage_vaddr // Save the offset between sub x4, x4, x24 // the kernel virtual and str_l x4, kimage_voffset, x5 // physical mappings mov x29, #0 #ifdef CONFIG_KASAN bl kasan_early_init +#endif +#ifdef CONFIG_RANDOMIZE_BASE + cbnz x23, 0f // already running randomized? + mov x0, x21 // pass FDT address in x0 + bl kaslr_early_init // parse FDT for KASLR options + cbz x0, 0f // KASLR disabled? just proceed + mov x23, x0 // record KASLR offset + ret x28 // we must enable KASLR, return + // to __enable_mmu() +0: #endif b start_kernel ENDPROC(__mmap_switched) @@ -486,6 +505,10 @@ ENDPROC(__mmap_switched) * hotplug and needs to have the same protections as the text region */ .section ".text","ax" + +ENTRY(kimage_vaddr) + .quad _text - TEXT_OFFSET + /* * If we're fortunate enough to boot at EL2, ensure that the world is * sane before dropping to EL1. @@ -651,7 +674,7 @@ ENTRY(secondary_startup) adrp x26, swapper_pg_dir bl __cpu_setup // initialise processor - ldr x8, =KIMAGE_VADDR + ldr x8, kimage_vaddr ldr w9, 0f sub x27, x8, w9, sxtw // address to jump to after enabling the MMU b __enable_mmu @@ -684,6 +707,7 @@ ENDPROC(__secondary_switched) */ .section ".idmap.text", "ax" __enable_mmu: + mrs x18, sctlr_el1 // preserve old SCTLR_EL1 value mrs x1, ID_AA64MMFR0_EL1 ubfx x2, x1, #ID_AA64MMFR0_TGRAN_SHIFT, 4 cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED @@ -701,6 +725,25 @@ __enable_mmu: ic iallu dsb nsh isb +#ifdef CONFIG_RANDOMIZE_BASE + mov x19, x0 // preserve new SCTLR_EL1 value + blr x27 + + /* + * If we return here, we have a KASLR displacement in x23 which we need + * to take into account by discarding the current kernel mapping and + * creating a new one. + */ + msr sctlr_el1, x18 // disable the MMU + isb + bl __create_page_tables // recreate kernel mapping + + msr sctlr_el1, x19 // re-enable the MMU + isb + ic ialluis // flush instructions fetched + isb // via old mapping + add x27, x27, x23 // relocated __mmap_switched +#endif br x27 ENDPROC(__enable_mmu) diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c new file mode 100644 index 000000000000..8b32a1f8f09f --- /dev/null +++ b/arch/arm64/kernel/kaslr.c @@ -0,0 +1,173 @@ +/* + * Copyright (C) 2016 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +u64 __read_mostly module_alloc_base; + +static __init u64 get_kaslr_seed(void *fdt) +{ + int node, len; + u64 *prop; + u64 ret; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return 0; + + prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); + if (!prop || len != sizeof(u64)) + return 0; + + ret = fdt64_to_cpu(*prop); + *prop = 0; + return ret; +} + +static __init const u8 *get_cmdline(void *fdt) +{ + static __initconst const u8 default_cmdline[] = CONFIG_CMDLINE; + + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { + int node; + const u8 *prop; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + goto out; + + prop = fdt_getprop(fdt, node, "bootargs", NULL); + if (!prop) + goto out; + return prop; + } +out: + return default_cmdline; +} + +extern void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, + pgprot_t prot); + +/* + * This routine will be executed with the kernel mapped at its default virtual + * address, and if it returns successfully, the kernel will be remapped, and + * start_kernel() will be executed from a randomized virtual offset. The + * relocation will result in all absolute references (e.g., static variables + * containing function pointers) to be reinitialized, and zero-initialized + * .bss variables will be reset to 0. + */ +u64 __init kaslr_early_init(u64 dt_phys) +{ + void *fdt; + u64 seed, offset, mask, module_range; + const u8 *cmdline, *str; + int size; + + /* + * Set a reasonable default for module_alloc_base in case + * we end up running with module randomization disabled. + */ + module_alloc_base = (u64)_etext - MODULES_VSIZE; + + /* + * Try to map the FDT early. If this fails, we simply bail, + * and proceed with KASLR disabled. We will make another + * attempt at mapping the FDT in setup_machine() + */ + early_fixmap_init(); + fdt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); + if (!fdt) + return 0; + + /* + * Retrieve (and wipe) the seed from the FDT + */ + seed = get_kaslr_seed(fdt); + if (!seed) + return 0; + + /* + * Check if 'nokaslr' appears on the command line, and + * return 0 if that is the case. + */ + cmdline = get_cmdline(fdt); + str = strstr(cmdline, "nokaslr"); + if (str == cmdline || (str > cmdline && *(str - 1) == ' ')) + return 0; + + /* + * OK, so we are proceeding with KASLR enabled. Calculate a suitable + * kernel image offset from the seed. Let's place the kernel in the + * lower half of the VMALLOC area (VA_BITS - 2). + * Even if we could randomize at page granularity for 16k and 64k pages, + * let's always round to 2 MB so we don't interfere with the ability to + * map using contiguous PTEs + */ + mask = ((1UL << (VA_BITS - 2)) - 1) & ~(SZ_2M - 1); + offset = seed & mask; + + /* + * The kernel Image should not extend across a 1GB/32MB/512MB alignment + * boundary (for 4KB/16KB/64KB granule kernels, respectively). If this + * happens, increase the KASLR offset by the size of the kernel image. + */ + if ((((u64)_text + offset) >> SWAPPER_TABLE_SHIFT) != + (((u64)_end + offset) >> SWAPPER_TABLE_SHIFT)) + offset = (offset + (u64)(_end - _text)) & mask; + + if (IS_ENABLED(CONFIG_KASAN)) + /* + * KASAN does not expect the module region to intersect the + * vmalloc region, since shadow memory is allocated for each + * module at load time, whereas the vmalloc region is shadowed + * by KASAN zero pages. So keep modules out of the vmalloc + * region if KASAN is enabled. 
+ */ + return offset; + + if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) { + /* + * Randomize the module region independently from the core + * kernel. This prevents modules from leaking any information + * about the address of the kernel itself, but results in + * branches between modules and the core kernel that are + * resolved via PLTs. (Branches between modules will be + * resolved normally.) + */ + module_range = VMALLOC_END - VMALLOC_START - MODULES_VSIZE; + module_alloc_base = VMALLOC_START; + } else { + /* + * Randomize the module region by setting module_alloc_base to + * a PAGE_SIZE multiple in the range [_etext - MODULES_VSIZE, + * _stext) . This guarantees that the resulting region still + * covers [_stext, _etext], and that all relative branches can + * be resolved without veneers. + */ + module_range = MODULES_VSIZE - (u64)(_etext - _stext); + module_alloc_base = (u64)_etext + offset - MODULES_VSIZE; + } + + /* use the lower 21 bits to randomize the base of the module region */ + module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21; + module_alloc_base &= PAGE_MASK; + + return offset; +} diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index a9dde97f5ca5..7f316982ce00 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -34,7 +34,8 @@ void *module_alloc(unsigned long size) { void *p; - p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, + p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, + module_alloc_base + MODULES_VSIZE, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 079db3ba21c4..29b8c247d56f 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -388,3 +388,32 @@ static int __init topology_init(void) return 0; } subsys_initcall(topology_init); + +/* + * Dump out kernel offset information on panic. + */ +static int dump_kernel_offset(struct notifier_block *self, unsigned long v, + void *p) +{ + u64 const kaslr_offset = kimage_vaddr - KIMAGE_VADDR; + + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_offset > 0) { + pr_emerg("Kernel Offset: 0x%llx from 0x%lx\n", + kaslr_offset, KIMAGE_VADDR); + } else { + pr_emerg("Kernel Offset: disabled\n"); + } + return 0; +} + +static struct notifier_block kernel_offset_notifier = { + .notifier_call = dump_kernel_offset +}; + +static int __init register_kernel_offset_dumper(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &kernel_offset_notifier); + return 0; +} +__initcall(register_kernel_offset_dumper); diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 7f10cc91fa8a..56e19d150c21 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -129,12 +129,16 @@ static void __init clear_pgds(unsigned long start, void __init kasan_init(void) { u64 kimg_shadow_start, kimg_shadow_end; + u64 mod_shadow_start, mod_shadow_end; struct memblock_region *reg; int i; kimg_shadow_start = (u64)kasan_mem_to_shadow(_text); kimg_shadow_end = (u64)kasan_mem_to_shadow(_end); + mod_shadow_start = (u64)kasan_mem_to_shadow((void *)MODULES_VADDR); + mod_shadow_end = (u64)kasan_mem_to_shadow((void *)MODULES_END); + /* * We are going to perform proper setup of shadow memory. * At first we should unmap early shadow (clear_pgds() call bellow). @@ -158,13 +162,20 @@ void __init kasan_init(void) * with PMD table mappings at the edges of the shadow region for the * kernel image. 
*/ - if (ARM64_SWAPPER_USES_SECTION_MAPS) + if (ARM64_SWAPPER_USES_SECTION_MAPS) { + kimg_shadow_start = round_down(kimg_shadow_start, + SWAPPER_BLOCK_SIZE); kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); + } kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, - kasan_mem_to_shadow((void *)MODULES_VADDR)); + (void *)mod_shadow_start); kasan_populate_zero_shadow((void *)kimg_shadow_end, - kasan_mem_to_shadow((void *)PAGE_OFFSET)); + kasan_mem_to_shadow((void *)PAGE_OFFSET)); + + if (kimg_shadow_start > mod_shadow_end) + kasan_populate_zero_shadow((void *)mod_shadow_end, + (void *)kimg_shadow_start); for_each_memblock(memory, reg) { void *start = (void *)__phys_to_virt(reg->base); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8577437b6c0a..f4d1da0c1531 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -676,7 +676,8 @@ void __init early_fixmap_init(void) unsigned long addr = FIXADDR_START; pgd = pgd_offset_k(addr); - if (CONFIG_PGTABLE_LEVELS > 3 && !pgd_none(*pgd)) { + if (CONFIG_PGTABLE_LEVELS > 3 && + !(pgd_none(*pgd) || pgd_page_paddr(*pgd) == __pa(bm_pud))) { /* * We only end up here if the kernel mapping and the fixmap * share the top level pgd entry, which should only happen on @@ -733,11 +734,10 @@ void __set_fixmap(enum fixed_addresses idx, } } -void *__init fixmap_remap_fdt(phys_addr_t dt_phys) +void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot) { const u64 dt_virt_base = __fix_to_virt(FIX_FDT); - pgprot_t prot = PAGE_KERNEL_RO; - int size, offset; + int offset; void *dt_virt; /* @@ -776,16 +776,27 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) if (fdt_check_header(dt_virt) != 0) return NULL; - size = fdt_totalsize(dt_virt); - if (size > MAX_FDT_SIZE) + *size = fdt_totalsize(dt_virt); + if (*size > MAX_FDT_SIZE) return NULL; - if (offset + size > SWAPPER_BLOCK_SIZE) + if (offset + *size > SWAPPER_BLOCK_SIZE) create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base, - round_up(offset + size, SWAPPER_BLOCK_SIZE), prot); + round_up(offset + *size, SWAPPER_BLOCK_SIZE), prot); + + return dt_virt; +} + +void *__init fixmap_remap_fdt(phys_addr_t dt_phys) +{ + void *dt_virt; + int size; + + dt_virt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO); + if (!dt_virt) + return NULL; memblock_reserve(dt_phys, size); - return dt_virt; } From 09cbceb939d85d7d25b8e424445a2f83cac97c9d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 26 Feb 2016 17:57:13 +0100 Subject: [PATCH 0440/3220] UPSTREAM: arm64: vmemmap: use virtual projection of linear region Commit dd006da21646 ("arm64: mm: increase VA range of identity map") made some changes to the memory mapping code to allow physical memory to reside at an offset that exceeds the size of the virtual mapping. However, since the size of the vmemmap area is proportional to the size of the VA area, but it is populated relative to the physical space, we may end up with the struct page array being mapped outside of the vmemmap region. For instance, on my Seattle A0 box, I can see the following output in the dmesg log. vmemmap : 0xffffffbdc0000000 - 0xffffffbfc0000000 ( 8 GB maximum) 0xffffffbfc0000000 - 0xffffffbfd0000000 ( 256 MB actual) We can fix this by deciding that the vmemmap region is not a projection of the physical space, but of the virtual space above PAGE_OFFSET, i.e., the linear region. This way, we are guaranteed that the vmemmap region is of sufficient size, and we can even reduce the size by half. 
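To put numbers on the halving, here is a stand-alone back-of-the-envelope calculation for the configuration in the dmesg excerpt above (VA_BITS=39, 4 KB pages). The 64-byte sizeof(struct page) is an assumption typical of kernels of this vintage, and the ALIGN(..., PUD_SIZE) rounding is ignored:

#include <stdio.h>

int main(void)
{
        const unsigned va_bits = 39, page_shift = 12;
        const unsigned long long sizeof_page = 64;      /* assumed */

        /* Before: one struct page per page of the whole VA space. */
        unsigned long long old_size = (1ULL << (va_bits - page_shift)) * sizeof_page;
        /* After: only the linear region (half the VA space) is covered. */
        unsigned long long new_size = (1ULL << (va_bits - page_shift - 1)) * sizeof_page;

        printf("old VMEMMAP_SIZE: %llu GB\n", old_size >> 30);  /* 8 GB */
        printf("new VMEMMAP_SIZE: %llu GB\n", new_size >> 30);  /* 4 GB */
        return 0;
}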
Cc: Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit dfd55ad85e4a7fbaa82df12467515ac3c81e8a3e) Signed-off-by: Jeff Vander Stoep Change-Id: I8112d910f9659941dab6de5b3791f395150c77f1 --- arch/arm64/include/asm/pgtable.h | 7 ++++--- arch/arm64/mm/init.c | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 9bf72b72b76d..d48ea8fc789d 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -34,18 +34,19 @@ /* * VMALLOC and SPARSEMEM_VMEMMAP ranges. * - * VMEMAP_SIZE: allows the whole VA space to be covered by a struct page array + * VMEMAP_SIZE: allows the whole linear region to be covered by a struct page array * (rounded up to PUD_SIZE). * VMALLOC_START: beginning of the kernel vmalloc space * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space, * fixed mappings and modules */ -#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE) +#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT - 1)) * sizeof(struct page), PUD_SIZE) #define VMALLOC_START (MODULES_END) #define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) -#define vmemmap ((struct page *)(VMALLOC_END + SZ_64K)) +#define VMEMMAP_START (VMALLOC_END + SZ_64K) +#define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) #define FIRST_USER_ADDRESS 0UL diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index e333f7f0b4f2..04c7e8ba47ce 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -365,8 +365,8 @@ void __init mem_init(void) MLK_ROUNDUP(_text, _etext), MLK_ROUNDUP(_sdata, _edata), #ifdef CONFIG_SPARSEMEM_VMEMMAP - MLG((unsigned long)vmemmap, - (unsigned long)vmemmap + VMEMMAP_SIZE), + MLG(VMEMMAP_START, + VMEMMAP_START + VMEMMAP_SIZE), MLM((unsigned long)virt_to_page(PAGE_OFFSET), (unsigned long)virt_to_page(high_memory)), #endif From ee97096e89bbb5f39614638bd32fded97eb08691 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 26 Feb 2016 17:57:14 +0100 Subject: [PATCH 0441/3220] UPSTREAM: arm64: mm: treat memstart_addr as a signed quantity Commit c031a4213c11 ("arm64: kaslr: randomize the linear region") implements randomization of the linear region, by subtracting a random multiple of PUD_SIZE from memstart_addr. This causes the virtual mapping of system RAM to move upwards in the linear region, and at the same time causes memstart_addr to assume a value which may be negative if the offset of system RAM in the physical space is smaller than its offset relative to PAGE_OFFSET in the virtual space. Since memstart_addr is effectively an offset now, redefine its type as s64 so that expressions involving shifting or division preserve its sign. 
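The sign-preservation issue is easy to reproduce in isolation. In this stand-alone sketch the offset value is invented; the point is that a logical (unsigned) shift of a negative offset yields a huge bogus result, while the arithmetic shift on s64 behaves as intended:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        int64_t memstart = -0x40000000;         /* invented negative offset */
        uint64_t unsigned_view = (uint64_t)memstart;

        /* Arithmetic shift (on the usual ABIs) keeps the sign: -262144. */
        printf("s64 >> 12: %lld\n", (long long)(memstart >> 12));
        /* Logical shift manufactures an enormous positive value. */
        printf("u64 >> 12: %llu\n", (unsigned long long)(unsigned_view >> 12));
        return 0;
}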
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 020d044f66874eba058ce8264fc550f3eca67879) Signed-off-by: Jeff Vander Stoep Change-Id: I0482ebc13baaa9005cf372795e656c2417be9d1c --- arch/arm64/include/asm/memory.h | 2 +- arch/arm64/mm/init.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 5f8667a99e41..12f8a00fb3f1 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -135,7 +135,7 @@ #include #include -extern phys_addr_t memstart_addr; +extern s64 memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. */ #define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; }) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 04c7e8ba47ce..3d75ff40c5ff 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -54,7 +54,7 @@ * executes, which assigns it its actual value. So use a default value * that cannot be mistaken for a real physical address. */ -phys_addr_t memstart_addr __read_mostly = ~0ULL; +s64 memstart_addr __read_mostly = -1; phys_addr_t arm64_dma_phys_limit __read_mostly; #ifdef CONFIG_BLK_DEV_INITRD @@ -183,7 +183,7 @@ void __init arm64_memblock_init(void) * linear mapping. Take care not to clip the kernel which may be * high in memory. */ - memblock_remove(max(memstart_addr + linear_region_size, __pa(_end)), + memblock_remove(max_t(u64, memstart_addr + linear_region_size, __pa(_end)), ULLONG_MAX); if (memblock_end_of_DRAM() > linear_region_size) memblock_remove(0, memblock_end_of_DRAM() - linear_region_size); From 22346f1e99e4ed88ed8f68d95875a92395c3fea4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 29 Jan 2016 11:59:03 +0100 Subject: [PATCH 0442/3220] BACKPORT: arm64: kaslr: randomize the linear region When KASLR is enabled (CONFIG_RANDOMIZE_BASE=y), and entropy has been provided by the bootloader, randomize the placement of RAM inside the linear region if sufficient space is available. For instance, on a 4KB granule/3 levels kernel, the linear region is 256 GB in size, and we can choose any 1 GB aligned offset that is far enough from the top of the address space to fit the distance between the start of the lowest memblock and the top of the highest memblock. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit c031a4213c11a5db475f528c182f7b3858df11db) Signed-off-by: Jeff Vander Stoep Change-Id: I272de8ee358351d95eacc7dc5f47600adec3e813 --- arch/arm64/kernel/kaslr.c | 4 ++++ arch/arm64/mm/init.c | 22 ++++++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 8b32a1f8f09f..582983920054 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -21,6 +21,7 @@ #include u64 __read_mostly module_alloc_base; +u16 __initdata memstart_offset_seed; static __init u64 get_kaslr_seed(void *fdt) { @@ -123,6 +124,9 @@ u64 __init kaslr_early_init(u64 dt_phys) mask = ((1UL << (VA_BITS - 2)) - 1) & ~(SZ_2M - 1); offset = seed & mask; + /* use the top 16 bits to randomize the linear region */ + memstart_offset_seed = seed >> 48; + /* * The kernel Image should not extend across a 1GB/32MB/512MB alignment * boundary (for 4KB/16KB/64KB granule kernels, respectively). 
If this diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 3d75ff40c5ff..8e3cf19e4f00 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -198,6 +198,23 @@ void __init arm64_memblock_init(void) memblock_add(__pa(_text), (u64)(_end - _text)); } + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + extern u16 memstart_offset_seed; + u64 range = linear_region_size - + (memblock_end_of_DRAM() - memblock_start_of_DRAM()); + + /* + * If the size of the linear region exceeds, by a sufficient + * margin, the size of the region that the available physical + * memory spans, randomize the linear region as well. + */ + if (memstart_offset_seed > 0 && range >= ARM64_MEMSTART_ALIGN) { + range = range / ARM64_MEMSTART_ALIGN + 1; + memstart_addr -= ARM64_MEMSTART_ALIGN * + ((range * memstart_offset_seed) >> 16); + } + } + /* * Register the kernel text, kernel data, initrd, and initial * pagetables with memblock. @@ -367,12 +384,13 @@ void __init mem_init(void) #ifdef CONFIG_SPARSEMEM_VMEMMAP MLG(VMEMMAP_START, VMEMMAP_START + VMEMMAP_SIZE), - MLM((unsigned long)virt_to_page(PAGE_OFFSET), + MLM((unsigned long)phys_to_page(memblock_start_of_DRAM()), (unsigned long)virt_to_page(high_memory)), #endif MLK(FIXADDR_START, FIXADDR_TOP), MLM(PCI_IO_START, PCI_IO_END), - MLM(PAGE_OFFSET, (unsigned long)high_memory)); + MLM(__phys_to_virt(memblock_start_of_DRAM()), + (unsigned long)high_memory)); #undef MLK #undef MLM From 39b0815e4872fe6eb20a7d1b42b551e341cb7167 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 10 Jan 2016 11:29:07 +0100 Subject: [PATCH 0443/3220] BACKPORT: efi: stub: implement efi_get_random_bytes() based on EFI_RNG_PROTOCOL This exposes the firmware's implementation of EFI_RNG_PROTOCOL via a new function efi_get_random_bytes(). Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit e4fbf4767440472f9d23b0f25a2b905e1c63b6a8) Signed-off-by: Jeff Vander Stoep Change-Id: Id46036b78c2efd223b6cd5488e512fd93e8f597d --- drivers/firmware/efi/libstub/Makefile | 2 +- drivers/firmware/efi/libstub/efistub.h | 3 +++ drivers/firmware/efi/libstub/random.c | 35 ++++++++++++++++++++++++++ include/linux/efi.h | 6 ++++- 4 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 drivers/firmware/efi/libstub/random.c diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index c0ddd1b8dca3..68bdd9f23e13 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -34,7 +34,7 @@ $(obj)/lib-%.o: $(srctree)/lib/%.c FORCE lib-$(CONFIG_EFI_ARMSTUB) += arm-stub.o fdt.o string.o \ $(patsubst %.c,lib-%.o,$(arm-deps)) -lib-$(CONFIG_ARM64) += arm64-stub.o +lib-$(CONFIG_ARM64) += arm64-stub.o random.o CFLAGS_arm64-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) # diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 6b6548fda089..206b7252b9d1 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -43,4 +43,7 @@ void efi_get_virtmap(efi_memory_desc_t *memory_map, unsigned long map_size, unsigned long desc_size, efi_memory_desc_t *runtime_map, int *count); +efi_status_t efi_get_random_bytes(efi_system_table_t *sys_table, + unsigned long size, u8 *out); + #endif diff --git a/drivers/firmware/efi/libstub/random.c b/drivers/firmware/efi/libstub/random.c new file mode 100644 index 000000000000..97941ee5954f --- /dev/null +++ 
b/drivers/firmware/efi/libstub/random.c @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2016 Linaro Ltd; + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include + +#include "efistub.h" + +struct efi_rng_protocol { + efi_status_t (*get_info)(struct efi_rng_protocol *, + unsigned long *, efi_guid_t *); + efi_status_t (*get_rng)(struct efi_rng_protocol *, + efi_guid_t *, unsigned long, u8 *out); +}; + +efi_status_t efi_get_random_bytes(efi_system_table_t *sys_table_arg, + unsigned long size, u8 *out) +{ + efi_guid_t rng_proto = EFI_RNG_PROTOCOL_GUID; + efi_status_t status; + struct efi_rng_protocol *rng; + + status = efi_call_early(locate_protocol, &rng_proto, NULL, + (void **)&rng); + if (status != EFI_SUCCESS) + return status; + + return rng->get_rng(rng, NULL, size, out); +} diff --git a/include/linux/efi.h b/include/linux/efi.h index 569b5a866bb1..e747eb08b2be 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -299,7 +299,7 @@ typedef struct { void *open_protocol_information; void *protocols_per_handle; void *locate_handle_buffer; - void *locate_protocol; + efi_status_t (*locate_protocol)(efi_guid_t *, void *, void **); void *install_multiple_protocol_interfaces; void *uninstall_multiple_protocol_interfaces; void *calculate_crc32; @@ -599,6 +599,10 @@ void efi_native_runtime_setup(void); #define EFI_PROPERTIES_TABLE_GUID \ EFI_GUID( 0x880aaca3, 0x4adc, 0x4a04, 0x90, 0x79, 0xb7, 0x47, 0x34, 0x08, 0x25, 0xe5 ) +#define EFI_RNG_PROTOCOL_GUID \ + EFI_GUID(0x3152bca5, 0xeade, 0x433d, \ + 0x86, 0x2e, 0xc0, 0x1c, 0xdc, 0x29, 0x1f, 0x44) + typedef struct { efi_guid_t guid; u64 table; From 718645ea36329edb7d1c01871c13385fe1a2f03a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jan 2016 10:43:16 +0100 Subject: [PATCH 0444/3220] UPSTREAM: efi: stub: add implementation of efi_random_alloc() This implements efi_random_alloc(), which allocates a chunk of memory of a certain size at a certain alignment, and uses the random_seed argument it receives to randomize the address of the allocation. This is implemented by iterating over the UEFI memory map, counting the number of suitable slots (aligned offsets) within each region, and picking a random number between 0 and 'number of slots - 1' to select the slot. This should guarantee that each possible offset is equally likely to be chosen.
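For illustration, the two-pass selection described above can be modelled in ordinary user-space C. The sketch below is an approximation rather than the stub code itself: struct region stands in for an EFI memory descriptor, the RAM banks and the seed value are invented for the example, and the slot count is written in its simplest form rather than the patch's exact expression.

#include <stdio.h>
#include <stdint.h>

struct region {				/* stand-in for one EFI memory descriptor */
	uint64_t base;
	uint64_t size;
	uint64_t slots;			/* filled in by the first pass */
};

static uint64_t rnd_up(uint64_t x, uint64_t a)   { return (x + a - 1) & ~(a - 1); }
static uint64_t rnd_down(uint64_t x, uint64_t a) { return x & ~(a - 1); }

/* number of aligned start addresses that leave room for the allocation */
static uint64_t num_slots(const struct region *r, uint64_t size, uint64_t align)
{
	uint64_t start = rnd_up(r->base, align);
	uint64_t end = rnd_down(r->base + r->size - size, align);

	return (start > end) ? 0 : (end - start) / align + 1;
}

int main(void)
{
	struct region map[2] = {			/* invented RAM banks */
		{ 0x40000000, 0x20000000, 0 },
		{ 0x80000000, 0x40000000, 0 },
	};
	uint64_t size = 0x02000000, align = 0x200000;	/* 32 MB image, 2 MB slots */
	uint64_t seed = 0xbeef;				/* 16 bits of RNG output */
	uint64_t total = 0, target;
	int i;

	/* pass 1: count the suitable slots in each memory map entry */
	for (i = 0; i < 2; i++) {
		map[i].slots = num_slots(&map[i], size, align);
		total += map[i].slots;
	}

	/* scale the 16-bit seed into [0, total), as the patch does with >> 16 */
	target = (total * (seed & 0xffff)) >> 16;

	/* pass 2: walk the entries again until the residual index lands in one */
	for (i = 0; i < 2; i++) {
		if (target >= map[i].slots) {
			target -= map[i].slots;
			continue;
		}
		printf("allocate at %#llx\n", (unsigned long long)
		       (rnd_up(map[i].base, align) + target * align));
		break;
	}
	return 0;
}

Because target is drawn from [0, total), it always corresponds to exactly one recorded slot, so the second walk is guaranteed to terminate with an address.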
Suggested-by: Kees Cook Reviewed-by: Matt Fleming Reviewed-by: Kees Cook Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 2ddbfc81eac84a299cb4747a8764bc43f23e9008) Signed-off-by: Jeff Vander Stoep Change-Id: I8f59e3e91a71c752d69fd08ca43a890977c82919 --- drivers/firmware/efi/libstub/efistub.h | 4 + drivers/firmware/efi/libstub/random.c | 100 +++++++++++++++++++++++++ 2 files changed, 104 insertions(+) diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 206b7252b9d1..5ed3d3f38166 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -46,4 +46,8 @@ void efi_get_virtmap(efi_memory_desc_t *memory_map, unsigned long map_size, efi_status_t efi_get_random_bytes(efi_system_table_t *sys_table, unsigned long size, u8 *out); +efi_status_t efi_random_alloc(efi_system_table_t *sys_table_arg, + unsigned long size, unsigned long align, + unsigned long *addr, unsigned long random_seed); + #endif diff --git a/drivers/firmware/efi/libstub/random.c b/drivers/firmware/efi/libstub/random.c index 97941ee5954f..53f6d3fe6d86 100644 --- a/drivers/firmware/efi/libstub/random.c +++ b/drivers/firmware/efi/libstub/random.c @@ -33,3 +33,103 @@ efi_status_t efi_get_random_bytes(efi_system_table_t *sys_table_arg, return rng->get_rng(rng, NULL, size, out); } + +/* + * Return the number of slots covered by this entry, i.e., the number of + * addresses it covers that are suitably aligned and supply enough room + * for the allocation. + */ +static unsigned long get_entry_num_slots(efi_memory_desc_t *md, + unsigned long size, + unsigned long align) +{ + u64 start, end; + + if (md->type != EFI_CONVENTIONAL_MEMORY) + return 0; + + start = round_up(md->phys_addr, align); + end = round_down(md->phys_addr + md->num_pages * EFI_PAGE_SIZE - size, + align); + + if (start > end) + return 0; + + return (end - start + 1) / align; +} + +/* + * The UEFI memory descriptors have a virtual address field that is only used + * when installing the virtual mapping using SetVirtualAddressMap(). Since it + * is unused here, we can reuse it to keep track of each descriptor's slot + * count. + */ +#define MD_NUM_SLOTS(md) ((md)->virt_addr) + +efi_status_t efi_random_alloc(efi_system_table_t *sys_table_arg, + unsigned long size, + unsigned long align, + unsigned long *addr, + unsigned long random_seed) +{ + unsigned long map_size, desc_size, total_slots = 0, target_slot; + efi_status_t status; + efi_memory_desc_t *memory_map; + int map_offset; + + status = efi_get_memory_map(sys_table_arg, &memory_map, &map_size, + &desc_size, NULL, NULL); + if (status != EFI_SUCCESS) + return status; + + if (align < EFI_ALLOC_ALIGN) + align = EFI_ALLOC_ALIGN; + + /* count the suitable slots in each memory map entry */ + for (map_offset = 0; map_offset < map_size; map_offset += desc_size) { + efi_memory_desc_t *md = (void *)memory_map + map_offset; + unsigned long slots; + + slots = get_entry_num_slots(md, size, align); + MD_NUM_SLOTS(md) = slots; + total_slots += slots; + } + + /* find a random number between 0 and total_slots */ + target_slot = (total_slots * (u16)random_seed) >> 16; + + /* + * target_slot is now a value in the range [0, total_slots), and so + * it corresponds with exactly one of the suitable slots we recorded + * when iterating over the memory map the first time around. 
+ * + * So iterate over the memory map again, subtracting the number of + * slots of each entry at each iteration, until we have found the entry + * that covers our chosen slot. Use the residual value of target_slot + * to calculate the randomly chosen address, and allocate it directly + * using EFI_ALLOCATE_ADDRESS. + */ + for (map_offset = 0; map_offset < map_size; map_offset += desc_size) { + efi_memory_desc_t *md = (void *)memory_map + map_offset; + efi_physical_addr_t target; + unsigned long pages; + + if (target_slot >= MD_NUM_SLOTS(md)) { + target_slot -= MD_NUM_SLOTS(md); + continue; + } + + target = round_up(md->phys_addr, align) + target_slot * align; + pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; + + status = efi_call_early(allocate_pages, EFI_ALLOCATE_ADDRESS, + EFI_LOADER_DATA, pages, &target); + if (status == EFI_SUCCESS) + *addr = target; + break; + } + + efi_call_early(free_pool, memory_map); + + return status; +} From 3fafb1a67d5261cd8a0d9a9ad0316baf98d0bba8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jan 2016 11:47:49 +0100 Subject: [PATCH 0445/3220] UPSTREAM: efi: stub: use high allocation for converted command line Before we can move the command line processing before the allocation of the kernel, which is required for detecting the 'nokaslr' option which controls that allocation, move the converted command line higher up in memory, to prevent it from interfering with the kernel itself. Since x86 needs the address to fit in 32 bits, use UINT_MAX as the upper bound there. Otherwise, use ULONG_MAX (i.e., no limit) Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 48fcb2d0216103d15306caa4814e2381104df6d8) Signed-off-by: Jeff Vander Stoep Change-Id: Ie959355658d3f2f1819bee842c77cc5eef54b8e7 --- arch/x86/include/asm/efi.h | 2 ++ drivers/firmware/efi/libstub/efi-stub-helper.c | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 0010c78c4998..08b1f2f6ea50 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -25,6 +25,8 @@ #define EFI32_LOADER_SIGNATURE "EL32" #define EFI64_LOADER_SIGNATURE "EL64" +#define MAX_CMDLINE_ADDRESS UINT_MAX + #ifdef CONFIG_X86_32 diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index f07d4a67fa76..29ed2f9b218c 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -649,6 +649,10 @@ static u8 *efi_utf16_to_utf8(u8 *dst, const u16 *src, int n) return dst; } +#ifndef MAX_CMDLINE_ADDRESS +#define MAX_CMDLINE_ADDRESS ULONG_MAX +#endif + /* * Convert the unicode UEFI command line to ASCII to pass to kernel. * Size of memory allocated return in *cmd_line_len. 
@@ -684,7 +688,8 @@ char *efi_convert_cmdline(efi_system_table_t *sys_table_arg, options_bytes++; /* NUL termination */ - status = efi_low_alloc(sys_table_arg, options_bytes, 0, &cmdline_addr); + status = efi_high_alloc(sys_table_arg, options_bytes, 0, + &cmdline_addr, MAX_CMDLINE_ADDRESS); if (status != EFI_SUCCESS) return NULL; From 4de18b705de63c094a0c23754337c750d072ade9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 26 Jan 2016 14:48:29 +0100 Subject: [PATCH 0446/3220] UPSTREAM: arm64: efi: invoke EFI_RNG_PROTOCOL to supply KASLR randomness Since arm64 does not use a decompressor that supplies an execution environment where it is feasible to some extent to provide a source of randomness, the arm64 KASLR kernel depends on the bootloader to supply some random bits in the /chosen/kaslr-seed DT property upon kernel entry. On UEFI systems, we can use the EFI_RNG_PROTOCOL, if supplied, to obtain some random bits. At the same time, use it to randomize the offset of the kernel Image in physical memory. Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 2b5fe07a78a09a32002642b8a823428ade611f16) Signed-off-by: Jeff Vander Stoep Change-Id: I9cb7ae5727dfdf3726b1c9544bce74722ec77bbd --- arch/arm64/Kconfig | 5 ++ drivers/firmware/efi/libstub/arm-stub.c | 40 +++++++---- drivers/firmware/efi/libstub/arm64-stub.c | 84 ++++++++++++++++------- drivers/firmware/efi/libstub/fdt.c | 14 ++++ 4 files changed, 105 insertions(+), 38 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a42e3235484b..0b5ec89a8ec4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -794,6 +794,11 @@ config RANDOMIZE_BASE It is the bootloader's job to provide entropy, by passing a random u64 value in /chosen/kaslr-seed at kernel entry. + When booting via the UEFI stub, it will invoke the firmware's + EFI_RNG_PROTOCOL implementation (if available) to supply entropy + to the kernel proper. In addition, it will randomise the physical + location of the kernel Image as well. + If unsure, say N. 
config RANDOMIZE_MODULE_REGION_FULL diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index 950c87f5d279..d5aa1d16154f 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -18,6 +18,8 @@ #include "efistub.h" +bool __nokaslr; + static int efi_secureboot_enabled(efi_system_table_t *sys_table_arg) { static efi_guid_t const var_guid = EFI_GLOBAL_VARIABLE_GUID; @@ -207,14 +209,6 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, pr_efi_err(sys_table, "Failed to find DRAM base\n"); goto fail; } - status = handle_kernel_image(sys_table, image_addr, &image_size, - &reserve_addr, - &reserve_size, - dram_base, image); - if (status != EFI_SUCCESS) { - pr_efi_err(sys_table, "Failed to relocate kernel\n"); - goto fail; - } /* * Get the command line from EFI, using the LOADED_IMAGE @@ -224,7 +218,28 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, cmdline_ptr = efi_convert_cmdline(sys_table, image, &cmdline_size); if (!cmdline_ptr) { pr_efi_err(sys_table, "getting command line via LOADED_IMAGE_PROTOCOL\n"); - goto fail_free_image; + goto fail; + } + + /* check whether 'nokaslr' was passed on the command line */ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + static const u8 default_cmdline[] = CONFIG_CMDLINE; + const u8 *str, *cmdline = cmdline_ptr; + + if (IS_ENABLED(CONFIG_CMDLINE_FORCE)) + cmdline = default_cmdline; + str = strstr(cmdline, "nokaslr"); + if (str == cmdline || (str > cmdline && *(str - 1) == ' ')) + __nokaslr = true; + } + + status = handle_kernel_image(sys_table, image_addr, &image_size, + &reserve_addr, + &reserve_size, + dram_base, image); + if (status != EFI_SUCCESS) { + pr_efi_err(sys_table, "Failed to relocate kernel\n"); + goto fail_free_cmdline; } status = efi_parse_options(cmdline_ptr); @@ -244,7 +259,7 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, if (status != EFI_SUCCESS) { pr_efi_err(sys_table, "Failed to load device tree!\n"); - goto fail_free_cmdline; + goto fail_free_image; } } @@ -286,12 +301,11 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, efi_free(sys_table, initrd_size, initrd_addr); efi_free(sys_table, fdt_size, fdt_addr); -fail_free_cmdline: - efi_free(sys_table, cmdline_size, (unsigned long)cmdline_ptr); - fail_free_image: efi_free(sys_table, image_size, *image_addr); efi_free(sys_table, reserve_size, reserve_addr); +fail_free_cmdline: + efi_free(sys_table, cmdline_size, (unsigned long)cmdline_ptr); fail: return EFI_ERROR; } diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 78dfbd34b6bf..e0e6b74fef8f 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -13,6 +13,10 @@ #include #include +#include "efistub.h" + +extern bool __nokaslr; + efi_status_t __init handle_kernel_image(efi_system_table_t *sys_table_arg, unsigned long *image_addr, unsigned long *image_size, @@ -23,26 +27,52 @@ efi_status_t __init handle_kernel_image(efi_system_table_t *sys_table_arg, { efi_status_t status; unsigned long kernel_size, kernel_memsize = 0; - unsigned long nr_pages; void *old_image_addr = (void *)*image_addr; unsigned long preferred_offset; + u64 phys_seed = 0; + + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + if (!__nokaslr) { + status = efi_get_random_bytes(sys_table_arg, + sizeof(phys_seed), + (u8 *)&phys_seed); + if (status == EFI_NOT_FOUND) { + pr_efi(sys_table_arg, "EFI_RNG_PROTOCOL 
unavailable, no randomness supplied\n"); + } else if (status != EFI_SUCCESS) { + pr_efi_err(sys_table_arg, "efi_get_random_bytes() failed\n"); + return status; + } + } else { + pr_efi(sys_table_arg, "KASLR disabled on kernel command line\n"); + } + } /* * The preferred offset of the kernel Image is TEXT_OFFSET bytes beyond * a 2 MB aligned base, which itself may be lower than dram_base, as * long as the resulting offset equals or exceeds it. */ - preferred_offset = round_down(dram_base, SZ_2M) + TEXT_OFFSET; + preferred_offset = round_down(dram_base, MIN_KIMG_ALIGN) + TEXT_OFFSET; if (preferred_offset < dram_base) - preferred_offset += SZ_2M; + preferred_offset += MIN_KIMG_ALIGN; - /* Relocate the image, if required. */ kernel_size = _edata - _text; - if (*image_addr != preferred_offset) { - kernel_memsize = kernel_size + (_end - _edata); + kernel_memsize = kernel_size + (_end - _edata); + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && phys_seed != 0) { /* - * First, try a straight allocation at the preferred offset. + * If KASLR is enabled, and we have some randomness available, + * locate the kernel at a randomized offset in physical memory. + */ + *reserve_size = kernel_memsize + TEXT_OFFSET; + status = efi_random_alloc(sys_table_arg, *reserve_size, + MIN_KIMG_ALIGN, reserve_addr, + phys_seed); + + *image_addr = *reserve_addr + TEXT_OFFSET; + } else { + /* + * Else, try a straight allocation at the preferred offset. * This will work around the issue where, if dram_base == 0x0, * efi_low_alloc() refuses to allocate at 0x0 (to prevent the * address of the allocation to be mistaken for a FAIL return @@ -52,27 +82,31 @@ efi_status_t __init handle_kernel_image(efi_system_table_t *sys_table_arg, * Mustang), we can still place the kernel at the address * 'dram_base + TEXT_OFFSET'. 
*/ - *image_addr = *reserve_addr = preferred_offset; - nr_pages = round_up(kernel_memsize, EFI_ALLOC_ALIGN) / - EFI_PAGE_SIZE; - status = efi_call_early(allocate_pages, EFI_ALLOCATE_ADDRESS, - EFI_LOADER_DATA, nr_pages, - (efi_physical_addr_t *)reserve_addr); - if (status != EFI_SUCCESS) { - kernel_memsize += TEXT_OFFSET; - status = efi_low_alloc(sys_table_arg, kernel_memsize, - SZ_2M, reserve_addr); + if (*image_addr == preferred_offset) + return EFI_SUCCESS; - if (status != EFI_SUCCESS) { - pr_efi_err(sys_table_arg, "Failed to relocate kernel\n"); - return status; - } - *image_addr = *reserve_addr + TEXT_OFFSET; - } - memcpy((void *)*image_addr, old_image_addr, kernel_size); - *reserve_size = kernel_memsize; + *image_addr = *reserve_addr = preferred_offset; + *reserve_size = round_up(kernel_memsize, EFI_ALLOC_ALIGN); + + status = efi_call_early(allocate_pages, EFI_ALLOCATE_ADDRESS, + EFI_LOADER_DATA, + *reserve_size / EFI_PAGE_SIZE, + (efi_physical_addr_t *)reserve_addr); } + if (status != EFI_SUCCESS) { + *reserve_size = kernel_memsize + TEXT_OFFSET; + status = efi_low_alloc(sys_table_arg, *reserve_size, + MIN_KIMG_ALIGN, reserve_addr); + + if (status != EFI_SUCCESS) { + pr_efi_err(sys_table_arg, "Failed to relocate kernel\n"); + *reserve_size = 0; + return status; + } + *image_addr = *reserve_addr + TEXT_OFFSET; + } + memcpy((void *)*image_addr, old_image_addr, kernel_size); return EFI_SUCCESS; } diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c index b62e2f5dcab3..b1c22cf18f7d 100644 --- a/drivers/firmware/efi/libstub/fdt.c +++ b/drivers/firmware/efi/libstub/fdt.c @@ -147,6 +147,20 @@ efi_status_t update_fdt(efi_system_table_t *sys_table, void *orig_fdt, if (status) goto fdt_set_fail; + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + efi_status_t efi_status; + + efi_status = efi_get_random_bytes(sys_table, sizeof(fdt_val64), + (u8 *)&fdt_val64); + if (efi_status == EFI_SUCCESS) { + status = fdt_setprop(fdt, node, "kaslr-seed", + &fdt_val64, sizeof(fdt_val64)); + if (status) + goto fdt_set_fail; + } else if (efi_status != EFI_NOT_FOUND) { + return efi_status; + } + } return EFI_SUCCESS; fdt_set_fail: From a2f0639adbc63164137683f6d0c4deb88406290d Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 11 Feb 2016 13:53:10 -0800 Subject: [PATCH 0447/3220] UPSTREAM: arm64: make irq_stack_ptr more robust Switching between stacks is only valid if we are tracing ourselves while on the irq_stack, so it is only valid when in current and non-preemptible context, otherwise it is just zeroed off. Fixes: 132cd887b5c5 ("arm64: Modify stack trace and dump for use with irq_stack") Acked-by: James Morse Tested-by: James Morse Signed-off-by: Yang Shi Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a80a0eb70c358f8c7dda4bb62b2278dc6285217b) Signed-off-by: Jeff Vander Stoep Change-Id: I431d3d5e8e1f556ddfef283af88dd2f63b825f7c --- arch/arm64/kernel/stacktrace.c | 13 ++++++------- arch/arm64/kernel/traps.c | 11 ++++++++++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 4fad9787ab46..cfd46c227c8c 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -44,14 +44,13 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) unsigned long irq_stack_ptr; /* - * Use raw_smp_processor_id() to avoid false-positives from - * CONFIG_DEBUG_PREEMPT.
get_wchan() calls unwind_frame() on sleeping - * task stacks, we can be pre-empted in this case, so - * {raw_,}smp_processor_id() may give us the wrong value. Sleeping - * tasks can't ever be on an interrupt stack, so regardless of cpu, - * the checks will always fail. + * Switching between stacks is valid when tracing current and in + * non-preemptible context. */ - irq_stack_ptr = IRQ_STACK_PTR(raw_smp_processor_id()); + if (tsk == current && !preemptible()) + irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); + else + irq_stack_ptr = 0; low = frame->sp; /* irq stacks are not THREAD_SIZE aligned */ diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index cbedd724f48e..c5392081b49b 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -146,9 +146,18 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; - unsigned long irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); + unsigned long irq_stack_ptr; int skip; + /* + * Switching between stacks is valid when tracing current and in + * non-preemptible context. + */ + if (tsk == current && !preemptible()) + irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); + else + irq_stack_ptr = 0; + pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); if (!tsk) From 46debe0769bb9597eaeb460f8dd0c3a85ad76af1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Mar 2016 15:22:55 +0000 Subject: [PATCH 0448/3220] UPSTREAM: arm64: hugetlb: partial revert of 66b3923a1a0f Commit 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit") introduced support for huge pages using the contiguous bit in the PTE as opposed to block mappings, which may be slightly unwieldy (512M) in 64k page configurations. Unfortunately, this support has resulted in some late regressions when running the libhugetlbfs test suite with 64k pages and CONFIG_DEBUG_VM as a result of a BUG: | readback (2M: 64): ------------[ cut here ]------------ | kernel BUG at fs/hugetlbfs/inode.c:446! | Internal error: Oops - BUG: 0 [#1] SMP | Modules linked in: | CPU: 7 PID: 1448 Comm: readback Not tainted 4.5.0-rc7 #148 | Hardware name: linux,dummy-virt (DT) | task: fffffe0040964b00 ti: fffffe00c2668000 task.ti: fffffe00c2668000 | PC is at remove_inode_hugepages+0x44c/0x480 | LR is at remove_inode_hugepages+0x264/0x480 Rather than revert the entire patch, simply avoid advertising the contiguous huge page sizes for now while people are actively working on a fix. This patch can then be reverted once things have been sorted out. 
Cc: David Woods Reported-by: Steve Capper Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit ff7925848b50050732ac0401e0acf27e8b241d7b) Signed-off-by: Jeff Vander Stoep Change-Id: I3a9751fa79b2d2871dbdc06ea1aa3d1336bb4f4f --- arch/arm64/mm/hugetlbpage.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 82d607c3614e..da30529bb1f6 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -306,10 +306,6 @@ static __init int setup_hugepagesz(char *opt) hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); } else if (ps == PUD_SIZE) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); - } else if (ps == (PAGE_SIZE * CONT_PTES)) { - hugetlb_add_hstate(CONT_PTE_SHIFT); - } else if (ps == (PMD_SIZE * CONT_PMDS)) { - hugetlb_add_hstate((PMD_SHIFT + CONT_PMD_SHIFT) - PAGE_SHIFT); } else { pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); return 0; @@ -317,13 +313,3 @@ static __init int setup_hugepagesz(char *opt) return 1; } __setup("hugepagesz=", setup_hugepagesz); - -#ifdef CONFIG_ARM64_64K_PAGES -static __init int add_default_hugepagesz(void) -{ - if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL) - hugetlb_add_hstate(CONT_PMD_SHIFT); - return 0; -} -arch_initcall(add_default_hugepagesz); -#endif From b6d54c720c294dd973ddff2d111a4f3a0d35dc74 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 15 Mar 2016 11:22:57 +0000 Subject: [PATCH 0449/3220] UPSTREAM: arm64: fix KASLR boot-time I-cache maintenance Commit f80fb3a3d50843a4 ("arm64: add support for kernel ASLR") missed a DSB necessary to complete I-cache maintenance in the primary boot path, and hence stale instructions may still be present in the I-cache and may be executed until the I-cache maintenance naturally completes. Since commit 8ec41987436d566f ("arm64: mm: ensure patched kernel text is fetched from PoU"), all CPUs invalidate their I-caches after their MMU is enabled. Prior to a CPU's MMU having been enabled, arbitrary lines may have been fetched from the PoC into I-caches. We never patch text expected to be executed with the MMU off. Thus, it is unnecessary to perform broadcast I-cache maintenance in the primary boot path. This patch reduces the scope of the I-cache maintenance to the local CPU, and adds the missing DSB with similar scope, matching prior maintenance in the primary boot path.
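The local-scope sequence the patch adopts can be sketched as a C helper with GCC inline assembly. This is purely illustrative and assumes an AArch64 compiler; the kernel performs the sequence in head.S, not from C:

static inline void local_icache_inval_all(void)
{
	asm volatile(
	"	ic	iallu\n"	/* invalidate this CPU's I-cache only */
	"	dsb	nsh\n"		/* complete the maintenance, non-shareable scope */
	"	isb\n"			/* resynchronise the instruction stream */
	: : : "memory");
}

The pairing matters: without the dsb nsh, the ic iallu may still be in flight when execution continues, which is exactly the window the missing DSB left open.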
Signed-off-by: Mark Rutland Acked-by: Ard Biesheuvel Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit b90b4a608ea2401cc491828f7a385edd2e236e37) Signed-off-by: Jeff Vander Stoep Change-Id: Ic66b5fec29867b86782ad6c3243642afc1f40080 --- arch/arm64/kernel/head.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 319f896c6e74..a88a15447c3b 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -740,8 +740,9 @@ __enable_mmu: msr sctlr_el1, x19 // re-enable the MMU isb - ic ialluis // flush instructions fetched - isb // via old mapping + ic iallu // flush instructions fetched + dsb nsh // via old mapping + isb add x27, x27, x23 // relocated __mmap_switched #endif br x27 From 25fd795692e05141f36099f778c054a86d586812 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 22 Mar 2016 10:11:45 +0000 Subject: [PATCH 0450/3220] UPSTREAM: arm64: consistently use p?d_set_huge Commit 324420bf91f60582 ("arm64: add support for ioremap() block mappings") added new p?d_set_huge functions which do the hard work to generate and set a correct block entry. These differ from open-coded huge page creation in the early page table code by explicitly setting the P?D_TYPE_SECT bits (which are implicitly retained by mk_sect_prot() for any valid prot), but are otherwise identical (and cannot fail on arm64). For simplicity and consistency, make use of these in the initial page table creation code. Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit c661cb1c537e2364bfdabb298fb934fd77445e98) Signed-off-by: Jeff Vander Stoep Change-Id: I25e58a1626831c2c709abcded989d1770fea851c --- arch/arm64/mm/mmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index f4d1da0c1531..72e634533a3f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -211,8 +211,7 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, if (((addr | next | phys) & ~SECTION_MASK) == 0 && block_mappings_allowed(pgtable_alloc)) { pmd_t old_pmd =*pmd; - set_pmd(pmd, __pmd(phys | - pgprot_val(mk_sect_prot(prot)))); + pmd_set_huge(pmd, phys, prot); /* * Check for previous table entries created during * boot (__create_page_tables) and flush them. @@ -272,8 +271,7 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, if (use_1G_block(addr, next, phys) && block_mappings_allowed(pgtable_alloc)) { pud_t old_pud = *pud; - set_pud(pud, __pud(phys | - pgprot_val(mk_sect_prot(prot)))); + pud_set_huge(pud, phys, prot); /* * If we have an old value for a pud, it will From 1c4143d40cec48cac343bf1e8a426746ac240f92 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 10 Mar 2016 18:41:16 +0000 Subject: [PATCH 0451/3220] UPSTREAM: arm64: kasan: Fix zero shadow mapping overriding kernel image shadow With the 16KB and 64KB page size configurations, SWAPPER_BLOCK_SIZE is PAGE_SIZE and ARM64_SWAPPER_USES_SECTION_MAPS is 0.
Since kimg_shadow_end is not page aligned (_end shifted by KASAN_SHADOW_SCALE_SHIFT), the edges of previously mapped kernel image shadow via vmemmap_populate() may be overridden by subsequent calls to kasan_populate_zero_shadow(), leading to kernel panics like below: ------------------------------------------------------------------------------ Unable to handle kernel paging request at virtual address fffffc100135068c pgd = fffffc8009ac0000 [fffffc100135068c] *pgd=00000009ffee0003, *pud=00000009ffee0003, *pmd=00000009ffee0003, *pte=00e0000081a00793 Internal error: Oops: 9600004f [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.5.0-rc4+ #1984 Hardware name: Juno (DT) task: fffffe09001a0000 ti: fffffe0900200000 task.ti: fffffe0900200000 PC is at __memset+0x4c/0x200 LR is at kasan_unpoison_shadow+0x34/0x50 pc : [] lr : [] pstate: 00000245 sp : fffffe0900203db0 x29: fffffe0900203db0 x28: 0000000000000000 x27: 0000000000000000 x26: 0000000000000000 x25: fffffc80099b69d0 x24: 0000000000000001 x23: 0000000000000000 x22: 0000000000002000 x21: dffffc8000000000 x20: 1fffff9001350a8c x19: 0000000000002000 x18: 0000000000000008 x17: 0000000000000147 x16: ffffffffffffffff x15: 79746972100e041d x14: ffffff0000000000 x13: ffff000000000000 x12: 0000000000000000 x11: 0101010101010101 x10: 1fffffc11c000000 x9 : 0000000000000000 x8 : fffffc100135068c x7 : 0000000000000000 x6 : 000000000000003f x5 : 0000000000000040 x4 : 0000000000000004 x3 : fffffc100134f651 x2 : 0000000000000400 x1 : 0000000000000000 x0 : fffffc100135068c Process swapper/0 (pid: 1, stack limit = 0xfffffe0900200020) Call trace: [] __memset+0x4c/0x200 [] __asan_register_globals+0x5c/0xb0 [] _GLOBAL__sub_I_65535_1_sunrpc_cache_lookup+0x1c/0x28 [] kernel_init_freeable+0x104/0x274 [] kernel_init+0x10/0xf8 [] ret_from_fork+0x10/0x50 ------------------------------------------------------------------------------ This patch aligns kimg_shadow_start and kimg_shadow_end to SWAPPER_BLOCK_SIZE in all configurations. Fixes: f9040773b7bb ("arm64: move kernel image to base of vmalloc area") Signed-off-by: Catalin Marinas Acked-by: Mark Rutland Acked-by: Ard Biesheuvel Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 2776e0e8ef683a42fe3e9a5facf576b73579700e) Signed-off-by: Jeff Vander Stoep Change-Id: I13a6b38aefbeddd20bc87cb1382f2787bbc5cf9c --- arch/arm64/mm/kasan_init.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 56e19d150c21..206dd95ea292 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -158,15 +158,12 @@ void __init kasan_init(void) * vmemmap_populate() has populated the shadow region that covers the * kernel image with SWAPPER_BLOCK_SIZE mappings, so we have to round * the start and end addresses to SWAPPER_BLOCK_SIZE as well, to prevent - * kasan_populate_zero_shadow() from replacing the PMD block mappings - * with PMD table mappings at the edges of the shadow region for the - * kernel image. + * kasan_populate_zero_shadow() from replacing the page table entries + * (PMD or PTE) at the edges of the shadow region for the kernel + * image. 
*/ - if (ARM64_SWAPPER_USES_SECTION_MAPS) { - kimg_shadow_start = round_down(kimg_shadow_start, - SWAPPER_BLOCK_SIZE); - kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); - } + kimg_shadow_start = round_down(kimg_shadow_start, SWAPPER_BLOCK_SIZE); + kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, (void *)mod_shadow_start); From f41890295d6496ed50bbe8606d23fb10661a844f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 2 Mar 2016 09:47:13 +0100 Subject: [PATCH 0452/3220] UPSTREAM: arm64: mm: check at build time that PAGE_OFFSET divides the VA space evenly Commit 8439e62a1561 ("arm64: mm: use bit ops rather than arithmetic in pa/va translations") changed the boundary check against PAGE_OFFSET from an arithmetic comparison to a bit test. This means we now silently assume that PAGE_OFFSET is a power of 2 that divides the kernel virtual address space into two equal halves. So make that assumption explicit. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 6d2aa549de1fc998581d216de3853aa131aa4446) Signed-off-by: Jeff Vander Stoep Change-Id: I8c3bc8cdb7d7f7dea092fd1a208b04583a141054 --- arch/arm64/mm/init.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 8e3cf19e4f00..2c38be3df4c8 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -172,6 +172,13 @@ void __init arm64_memblock_init(void) { const s64 linear_region_size = -(s64)PAGE_OFFSET; + /* + * Ensure that the linear region takes up exactly half of the kernel + * virtual address space. This way, we can distinguish a linear address + * from a kernel/module/vmalloc address by testing a single bit. + */ + BUILD_BUG_ON(linear_region_size != BIT(VA_BITS - 1)); + /* * Select a suitable value for the base of physical memory. */ From 391ba6bf3e866b9deb4b0f1d2f52c0ede0acce5a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 25 Feb 2016 20:48:53 +0100 Subject: [PATCH 0453/3220] UPSTREAM: arm64: lse: deal with clobbered IP registers after branch via PLT The LSE atomics implementation uses runtime patching to patch in calls to out of line non-LSE atomics implementations on cores that lack hardware support for LSE. To avoid paying the overhead cost of a function call even if no call ends up being made, the bl instruction is kept invisible to the compiler, and the out of line implementations preserve all registers, not just the ones that they are required to preserve as per the AAPCS64. However, commit fd045f6cd98e ("arm64: add support for module PLTs") added support for routing branch instructions via veneers if the branch target offset exceeds the range of the ordinary relative branch instructions. Since this deals with jump and call instructions that are exposed to ELF relocations, the PLT code uses x16 to hold the address of the branch target when it performs an indirect branch-to-register, something which is explicitly allowed by the AAPCS64 (and ordinary compiler generated code does not expect register x16 or x17 to retain their values across a bl instruction). Since the lse runtime patched bl instructions don't adhere to the AAPCS64, they don't deal with this clobbering of registers x16 and x17. So add them to the clobber list of the asm() statements that perform the call instructions, and drop x16 and x17 from the list of registers that are callee saved in the out of line non-LSE implementations. 
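A hedged sketch of the resulting asm() shape, heavily simplified from the real ATOMIC_OP macros (the callee symbol here just follows the __LL_SC_PREFIX naming convention, and the example assumes an AArch64 toolchain):

static inline void atomic_add_sketch(int i, int *counter)
{
	register int w0 asm("w0") = i;
	register int *x1 asm("x1") = counter;

	asm volatile(
	"	bl	__ll_sc_atomic_add\n"	/* may be routed via a PLT veneer */
	: "+r" (w0), "+Q" (*counter)
	: "r" (x1)
	: "x16", "x17", "x30");		/* i.e. the __LL_SC_CLOBBERS set */
}

Naming x16, x17 and x30 as clobbers keeps the compiler from carrying live values in the registers that a veneer, or the bl itself, may overwrite.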
In addition, since we have given these functions two scratch registers, they no longer need to stack/unstack temp registers. Signed-off-by: Ard Biesheuvel [will: factored clobber list into #define, updated Makefile comment] Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 5be8b70af1ca78cefb8b756d157532360a5fd663) Signed-off-by: Jeff Vander Stoep Change-Id: Ia44a54eba315a47a6b8aaa2259b444e0139162c0 --- arch/arm64/include/asm/atomic_lse.h | 38 ++++++++++++++--------------- arch/arm64/include/asm/lse.h | 1 + arch/arm64/lib/Makefile | 13 +++++----- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h index 197e06afbf71..39c1d340fec5 100644 --- a/arch/arm64/include/asm/atomic_lse.h +++ b/arch/arm64/include/asm/atomic_lse.h @@ -36,7 +36,7 @@ static inline void atomic_andnot(int i, atomic_t *v) " stclr %w[i], %[v]\n") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic_or(int i, atomic_t *v) @@ -48,7 +48,7 @@ static inline void atomic_or(int i, atomic_t *v) " stset %w[i], %[v]\n") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic_xor(int i, atomic_t *v) @@ -60,7 +60,7 @@ static inline void atomic_xor(int i, atomic_t *v) " steor %w[i], %[v]\n") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic_add(int i, atomic_t *v) @@ -72,7 +72,7 @@ static inline void atomic_add(int i, atomic_t *v) " stadd %w[i], %[v]\n") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } #define ATOMIC_OP_ADD_RETURN(name, mb, cl...) \ @@ -90,7 +90,7 @@ static inline int atomic_add_return##name(int i, atomic_t *v) \ " add %w[i], %w[i], w30") \ : [i] "+r" (w0), [v] "+Q" (v->counter) \ : "r" (x1) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS, ##cl); \ \ return w0; \ } @@ -116,7 +116,7 @@ static inline void atomic_and(int i, atomic_t *v) " stclr %w[i], %[v]") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic_sub(int i, atomic_t *v) @@ -133,7 +133,7 @@ static inline void atomic_sub(int i, atomic_t *v) " stadd %w[i], %[v]") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } #define ATOMIC_OP_SUB_RETURN(name, mb, cl...) 
\ @@ -153,7 +153,7 @@ static inline int atomic_sub_return##name(int i, atomic_t *v) \ " add %w[i], %w[i], w30") \ : [i] "+r" (w0), [v] "+Q" (v->counter) \ : "r" (x1) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS , ##cl); \ \ return w0; \ } @@ -177,7 +177,7 @@ static inline void atomic64_andnot(long i, atomic64_t *v) " stclr %[i], %[v]\n") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic64_or(long i, atomic64_t *v) @@ -189,7 +189,7 @@ static inline void atomic64_or(long i, atomic64_t *v) " stset %[i], %[v]\n") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic64_xor(long i, atomic64_t *v) @@ -201,7 +201,7 @@ static inline void atomic64_xor(long i, atomic64_t *v) " steor %[i], %[v]\n") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic64_add(long i, atomic64_t *v) @@ -213,7 +213,7 @@ static inline void atomic64_add(long i, atomic64_t *v) " stadd %[i], %[v]\n") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } #define ATOMIC64_OP_ADD_RETURN(name, mb, cl...) \ @@ -231,7 +231,7 @@ static inline long atomic64_add_return##name(long i, atomic64_t *v) \ " add %[i], %[i], x30") \ : [i] "+r" (x0), [v] "+Q" (v->counter) \ : "r" (x1) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS, ##cl); \ \ return x0; \ } @@ -257,7 +257,7 @@ static inline void atomic64_and(long i, atomic64_t *v) " stclr %[i], %[v]") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic64_sub(long i, atomic64_t *v) @@ -274,7 +274,7 @@ static inline void atomic64_sub(long i, atomic64_t *v) " stadd %[i], %[v]") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } #define ATOMIC64_OP_SUB_RETURN(name, mb, cl...) 
\ @@ -294,7 +294,7 @@ static inline long atomic64_sub_return##name(long i, atomic64_t *v) \ " add %[i], %[i], x30") \ : [i] "+r" (x0), [v] "+Q" (v->counter) \ : "r" (x1) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS, ##cl); \ \ return x0; \ } @@ -330,7 +330,7 @@ static inline long atomic64_dec_if_positive(atomic64_t *v) "2:") : [ret] "+&r" (x0), [v] "+Q" (v->counter) : - : "x30", "cc", "memory"); + : __LL_SC_CLOBBERS, "cc", "memory"); return x0; } @@ -359,7 +359,7 @@ static inline unsigned long __cmpxchg_case_##name(volatile void *ptr, \ " mov %" #w "[ret], " #w "30") \ : [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr) \ : [old] "r" (x1), [new] "r" (x2) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS, ##cl); \ \ return x0; \ } @@ -416,7 +416,7 @@ static inline long __cmpxchg_double##name(unsigned long old1, \ [v] "+Q" (*(unsigned long *)ptr) \ : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4), \ [oldval1] "r" (oldval1), [oldval2] "r" (oldval2) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS, ##cl); \ \ return x0; \ } diff --git a/arch/arm64/include/asm/lse.h b/arch/arm64/include/asm/lse.h index 3de42d68611d..23acc00be32d 100644 --- a/arch/arm64/include/asm/lse.h +++ b/arch/arm64/include/asm/lse.h @@ -26,6 +26,7 @@ __asm__(".arch_extension lse"); /* Macro for constructing calls to out-of-line ll/sc atomics */ #define __LL_SC_CALL(op) "bl\t" __stringify(__LL_SC_PREFIX(op)) "\n" +#define __LL_SC_CLOBBERS "x16", "x17", "x30" /* In-line patching at runtime */ #define ARM64_LSE_ATOMIC_INSN(llsc, lse) \ diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 1a811ecf71da..c86b7909ef31 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -4,15 +4,16 @@ lib-y := bitops.o clear_user.o delay.o copy_from_user.o \ memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \ strchr.o strrchr.o -# Tell the compiler to treat all general purpose registers as -# callee-saved, which allows for efficient runtime patching of the bl -# instruction in the caller with an atomic instruction when supported by -# the CPU. Result and argument registers are handled correctly, based on -# the function prototype. +# Tell the compiler to treat all general purpose registers (with the +# exception of the IP registers, which are already handled by the caller +# in case of a PLT) as callee-saved, which allows for efficient runtime +# patching of the bl instruction in the caller with an atomic instruction +# when supported by the CPU. Result and argument registers are handled +# correctly, based on the function prototype. lib-$(CONFIG_ARM64_LSE_ATOMICS) += atomic_ll_sc.o CFLAGS_atomic_ll_sc.o := -fcall-used-x0 -ffixed-x1 -ffixed-x2 \ -ffixed-x3 -ffixed-x4 -ffixed-x5 -ffixed-x6 \ -ffixed-x7 -fcall-saved-x8 -fcall-saved-x9 \ -fcall-saved-x10 -fcall-saved-x11 -fcall-saved-x12 \ -fcall-saved-x13 -fcall-saved-x14 -fcall-saved-x15 \ - -fcall-saved-x16 -fcall-saved-x17 -fcall-saved-x18 + -fcall-saved-x18 From 1d2e2e3c0f8f343e4e74cdd89fb94fc341578e64 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 10 Mar 2016 18:30:56 +0000 Subject: [PATCH 0454/3220] UPSTREAM: arm64: kasan: Use actual memory node when populating the kernel image shadow With the 16KB or 64KB page configurations, the generic vmemmap_populate() implementation warns on potential offnode page_structs via vmemmap_verify() because the arm64 kasan_init() passes NUMA_NO_NODE instead of the actual node for the kernel image memory. 
Fixes: f9040773b7bb ("arm64: move kernel image to base of vmalloc area") Signed-off-by: Catalin Marinas Reported-by: James Morse Acked-by: Ard Biesheuvel Acked-by: Mark Rutland Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 2f76969f2eef051bdd63d38b08d78e790440b0ad) Signed-off-by: Jeff Vander Stoep Change-Id: I8985e5b4628a9c7076767d4565f7633635813b5c --- arch/arm64/mm/kasan_init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 206dd95ea292..757009daa9ed 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -152,7 +152,8 @@ void __init kasan_init(void) clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); - vmemmap_populate(kimg_shadow_start, kimg_shadow_end, NUMA_NO_NODE); + vmemmap_populate(kimg_shadow_start, kimg_shadow_end, + pfn_to_nid(virt_to_pfn(_text))); /* * vmemmap_populate() has populated the shadow region that covers the From 42f010ac16ad91ca029ecb77dbf18ca95d2b9598 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 26 Jul 2016 10:16:55 -0700 Subject: [PATCH 0455/3220] UPSTREAM: arm64: Only select ARM64_MODULE_PLTS if MODULES=y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Selecting CONFIG_RANDOMIZE_BASE=y and CONFIG_MODULES=n fails to build the module PLTs support: CC arch/arm64/kernel/module-plts.o /work/Linux/linux-2.6-aarch64/arch/arm64/kernel/module-plts.c: In function ‘module_emit_plt_entry’: /work/Linux/linux-2.6-aarch64/arch/arm64/kernel/module-plts.c:32:49: error: dereferencing pointer to incomplete type ‘struct module’ This patch selects ARM64_MODULE_PLTS conditionally only if MODULES is enabled. Fixes: f80fb3a3d508 ("arm64: add support for kernel ASLR") Cc: # 4.6+ Reported-by: Jeff Vander Stoep Acked-by: Ard Biesheuvel Acked-by: Mark Rutland Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit b9c220b589daaf140f5b8ebe502c98745b94e65c) Signed-off-by: Jeff Vander Stoep Change-Id: I446cb3aa78f1c64b5aa1e2e90fda13f7d46cac33 --- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 0b5ec89a8ec4..bb92d3875791 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -784,7 +784,7 @@ config RELOCATABLE config RANDOMIZE_BASE bool "Randomize the address of the kernel image" - select ARM64_MODULE_PLTS + select ARM64_MODULE_PLTS if MODULES select RELOCATABLE help Randomizes the virtual address at which the kernel image is From 1488d59cb5fd80dea0abb78bafb5130b64f561db Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 24 Aug 2016 18:02:08 +0100 Subject: [PATCH 0456/3220] UPSTREAM: arm64: avoid TLB conflict with CONFIG_RANDOMIZE_BASE When CONFIG_RANDOMIZE_BASE is selected, we modify the page tables to remap the kernel at a newly-chosen VA range. We do this with the MMU disabled, but do not invalidate TLBs prior to re-enabling the MMU with the new tables. Thus the old mapping entries may still live in TLBs, and we risk violating Break-Before-Make requirements, leading to TLB conflicts and/or other issues. We invalidate TLBs when we uninstall the idmap in early setup code, but prior to this we are subject to issues relating to the Break-Before-Make violation. Avoid these issues by invalidating the TLBs before the new mappings can be used by the hardware.
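The ordering the patch enforces can be sketched as follows; this is an illustration only, since the kernel issues the sequence from head.S assembly while the MMU is still off:

static inline void enable_mmu_with_new_tables(unsigned long sctlr)
{
	asm volatile(
	"	tlbi	vmalle1\n"	/* remove stale EL1 entries from the old mapping */
	"	dsb	nsh\n"		/* complete the invalidation... */
	"	msr	sctlr_el1, %0\n"	/* ...before the MMU can walk the new tables */
	"	isb\n"
	: : "r" (sctlr) : "memory");
}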
Fixes: f80fb3a3d508 ("arm64: add support for kernel ASLR") Cc: # 4.6+ Acked-by: Ard Biesheuvel Acked-by: Will Deacon Signed-off-by: Mark Rutland Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit fd363bd417ddb6103564c69cfcbd92d9a7877431) Signed-off-by: Jeff Vander Stoep Change-Id: I6c23ce55cdd8b66587b6787b8f28df8535e39f24 --- arch/arm64/kernel/head.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index a88a15447c3b..54fd1e2590a2 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -738,6 +738,9 @@ __enable_mmu: isb bl __create_page_tables // recreate kernel mapping + tlbi vmalle1 // Remove any stale TLB entries + dsb nsh + msr sctlr_el1, x19 // re-enable the MMU isb ic iallu // flush instructions fetched From 542ecd89801bcab7306825c0fbc6231cd67e170f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 8 Jun 2016 15:10:57 +0100 Subject: [PATCH 0457/3220] UPSTREAM: arm64: spinlock: fix spin_unlock_wait for LSE atomics Commit d86b8da04dfa ("arm64: spinlock: serialise spin_unlock_wait against concurrent lockers") fixed spin_unlock_wait for LL/SC-based atomics under the premise that the LSE atomics (in particular, the LDADDA instruction) are indivisible. Unfortunately, these instructions are only indivisible when used with the -AL (full ordering) suffix and, consequently, the same issue can theoretically be observed with LSE atomics, where a later (in program order) load can be speculated before the write portion of the atomic operation. This patch fixes the issue by performing a CAS of the lock once we've established that it's unlocked, in much the same way as the LL/SC code. Fixes: d86b8da04dfa ("arm64: spinlock: serialise spin_unlock_wait against concurrent lockers") Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 3a5facd09da848193f5bcb0dea098a298bc1a29d) Signed-off-by: Jeff Vander Stoep Change-Id: Icedaa4c508784bf43d0b5787586480fd668ccc49 --- arch/arm64/include/asm/spinlock.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index fc9682bfe002..deb6e5ac891f 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -37,13 +37,17 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) "2: ldaxr %w0, %2\n" " eor %w1, %w0, %w0, ror #16\n" " cbnz %w1, 1b\n" + /* Serialise against any concurrent lockers */ ARM64_LSE_ATOMIC_INSN( /* LL/SC */ " stxr %w1, %w0, %2\n" -" cbnz %w1, 2b\n", /* Serialise against any concurrent lockers */ - /* LSE atomics */ " nop\n" -" nop\n") +" nop\n", + /* LSE atomics */ +" mov %w1, %w0\n" +" cas %w0, %w0, %2\n" +" eor %w1, %w1, %w0\n") +" cbnz %w1, 2b\n" : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock) : : "memory"); From 8ab85055db1054611473cfd1d72512e45b37e251 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:12 -0800 Subject: [PATCH 0458/3220] UPSTREAM: asm-generic: Consolidate mark_rodata_ro() Instead of defining mark_rodata_ro() in each architecture, consolidate it. Signed-off-by: Kees Cook Acked-by: Will Deacon Cc: Andrew Morton Cc: Andy Gross Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Ashok Kumar Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Catalin Marinas Cc: Dan Williams Cc: David Brown Cc: David Hildenbrand Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Helge Deller Cc: James E.J. 
Bottomley Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Marc Zyngier Cc: Mark Rutland Cc: Mathias Krause Cc: Michael Ellerman Cc: Nicolas Pitre Cc: PaX Team Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Ross Zwisler Cc: Russell King Cc: Rusty Russell Cc: Stephen Boyd Cc: Thomas Gleixner Cc: Toshi Kani Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: linux-parisc@vger.kernel.org Link: http://lkml.kernel.org/r/1455748879-21872-2-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: Iec0c44b3f5d7948954da93fba6cb57888a2709de (cherry picked from commit e267d97b83d9cecc16c54825f9f3ac7f72dc1e1e) Signed-off-by: Sami Tolvanen --- arch/arm/include/asm/cacheflush.h | 1 - arch/arm64/include/asm/cacheflush.h | 4 ---- arch/parisc/include/asm/cacheflush.h | 4 ---- arch/x86/include/asm/cacheflush.h | 1 - include/linux/init.h | 4 ++++ 5 files changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index d5525bfc7e3e..9156fc303afd 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -491,7 +491,6 @@ static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif #ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); void set_kernel_text_rw(void); void set_kernel_text_ro(void); #else diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 7fc294c3bc5b..22dda613f9c9 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -156,8 +156,4 @@ int set_memory_rw(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); -#ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); -#endif - #endif diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 845272ce9cc5..7bd69bd43a01 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -121,10 +121,6 @@ flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vma } } -#ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); -#endif - #include #define ARCH_HAS_KMAP diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index e63aa38e85fb..c8cff75c5b21 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -92,7 +92,6 @@ void clflush_cache_range(void *addr, unsigned int size); #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) #ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); extern const int rodata_test_data; extern int kernel_set_to_readonly; void set_kernel_text_rw(void); diff --git a/include/linux/init.h b/include/linux/init.h index b449f378f995..aedb254abc37 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -142,6 +142,10 @@ void prepare_namespace(void); void __init load_default_modules(void); int __init init_rootfs(void); +#ifdef CONFIG_DEBUG_RODATA +void mark_rodata_ro(void); +#endif + extern void (*late_time_init)(void); extern bool initcall_debug; From 36e8afda0dc5ac915ad9a6742e5a8708ed656249 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:13 -0800 Subject: [PATCH 0459/3220] UPSTREAM: mm/init: Add 'rodata=off' boot cmdline parameter to disable read-only kernel mappings It may be useful to debug writes to the readonly sections of memory, so provide a cmdline 
"rodata=off" to allow for this. This can be expanded in the future to support "log" and "write" modes, but that will need to be architecture-specific. This also makes KDB software breakpoints more usable, as read-only mappings can now be disabled on any kernel. Suggested-by: H. Peter Anvin Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-3-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: I67b818ca390afdd42ab1c27cb4f8ac64bbdb3b65 (cherry picked from commit d2aa1acad22f1bdd0cfa67b3861800e392254454) Signed-off-by: Sami Tolvanen --- Documentation/kernel-parameters.txt | 4 ++++ init/main.c | 27 +++++++++++++++++++++++---- kernel/debug/kdb/kdb_bp.c | 4 +--- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index b02f027cba72..6bb9f95e7690 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3415,6 +3415,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ro [KNL] Mount root device read-only on boot + rodata= [KNL] + on Mark read-only kernel memory as read-only (default). + off Leave read-only kernel memory writable for debugging. + root= [KNL] Root filesystem See name_to_dev_t comment in init/do_mounts.c. diff --git a/init/main.c b/init/main.c index 9e64d7097f1a..fbafa271531c 100644 --- a/init/main.c +++ b/init/main.c @@ -93,9 +93,6 @@ static int kernel_init(void *); extern void init_IRQ(void); extern void fork_init(void); extern void radix_tree_init(void); -#ifndef CONFIG_DEBUG_RODATA -static inline void mark_rodata_ro(void) { } -#endif /* * Debug helper: via this flag we know that we are in 'early bootup code' @@ -929,6 +926,28 @@ static int try_to_run_init_process(const char *init_filename) static noinline void __init kernel_init_freeable(void); +#ifdef CONFIG_DEBUG_RODATA +static bool rodata_enabled = true; +static int __init set_debug_rodata(char *str) +{ + return strtobool(str, &rodata_enabled); +} +__setup("rodata=", set_debug_rodata); + +static void mark_readonly(void) +{ + if (rodata_enabled) + mark_rodata_ro(); + else + pr_info("Kernel memory protection disabled.\n"); +} +#else +static inline void mark_readonly(void) +{ + pr_warn("This architecture does not have kernel memory protection.\n"); +} +#endif + static int __ref kernel_init(void *unused) { int ret; @@ -937,7 +956,7 @@ static int __ref kernel_init(void *unused) /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); free_initmem(); - mark_rodata_ro(); + mark_readonly(); system_state = SYSTEM_RUNNING; numa_default_policy(); diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index e1dbf4a2c69e..90ff129c88a2 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -153,13 +153,11 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) } else { kdb_printf("%s: failed to set breakpoint at 0x%lx\n", __func__, bp->bp_addr); -#ifdef CONFIG_DEBUG_RODATA if (!bp->bp_type) { kdb_printf("Software breakpoints are unavailable.\n" - " Change the kernel CONFIG_DEBUG_RODATA=n\n" + " Boot the kernel with rodata=off\n" " OR use hw breaks: help bph\n"); } -#endif 
return 1; } return 0; From fa88d4cf695b9c3a0de87afaa32d59a3913cfd52 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:14 -0800 Subject: [PATCH 0460/3220] UPSTREAM: x86/mm: Always enable CONFIG_DEBUG_RODATA and remove the Kconfig option This removes the CONFIG_DEBUG_RODATA option and makes it always enabled. This simplifies the code and also makes it clearer that read-only mapped memory is just as fundamental a security feature in kernel-space as it is in user-space. Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-4-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: I3e79c7c4ead79a81c1445f1b3dd28003517faf18 (cherry picked from commit 9ccaf77cf05915f51231d158abfd5448aedde758) Signed-off-by: Sami Tolvanen --- arch/x86/Kconfig | 3 +++ arch/x86/Kconfig.debug | 18 +++--------------- arch/x86/include/asm/cacheflush.h | 5 ----- arch/x86/include/asm/kvm_para.h | 7 ------- arch/x86/include/asm/sections.h | 2 +- arch/x86/kernel/ftrace.c | 6 +++--- arch/x86/kernel/kgdb.c | 8 ++------ arch/x86/kernel/test_nx.c | 2 -- arch/x86/kernel/test_rodata.c | 2 +- arch/x86/kernel/vmlinux.lds.S | 25 +++++++++++-------------- arch/x86/mm/init_32.c | 3 --- arch/x86/mm/init_64.c | 3 --- arch/x86/mm/pageattr.c | 2 +- 13 files changed, 25 insertions(+), 61 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9e3b1e57499a..7fa0a1830ef6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -307,6 +307,9 @@ config ARCH_SUPPORTS_UPROBES config FIX_EARLYCON_MEM def_bool y +config DEBUG_RODATA + def_bool y + config PGTABLE_LEVELS int default 4 if X86_64 diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 137dfa96aa14..1f6c306a9a00 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -91,28 +91,16 @@ config EFI_PGT_DUMP issues with the mapping of the EFI runtime regions into that table. -config DEBUG_RODATA - bool "Write protect kernel read-only data structures" - default y - depends on DEBUG_KERNEL - ---help--- - Mark the kernel read-only data as write-protected in the pagetables, - in order to catch accidental (and incorrect) writes to such const - data. This is recommended so that we can catch kernel bugs sooner. - If in doubt, say "Y". - config DEBUG_RODATA_TEST - bool "Testcase for the DEBUG_RODATA feature" - depends on DEBUG_RODATA + bool "Testcase for the marking rodata read-only" default y ---help--- - This option enables a testcase for the DEBUG_RODATA - feature as well as for the change_page_attr() infrastructure. + This option enables a testcase for the setting rodata read-only + as well as for the change_page_attr() infrastructure. If in doubt, say "N" config DEBUG_WX bool "Warn on W+X mappings at boot" - depends on DEBUG_RODATA select X86_PTDUMP_CORE ---help--- Generate a warning if any W+X mappings are found at boot. 
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index c8cff75c5b21..61518cf79437 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -91,15 +91,10 @@ void clflush_cache_range(void *addr, unsigned int size); #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) -#ifdef CONFIG_DEBUG_RODATA extern const int rodata_test_data; extern int kernel_set_to_readonly; void set_kernel_text_rw(void); void set_kernel_text_ro(void); -#else -static inline void set_kernel_text_rw(void) { } -static inline void set_kernel_text_ro(void) { } -#endif #ifdef CONFIG_DEBUG_RODATA_TEST int rodata_test(void); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c1adf33fdd0d..bc62e7cbf1b1 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -17,15 +17,8 @@ static inline bool kvm_check_and_clear_guest_paused(void) } #endif /* CONFIG_KVM_GUEST */ -#ifdef CONFIG_DEBUG_RODATA #define KVM_HYPERCALL \ ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL) -#else -/* On AMD processors, vmcall will generate a trap that we will - * then rewrite to the appropriate instruction. - */ -#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" -#endif /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall * instruction. The hypervisor may replace it with something else but only the diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 0a5242428659..13b6cdd0af57 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -7,7 +7,7 @@ extern char __brk_base[], __brk_limit[]; extern struct exception_table_entry __stop___ex_table[]; -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) extern char __end_rodata_hpage_align[]; #endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 311bcf338f07..eb6bd34582c6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -81,9 +81,9 @@ within(unsigned long addr, unsigned long start, unsigned long end) static unsigned long text_ip_addr(unsigned long ip) { /* - * On x86_64, kernel text mappings are mapped read-only with - * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead - * of the kernel text mapping to modify the kernel text. + * On x86_64, kernel text mappings are mapped read-only, so we use + * the kernel identity mapping instead of the kernel text mapping + * to modify the kernel text. * * For 32bit kernels, these mappings are same and we can use * kernel identity mapping to modify code. 
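The ftrace comment above reduces to a single address translation; a sketch of what it describes, with an invented helper name (x86_64 assumed; text_ip_addr() performs exactly this conversion):

	/*
	 * Convert a kernel-text virtual address into its alias in the
	 * direct (identity) mapping, which remains writable even after
	 * the text mapping has been made read-only.
	 */
	static unsigned long text_to_identity_addr(unsigned long ip)
	{
		return (unsigned long)__va(__pa_symbol(ip));
	}
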
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 44256a62702b..ed15cd486d06 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -750,9 +750,7 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { int err; -#ifdef CONFIG_DEBUG_RODATA char opc[BREAK_INSTR_SIZE]; -#endif /* CONFIG_DEBUG_RODATA */ bpt->type = BP_BREAKPOINT; err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, @@ -761,7 +759,6 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) return err; err = probe_kernel_write((char *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); -#ifdef CONFIG_DEBUG_RODATA if (!err) return err; /* @@ -778,13 +775,12 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE)) return -EINVAL; bpt->type = BP_POKE_BREAKPOINT; -#endif /* CONFIG_DEBUG_RODATA */ + return err; } int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { -#ifdef CONFIG_DEBUG_RODATA int err; char opc[BREAK_INSTR_SIZE]; @@ -801,8 +797,8 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE)) goto knl_write; return err; + knl_write: -#endif /* CONFIG_DEBUG_RODATA */ return probe_kernel_write((char *)bpt->bpt_addr, (char *)bpt->saved_instr, BREAK_INSTR_SIZE); } diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index 3f92ce07e525..27538f183c3b 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -142,7 +142,6 @@ static int test_NX(void) * by the error message */ -#ifdef CONFIG_DEBUG_RODATA /* Test 3: Check if the .rodata section is executable */ if (rodata_test_data != 0xC3) { printk(KERN_ERR "test_nx: .rodata marker has invalid value\n"); @@ -151,7 +150,6 @@ static int test_NX(void) printk(KERN_ERR "test_nx: .rodata section is executable\n"); ret = -ENODEV; } -#endif #if 0 /* Test 4: Check if the .data section of a module is executable */ diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index 5ecbfe5099da..cb4a01b41e27 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c @@ -76,5 +76,5 @@ int rodata_test(void) } MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure"); +MODULE_DESCRIPTION("Testcase for marking rodata as read-only"); MODULE_AUTHOR("Arjan van de Ven "); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 74e4bf11f562..fe133b710bef 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -41,29 +41,28 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) /* - * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA - * we retain large page mappings for boundaries spanning kernel text, rodata - * and data sections. + * On 64-bit, align RODATA to 2MB so we retain large page mappings for + * boundaries spanning kernel text, rodata and data sections. * * However, kernel identity mappings will have different RWX permissions * to the pages mapping to text and to the pages padding (which are freed) the * text section. Hence kernel identity mappings will be broken to smaller * pages. For 64-bit, kernel text and kernel identity mappings are different, - * so we can enable protection checks that come with CONFIG_DEBUG_RODATA, - * as well as retain 2MB large page mappings for kernel text. 
+ * so we can enable protection checks as well as retain 2MB large page + * mappings for kernel text. */ -#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); +#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); -#define X64_ALIGN_DEBUG_RODATA_END \ +#define X64_ALIGN_RODATA_END \ . = ALIGN(HPAGE_SIZE); \ __end_rodata_hpage_align = .; #else -#define X64_ALIGN_DEBUG_RODATA_BEGIN -#define X64_ALIGN_DEBUG_RODATA_END +#define X64_ALIGN_RODATA_BEGIN +#define X64_ALIGN_RODATA_END #endif @@ -112,13 +111,11 @@ SECTIONS EXCEPTION_TABLE(16) :text = 0x9090 -#if defined(CONFIG_DEBUG_RODATA) /* .text should occupy whole number of pages */ . = ALIGN(PAGE_SIZE); -#endif - X64_ALIGN_DEBUG_RODATA_BEGIN + X64_ALIGN_RODATA_BEGIN RO_DATA(PAGE_SIZE) - X64_ALIGN_DEBUG_RODATA_END + X64_ALIGN_RODATA_END /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cb4ef3de61f9..2ebfbaf61142 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -871,7 +871,6 @@ static noinline int do_test_wp_bit(void) return flag; } -#ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); @@ -960,5 +959,3 @@ void mark_rodata_ro(void) if (__supported_pte_mask & _PAGE_NX) debug_checkwx(); } -#endif - diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ec081fe0ce2c..e08d141844ee 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1062,7 +1062,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -#ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); @@ -1154,8 +1153,6 @@ void mark_rodata_ro(void) debug_checkwx(); } -#endif - int kern_addr_valid(unsigned long addr) { unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a3137a4feed1..8d77c7f7a614 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -278,7 +278,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, __pa_symbol(__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) /* * Once the kernel maps the text as RO (kernel_set_to_readonly is set), * kernel text mappings for the large page aligned text, rodata sections From bfd3de058f94daa2a2751e7660bb9561fa23d7cd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:15 -0800 Subject: [PATCH 0461/3220] UPSTREAM: arch: Introduce post-init read-only memory One of the easiest ways to protect the kernel from attack is to reduce the internal attack surface exposed when a "write" flaw is available. By making as much of the kernel read-only as possible, we reduce the attack surface. Many things are written to only during __init, and never changed again. These cannot be made "const" since the compiler will do the wrong thing (we do actually need to write to them). Instead, move these items into a memory region that will be made read-only during mark_rodata_ro() which happens after all kernel __init code has finished. This introduces __ro_after_init as a way to mark such memory, and adds some documentation about the existing __read_mostly marking. This improves the security of the Linux kernel by marking formerly read-write memory regions as read-only on a fully booted up system. Based on work by PaX Team and Brad Spengler. 
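To make the new annotation concrete, here is a minimal usage sketch (the variable and initcall are hypothetical): the value is writable while __init code runs and becomes read-only once mark_rodata_ro() has been called:

	#include <linux/cache.h>
	#include <linux/init.h>
	#include <linux/random.h>

	static unsigned long boot_seed __ro_after_init;

	static int __init boot_seed_init(void)
	{
		/* legal: all initcalls run before mark_rodata_ro() */
		get_random_bytes(&boot_seed, sizeof(boot_seed));
		return 0;
	}
	early_initcall(boot_seed_init);

Any write to boot_seed after init then faults, just like a write to .rodata.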
Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brad Spengler Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-5-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: I640f6d858d9770a5e480d12a1c716adf8842feb0 (cherry picked from commit c74ba8b3480da6ddaea17df2263ec09b869ac496) Signed-off-by: Sami Tolvanen --- arch/parisc/include/asm/cache.h | 3 +++ include/asm-generic/vmlinux.lds.h | 1 + include/linux/cache.h | 14 ++++++++++++++ 3 files changed, 18 insertions(+) diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h index 3d0e17bcc8e9..df0f52bd18b4 100644 --- a/arch/parisc/include/asm/cache.h +++ b/arch/parisc/include/asm/cache.h @@ -22,6 +22,9 @@ #define __read_mostly __attribute__((__section__(".data..read_mostly"))) +/* Read-only memory is marked before mark_rodata_ro() is called. */ +#define __ro_after_init __read_mostly + void parisc_cache_init(void); /* initializes cache-flushing */ void disable_sr_hashing_asm(int); /* low level support for above */ void disable_sr_hashing(void); /* turns off space register hashing */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c4bd0e2c173c..772c784ba763 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -256,6 +256,7 @@ .rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_rodata) = .; \ *(.rodata) *(.rodata.*) \ + *(.data..ro_after_init) /* Read only after init */ \ *(__vermagic) /* Kernel version magic */ \ . = ALIGN(8); \ VMLINUX_SYMBOL(__start___tracepoints_ptrs) = .; \ diff --git a/include/linux/cache.h b/include/linux/cache.h index 17e7e82d2aa7..1be04f8c563a 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -12,10 +12,24 @@ #define SMP_CACHE_BYTES L1_CACHE_BYTES #endif +/* + * __read_mostly is used to keep rarely changing variables out of frequently + * updated cachelines. If an architecture doesn't support it, ignore the + * hint. + */ #ifndef __read_mostly #define __read_mostly #endif +/* + * __ro_after_init is used to mark things that are read-only after init (i.e. + * after mark_rodata_ro() has been called). These are effectively read-only, + * but may get written to during init, so can't live in .rodata (via "const"). + */ +#ifndef __ro_after_init +#define __ro_after_init __attribute__((__section__(".data..ro_after_init"))) +#endif + #ifndef ____cacheline_aligned #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) #endif From ba24621d7a79e3335f478fead5a84d6ee1a212ce Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:16 -0800 Subject: [PATCH 0462/3220] UPSTREAM: lkdtm: Verify that '__ro_after_init' works correctly The new __ro_after_init section should be writable before init, but not after. Validate that it gets updated at init and can't be written to afterwards. Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-6-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: I75301d0497fde49a02f13b4e75300111ddadda9d (cherry picked from commit 7cca071ccbd2a293ea69168ace6abbcdce53098e) Signed-off-by: Sami Tolvanen --- drivers/misc/lkdtm.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/misc/lkdtm.c b/drivers/misc/lkdtm.c index 11fdadc68e53..2a6eaf1122b4 100644 --- a/drivers/misc/lkdtm.c +++ b/drivers/misc/lkdtm.c @@ -103,6 +103,7 @@ enum ctype { CT_EXEC_USERSPACE, CT_ACCESS_USERSPACE, CT_WRITE_RO, + CT_WRITE_RO_AFTER_INIT, CT_WRITE_KERN, }; @@ -140,6 +141,7 @@ static char* cp_type[] = { "EXEC_USERSPACE", "ACCESS_USERSPACE", "WRITE_RO", + "WRITE_RO_AFTER_INIT", "WRITE_KERN", }; @@ -162,6 +164,7 @@ static DEFINE_SPINLOCK(lock_me_up); static u8 data_area[EXEC_SIZE]; static const unsigned long rodata = 0xAA55AA55; +static unsigned long ro_after_init __ro_after_init = 0x55AA5500; module_param(recur_count, int, 0644); MODULE_PARM_DESC(recur_count, " Recursion level for the stack overflow test"); @@ -503,11 +506,28 @@ static void lkdtm_do_action(enum ctype which) break; } case CT_WRITE_RO: { - unsigned long *ptr; + /* Explicitly cast away "const" for the test. */ + unsigned long *ptr = (unsigned long *)&rodata; - ptr = (unsigned long *)&rodata; + pr_info("attempting bad rodata write at %p\n", ptr); + *ptr ^= 0xabcd1234; - pr_info("attempting bad write at %p\n", ptr); + break; + } + case CT_WRITE_RO_AFTER_INIT: { + unsigned long *ptr = &ro_after_init; + + /* + * Verify we were written to during init. Since an Oops + * is considered a "success", a failure is to just skip the + * real test. + */ + if ((*ptr & 0xAA) != 0xAA) { + pr_info("%p was NOT written during init!?\n", ptr); + break; + } + + pr_info("attempting bad ro_after_init write at %p\n", ptr); *ptr ^= 0xabcd1234; break; @@ -817,6 +837,9 @@ static int __init lkdtm_module_init(void) int n_debugfs_entries = 1; /* Assume only the direct entry */ int i; + /* Make sure we can write to __ro_after_init values during __init */ + ro_after_init |= 0xAA; + /* Register debugfs interface */ lkdtm_debugfs_root = debugfs_create_dir("provoke-crash", NULL); if (!lkdtm_debugfs_root) { From 60e8ff20653571ccab43f9f4554baaa26436e34b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:17 -0800 Subject: [PATCH 0463/3220] UPSTREAM: x86/vdso: Mark the vDSO code read-only after init The vDSO does not need to be writable after __init, so mark it as __ro_after_init. 
The result kills the exploit method of writing to the vDSO from kernel space resulting in userspace executing the modified code, as shown here to bypass SMEP restrictions: http://itszn.com/blog/?p=21 The memory map (with added vDSO address reporting) shows the vDSO moving into read-only memory: Before: [ 0.143067] vDSO @ ffffffff82004000 [ 0.143551] vDSO @ ffffffff82006000 ---[ High Kernel Mapping ]--- 0xffffffff80000000-0xffffffff81000000 16M pmd 0xffffffff81000000-0xffffffff81800000 8M ro PSE GLB x pmd 0xffffffff81800000-0xffffffff819f3000 1996K ro GLB x pte 0xffffffff819f3000-0xffffffff81a00000 52K ro NX pte 0xffffffff81a00000-0xffffffff81e00000 4M ro PSE GLB NX pmd 0xffffffff81e00000-0xffffffff81e05000 20K ro GLB NX pte 0xffffffff81e05000-0xffffffff82000000 2028K ro NX pte 0xffffffff82000000-0xffffffff8214f000 1340K RW GLB NX pte 0xffffffff8214f000-0xffffffff82281000 1224K RW NX pte 0xffffffff82281000-0xffffffff82400000 1532K RW GLB NX pte 0xffffffff82400000-0xffffffff83200000 14M RW PSE GLB NX pmd 0xffffffff83200000-0xffffffffc0000000 974M pmd After: [ 0.145062] vDSO @ ffffffff81da1000 [ 0.146057] vDSO @ ffffffff81da4000 ---[ High Kernel Mapping ]--- 0xffffffff80000000-0xffffffff81000000 16M pmd 0xffffffff81000000-0xffffffff81800000 8M ro PSE GLB x pmd 0xffffffff81800000-0xffffffff819f3000 1996K ro GLB x pte 0xffffffff819f3000-0xffffffff81a00000 52K ro NX pte 0xffffffff81a00000-0xffffffff81e00000 4M ro PSE GLB NX pmd 0xffffffff81e00000-0xffffffff81e0b000 44K ro GLB NX pte 0xffffffff81e0b000-0xffffffff82000000 2004K ro NX pte 0xffffffff82000000-0xffffffff8214c000 1328K RW GLB NX pte 0xffffffff8214c000-0xffffffff8227e000 1224K RW NX pte 0xffffffff8227e000-0xffffffff82400000 1544K RW GLB NX pte 0xffffffff82400000-0xffffffff83200000 14M RW PSE GLB NX pmd 0xffffffff83200000-0xffffffffc0000000 974M pmd Based on work by PaX Team and Brad Spengler. Signed-off-by: Kees Cook Acked-by: Andy Lutomirski Acked-by: H. Peter Anvin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brad Spengler Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-7-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: Iafbf7314a1106c297ea883031ee96c4f53c04a2b (cherry picked from commit 018ef8dcf3de5f62e2cc1a9273cc27e1c6ba8de5) Signed-off-by: Sami Tolvanen --- arch/x86/entry/vdso/vdso2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 0224987556ce..3f69326ed545 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -140,7 +140,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, fprintf(outfile, "#include \n"); fprintf(outfile, "\n"); fprintf(outfile, - "static unsigned char raw_data[%lu] __page_aligned_data = {", + "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {", mapping_size); for (j = 0; j < stripped_len; j++) { if (j % 10 == 0) From 3cfedbe672ad7fbd73cb5a073ee6a6f60f8130fa Mon Sep 17 00:00:00 2001 From: David Brown Date: Wed, 17 Feb 2016 14:41:18 -0800 Subject: [PATCH 0464/3220] UPSTREAM: ARM/vdso: Mark the vDSO code read-only after init Although the ARM vDSO is cleanly separated by code/data with the code being read-only in userspace mappings, the code page is still writable from the kernel. There have been exploits (such as http://itszn.com/blog/?p=21) that take advantage of this on x86 to go from a bad kernel write to full root. Prevent this specific exploit class on ARM as well by putting the vDSO code page in post-init read-only memory as well. Before: vdso: 1 text pages at base 80927000 root@Vexpress:/ cat /sys/kernel/debug/kernel_page_tables ---[ Modules ]--- ---[ Kernel Mapping ]--- 0x80000000-0x80100000 1M RW NX SHD 0x80100000-0x80600000 5M ro x SHD 0x80600000-0x80800000 2M ro NX SHD 0x80800000-0xbe000000 984M RW NX SHD After: vdso: 1 text pages at base 8072b000 root@Vexpress:/ cat /sys/kernel/debug/kernel_page_tables ---[ Modules ]--- ---[ Kernel Mapping ]--- 0x80000000-0x80100000 1M RW NX SHD 0x80100000-0x80600000 5M ro x SHD 0x80600000-0x80800000 2M ro NX SHD 0x80800000-0xbe000000 984M RW NX SHD Inspired by https://lkml.org/lkml/2016/1/19/494 based on work by the PaX Team, Brad Spengler, and Kees Cook. Signed-off-by: David Brown Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brad Spengler Cc: Brian Gerst Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: Nathan Lynch Cc: PaX Team Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1455748879-21872-8-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: I8d3cb7707644343aa907b2d584312ccdad63e270 (cherry picked from commit 11bf9b865898961cee60a41c483c9f27ec76e12e) Signed-off-by: Sami Tolvanen --- arch/arm/vdso/vdso.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm/vdso/vdso.S b/arch/arm/vdso/vdso.S index b2b97e3e7bab..a62a7b64f49c 100644 --- a/arch/arm/vdso/vdso.S +++ b/arch/arm/vdso/vdso.S @@ -23,9 +23,8 @@ #include #include - __PAGE_ALIGNED_DATA - .globl vdso_start, vdso_end + .section .data..ro_after_init .balign PAGE_SIZE vdso_start: .incbin "arch/arm/vdso/vdso.so" From 609abcd0c940b2fa0b7a548c58665bfb5c02d22b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 7 Jun 2016 12:20:51 +0200 Subject: [PATCH 0465/3220] UPSTREAM: vmlinux.lds.h: allow arch specific handling of ro_after_init data section commit c74ba8b3480d ("arch: Introduce post-init read-only memory") introduced the __ro_after_init attribute, which allows variables to be added to the ro_after_init data section. This new section was added to rodata, even though it contains writable data. This in turn causes problems on architectures which mark the page table entries read-only that point to rodata very early. This patch allows architectures to implement their own handling of the .data..ro_after_init section. Usually that would be: - mark the rodata section read-only very early - mark the ro_after_init section read-only within mark_rodata_ro Signed-off-by: Heiko Carstens Reviewed-by: Kees Cook Signed-off-by: Martin Schwidefsky Bug: 31660652 Change-Id: If68cb4d86f88678c9bac8c47072775ab85ef5770 (cherry picked from commit 32fb2fc5c357fb99616bbe100dbcb27bc7f5d045) Signed-off-by: Sami Tolvanen --- include/asm-generic/vmlinux.lds.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 772c784ba763..33eb2fe23071 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -248,6 +248,14 @@ . = ALIGN(align); \ *(.data..init_task) +/* + * Allow architectures to handle ro_after_init data on their + * own by defining an empty RO_AFTER_INIT_DATA. + */ +#ifndef RO_AFTER_INIT_DATA +#define RO_AFTER_INIT_DATA *(.data..ro_after_init) +#endif + /* * Read only Data */ @@ -256,7 +264,7 @@ .rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_rodata) = .; \ *(.rodata) *(.rodata.*) \ - *(.data..ro_after_init) /* Read only after init */ \ + RO_AFTER_INIT_DATA /* Read only after init */ \ *(__vermagic) /* Kernel version magic */ \ . 
= ALIGN(8); \ VMLINUX_SYMBOL(__start___tracepoints_ptrs) = .; \ From 4055d6b56ac1de5b900ce6f0b484996aa357ee66 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Fri, 23 Sep 2016 10:44:37 -0700 Subject: [PATCH 0466/3220] android-base.cfg: Enable kernel ASLR Bug: 30369029 Change-Id: I0c1c932255866f308d67de1df2ad52c9c19c4799 --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 34fa64a669ac..1ebef5c69fb6 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -139,6 +139,7 @@ CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y CONFIG_PROFILING=y CONFIG_QUOTA=y +CONFIG_RANDOMIZE_BASE=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y CONFIG_SECCOMP=y From 494a59a68340b0244cd6edf7a367e9233b006a3f Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 30 Aug 2016 17:19:13 -0400 Subject: [PATCH 0467/3220] BACKPORT: audit: consistently record PIDs with task_tgid_nr() Unfortunately we record PIDs in audit records using a variety of methods despite the correct way being the use of task_tgid_nr(). This patch converts all of these callers, except for the case of AUDIT_SET in audit_receive_msg() (see the comment in the code). Reported-by: Jeff Vander Stoep Signed-off-by: Paul Moore Bug: 28952093 (cherry picked from commit fa2bea2f5cca5b8d4a3e5520d2e8c0ede67ac108) Signed-off-by: Jeff Vander Stoep Change-Id: If6645f9de8bc58ed9755f28dc6af5fbf08d72a00 --- kernel/audit.c | 8 +++++++- kernel/auditsc.c | 12 ++++++------ security/lsm_audit.c | 4 ++-- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/kernel/audit.c b/kernel/audit.c index 5ffcbd354a52..34f690b9213a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -870,6 +870,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } if (s.mask & AUDIT_STATUS_PID) { + /* NOTE: we are using task_tgid_vnr() below because + * the s.pid value is relative to the namespace + * of the caller; at present this doesn't matter + * much since you can really only run auditd + * from the initial pid namespace, but something + * to keep in mind if this changes */ int new_pid = s.pid; if ((!new_pid) && (task_tgid_vnr(current) != audit_pid)) @@ -1896,7 +1902,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", task_ppid_nr(tsk), - task_pid_nr(tsk), + task_tgid_nr(tsk), from_kuid(&init_user_ns, audit_get_loginuid(tsk)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 48f45987dc6c..63f0e495f517 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -458,7 +458,7 @@ static int audit_filter_rules(struct task_struct *tsk, switch (f->type) { case AUDIT_PID: - pid = task_pid_nr(tsk); + pid = task_tgid_nr(tsk); result = audit_comparator(pid, f->op, f->val); break; case AUDIT_PPID: @@ -1987,7 +1987,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; - audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); + audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); audit_log_task_context(ab); audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d", oldloginuid, loginuid, oldsessionid, sessionid, !rc); @@ -2212,7 +2212,7 @@ void __audit_ptrace(struct task_struct *t) { struct audit_context *context = current->audit_context; 
- context->target_pid = task_pid_nr(t); + context->target_pid = task_tgid_nr(t); context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); @@ -2237,7 +2237,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { - audit_sig_pid = task_pid_nr(tsk); + audit_sig_pid = task_tgid_nr(tsk); if (uid_valid(tsk->loginuid)) audit_sig_uid = tsk->loginuid; else @@ -2337,7 +2337,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, void __audit_log_capset(const struct cred *new, const struct cred *old) { struct audit_context *context = current->audit_context; - context->capset.pid = task_pid_nr(current); + context->capset.pid = task_tgid_nr(current); context->capset.cap.effective = new->cap_effective; context->capset.cap.inheritable = new->cap_effective; context->capset.cap.permitted = new->cap_permitted; @@ -2369,7 +2369,7 @@ static void audit_log_task(struct audit_buffer *ab) from_kgid(&init_user_ns, gid), sessionid); audit_log_task_context(ab); - audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); + audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); } diff --git a/security/lsm_audit.c b/security/lsm_audit.c index cccbf3068cdc..45d927ab807d 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -220,7 +220,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, */ BUILD_BUG_ON(sizeof(a->u) > sizeof(void *)*2); - audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); + audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, memcpy(comm, current->comm, sizeof(comm))); switch (a->type) { @@ -294,7 +294,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, case LSM_AUDIT_DATA_TASK: { struct task_struct *tsk = a->u.tsk; if (tsk) { - pid_t pid = task_pid_nr(tsk); + pid_t pid = task_tgid_nr(tsk); if (pid) { char comm[sizeof(tsk->comm)]; audit_log_format(ab, " opid=%d ocomm=", pid); From 1daecfe215ea3379fdc842f264607d37dd8fe091 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 27 Sep 2016 13:48:29 -0700 Subject: [PATCH 0468/3220] ANDROID: dm: android-verity: Remove fec_header location constraint This CL removes the mandate of the fec_header being located right after the ECC data. 
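For context, the dropped constraint tied the header's position to the data layout; in terms of the fields visible in the diff below, the removed check enforced roughly:

	/*
	 * Old requirement, now removed: the ECC data had to end exactly
	 * where the header sits, i.e.
	 *
	 *   [ input data (inp_size) ][ ECC data (fec_size) ][ fec_header ]
	 *                                                   ^ offset
	 */
	if (offset < le32_to_cpu(header->fec_size) ||
	    offset - le32_to_cpu(header->fec_size) != le64_to_cpu(header->inp_size))
		return -EINVAL;

After this patch only the header's own fields (version, size, roots, and so on) are validated.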
(Cherry-picked from https://android-review.googlesource.com/#/c/280401) Bug: 28865197 Signed-off-by: Badhri Jagan Sridharan Change-Id: Ie04c8cf2dd755f54d02dbdc4e734a13d6f6507b5 --- drivers/md/dm-android-verity.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 15ce2a81c1f4..bb6c1285e499 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -266,10 +266,7 @@ static inline int validate_fec_header(struct fec_header *header, u64 offset) le32_to_cpu(header->version) != FEC_VERSION || le32_to_cpu(header->size) != sizeof(struct fec_header) || le32_to_cpu(header->roots) == 0 || - le32_to_cpu(header->roots) >= FEC_RSM || - offset < le32_to_cpu(header->fec_size) || - offset - le32_to_cpu(header->fec_size) != - le64_to_cpu(header->inp_size)) + le32_to_cpu(header->roots) >= FEC_RSM) return -EINVAL; return 0; From bb849676d5a9351c7d263c20b5ef9e67ec463412 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Nov 2015 13:28:15 +0100 Subject: [PATCH 0469/3220] UPSTREAM: mm/memblock: add MEMBLOCK_NOMAP attribute to memblock memory table This introduces the MEMBLOCK_NOMAP attribute and the required plumbing to make it usable as an indicator that some parts of normal memory should not be covered by the kernel direct mapping. It is up to the arch to actually honor the attribute when laying out this mapping, but the memblock code itself is modified to disregard these regions for allocations and other general use. Cc: linux-mm@kvack.org Cc: Alexander Kuleshov Cc: Andrew Morton Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: I55cd3abdf514ac54c071fa0037d8dac73bda798d (cherry picked from commit bf3d3cc580f9960883ebf9ea05868f336d9491c2) Signed-off-by: Sami Tolvanen --- include/linux/memblock.h | 8 ++++++++ mm/memblock.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 24daf8fc4d7c..fec66f86eeff 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -25,6 +25,7 @@ enum { MEMBLOCK_NONE = 0x0, /* No special request */ MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */ MEMBLOCK_MIRROR = 0x2, /* mirrored region */ + MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */ }; struct memblock_region { @@ -82,6 +83,7 @@ bool memblock_overlaps_region(struct memblock_type *type, int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); +int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); ulong choose_memblock_flags(void); /* Low level functions */ @@ -184,6 +186,11 @@ static inline bool memblock_is_mirror(struct memblock_region *m) return m->flags & MEMBLOCK_MIRROR; } +static inline bool memblock_is_nomap(struct memblock_region *m) +{ + return m->flags & MEMBLOCK_NOMAP; +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); @@ -319,6 +326,7 @@ phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); void memblock_enforce_memory_limit(phys_addr_t memory_limit); int memblock_is_memory(phys_addr_t addr); +int memblock_is_map_memory(phys_addr_t addr); int memblock_is_region_memory(phys_addr_t base, phys_addr_t size); int memblock_is_reserved(phys_addr_t addr); bool memblock_is_region_reserved(phys_addr_t 
base, phys_addr_t size); diff --git a/mm/memblock.c b/mm/memblock.c index d300f1329814..07ff069fef25 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -822,6 +822,17 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR); } +/** + * memblock_mark_nomap - Mark a memory region with flag MEMBLOCK_NOMAP. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return 0 on success, -errno on failure. + */ +int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP); +} /** * __next_reserved_mem_region - next function for for_each_reserved_region() @@ -913,6 +924,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) continue; + /* skip nomap memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -1022,6 +1037,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) continue; + /* skip nomap memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -1519,6 +1538,15 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) return memblock_search(&memblock.memory, addr) != -1; } +int __init_memblock memblock_is_map_memory(phys_addr_t addr) +{ + int i = memblock_search(&memblock.memory, addr); + + if (i == -1) + return false; + return !memblock_is_nomap(&memblock.memory.regions[i]); +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int __init_memblock memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn) From b6535fe666034be99b777a0dfad672636e17de35 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Nov 2015 13:28:16 +0100 Subject: [PATCH 0470/3220] BACKPORT: arm64: only consider memblocks with NOMAP cleared for linear mapping Take the new memblock attribute MEMBLOCK_NOMAP into account when deciding whether a certain region is or should be covered by the kernel direct mapping. 
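A minimal usage sketch from the platform side (base and size invented for illustration): the region is registered as memory but never enters the linear mapping, and with the arm64 change below pfn_valid() then returns false for its pages:

	#include <linux/init.h>
	#include <linux/memblock.h>

	#define FW_BASE	0x80000000UL	/* hypothetical firmware-owned RAM */
	#define FW_SIZE	0x00200000UL

	static void __init keep_fw_region_unmapped(void)
	{
		memblock_add(FW_BASE, FW_SIZE);		/* account it as memory... */
		memblock_mark_nomap(FW_BASE, FW_SIZE);	/* ...but keep it unmapped */
	}
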
Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: Id7346a09bb3aee5e9a5ef8812251f80cf8265532 (cherry picked from commit 68709f45385aeddb0ca96a060c0c8259944f321b) Signed-off-by: Sami Tolvanen --- arch/arm64/mm/init.c | 2 +- arch/arm64/mm/mmu.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 2c38be3df4c8..08aa45ff05b6 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -131,7 +131,7 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) int pfn_valid(unsigned long pfn) { - return (pfn & PFN_MASK) == pfn && memblock_is_memory(pfn << PAGE_SHIFT); + return (pfn & PFN_MASK) == pfn && memblock_is_map_memory(pfn << PAGE_SHIFT); } EXPORT_SYMBOL(pfn_valid); #endif diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 72e634533a3f..9dafcfde7c26 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -438,6 +438,8 @@ static void __init map_mem(pgd_t *pgd) if (start >= end) break; + if (memblock_is_nomap(reg)) + continue; __map_memblock(pgd, start, end); } From 4ff6ae812864e5122da39ca70072987ce047f374 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Nov 2015 13:28:17 +0100 Subject: [PATCH 0471/3220] UPSTREAM: arm64/efi: mark UEFI reserved regions as MEMBLOCK_NOMAP Change the EFI memory reservation logic to use memblock_mark_nomap() rather than memblock_reserve() to mark UEFI reserved regions as occupied. In addition to reserving them against allocations done by memblock, this will also prevent them from being covered by the linear mapping. Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: Ia3ce78f40f8d41a9afdd42238fe9cbfd81bbff08 (cherry picked from commit 4dffbfc48d65e5d8157a634fd670065d237a9377) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/efi.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 4eeb17198cfa..04531d35f1df 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -187,7 +187,7 @@ static __init void reserve_regions(void) early_init_dt_add_memory_arch(paddr, size); if (is_reserve_region(md)) { - memblock_reserve(paddr, size); + memblock_mark_nomap(paddr, size); if (efi_enabled(EFI_DBG)) pr_cont("*"); } @@ -209,8 +209,6 @@ void __init efi_init(void) efi_system_table = params.system_table; - memblock_reserve(params.mmap & PAGE_MASK, - PAGE_ALIGN(params.mmap_size + (params.mmap & ~PAGE_MASK))); memmap.phys_map = params.mmap; memmap.map = early_memremap(params.mmap, params.mmap_size); if (memmap.map == NULL) { @@ -230,6 +228,9 @@ void __init efi_init(void) reserve_regions(); early_memunmap(memmap.map, params.mmap_size); + memblock_mark_nomap(params.mmap & PAGE_MASK, + PAGE_ALIGN(params.mmap_size + + (params.mmap & ~PAGE_MASK))); } static bool __init efi_virtmap_init(void) From 0467f02fb679b2de7c583af61cb6e48aca8237fc Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Nov 2015 13:28:18 +0100 Subject: [PATCH 0472/3220] UPSTREAM: arm64/efi: split off EFI init and runtime code for reuse by 32-bit ARM This splits off the early EFI init and runtime code that - discovers the EFI params and the memory map from the FDT, and installs the memblocks and config tables. - prepares and installs the EFI page tables so that UEFI Runtime Services can be invoked at the virtual address installed by the stub. This will allow it to be reused for 32-bit ARM. 
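Both halves of the split revolve around the same walk over the UEFI memory map; the recurring idiom, for reference (as used throughout the code being moved below):

	efi_memory_desc_t *md;

	for_each_efi_memory_desc(&memmap, md) {
		if (!(md->attribute & EFI_MEMORY_RUNTIME))
			continue;	/* only runtime regions get virtual mappings */
		/* region: md->num_pages << EFI_PAGE_SHIFT bytes at
		 * md->phys_addr, remapped by the stub at md->virt_addr */
	}
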
Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: I143e4b38a5426f70027eff6cc5f732ac370ae69d (cherry picked from commit e5bc22a42e4d46cc203fdfb6d2c76202b08666a0) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/efi.c | 326 +---------------------------- drivers/firmware/efi/Makefile | 3 + drivers/firmware/efi/arm-init.c | 208 ++++++++++++++++++ drivers/firmware/efi/arm-runtime.c | 151 +++++++++++++ 4 files changed, 363 insertions(+), 325 deletions(-) create mode 100644 drivers/firmware/efi/arm-init.c create mode 100644 drivers/firmware/efi/arm-runtime.c diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 04531d35f1df..bd3b2f5adf0c 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -11,318 +11,11 @@ * */ -#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include -#include #include -#include -#include -#include -#include - -struct efi_memory_map memmap; - -static u64 efi_system_table; - -static pgd_t efi_pgd[PTRS_PER_PGD] __page_aligned_bss; - -static struct mm_struct efi_mm = { - .mm_rb = RB_ROOT, - .pgd = efi_pgd, - .mm_users = ATOMIC_INIT(2), - .mm_count = ATOMIC_INIT(1), - .mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem), - .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), - .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), -}; - -static int __init is_normal_ram(efi_memory_desc_t *md) -{ - if (md->attribute & EFI_MEMORY_WB) - return 1; - return 0; -} - -/* - * Translate a EFI virtual address into a physical address: this is necessary, - * as some data members of the EFI system table are virtually remapped after - * SetVirtualAddressMap() has been called. - */ -static phys_addr_t efi_to_phys(unsigned long addr) -{ - efi_memory_desc_t *md; - - for_each_efi_memory_desc(&memmap, md) { - if (!(md->attribute & EFI_MEMORY_RUNTIME)) - continue; - if (md->virt_addr == 0) - /* no virtual mapping has been installed by the stub */ - break; - if (md->virt_addr <= addr && - (addr - md->virt_addr) < (md->num_pages << EFI_PAGE_SHIFT)) - return md->phys_addr + addr - md->virt_addr; - } - return addr; -} - -static int __init uefi_init(void) -{ - efi_char16_t *c16; - void *config_tables; - u64 table_size; - char vendor[100] = "unknown"; - int i, retval; - - efi.systab = early_memremap(efi_system_table, - sizeof(efi_system_table_t)); - if (efi.systab == NULL) { - pr_warn("Unable to map EFI system table.\n"); - return -ENOMEM; - } - - set_bit(EFI_BOOT, &efi.flags); - set_bit(EFI_64BIT, &efi.flags); - - /* - * Verify the EFI Table - */ - if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) { - pr_err("System table signature incorrect\n"); - retval = -EINVAL; - goto out; - } - if ((efi.systab->hdr.revision >> 16) < 2) - pr_warn("Warning: EFI system table version %d.%02d, expected 2.00 or greater\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff); - - /* Show what we know for posterity */ - c16 = early_memremap(efi_to_phys(efi.systab->fw_vendor), - sizeof(vendor) * sizeof(efi_char16_t)); - if (c16) { - for (i = 0; i < (int) sizeof(vendor) - 1 && *c16; ++i) - vendor[i] = c16[i]; - vendor[i] = '\0'; - early_memunmap(c16, sizeof(vendor) * sizeof(efi_char16_t)); - } - - pr_info("EFI v%u.%.02u by %s\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff, vendor); - - table_size = sizeof(efi_config_table_64_t) * efi.systab->nr_tables; - config_tables = 
early_memremap(efi_to_phys(efi.systab->tables), - table_size); - if (config_tables == NULL) { - pr_warn("Unable to map EFI config table array.\n"); - retval = -ENOMEM; - goto out; - } - retval = efi_config_parse_tables(config_tables, efi.systab->nr_tables, - sizeof(efi_config_table_64_t), NULL); - - early_memunmap(config_tables, table_size); -out: - early_memunmap(efi.systab, sizeof(efi_system_table_t)); - return retval; -} - -/* - * Return true for RAM regions we want to permanently reserve. - */ -static __init int is_reserve_region(efi_memory_desc_t *md) -{ - switch (md->type) { - case EFI_LOADER_CODE: - case EFI_LOADER_DATA: - case EFI_BOOT_SERVICES_CODE: - case EFI_BOOT_SERVICES_DATA: - case EFI_CONVENTIONAL_MEMORY: - case EFI_PERSISTENT_MEMORY: - return 0; - default: - break; - } - return is_normal_ram(md); -} - -static __init void reserve_regions(void) -{ - efi_memory_desc_t *md; - u64 paddr, npages, size; - - if (efi_enabled(EFI_DBG)) - pr_info("Processing EFI memory map:\n"); - - for_each_efi_memory_desc(&memmap, md) { - paddr = md->phys_addr; - npages = md->num_pages; - - if (efi_enabled(EFI_DBG)) { - char buf[64]; - - pr_info(" 0x%012llx-0x%012llx %s", - paddr, paddr + (npages << EFI_PAGE_SHIFT) - 1, - efi_md_typeattr_format(buf, sizeof(buf), md)); - } - - memrange_efi_to_native(&paddr, &npages); - size = npages << PAGE_SHIFT; - - if (is_normal_ram(md)) - early_init_dt_add_memory_arch(paddr, size); - - if (is_reserve_region(md)) { - memblock_mark_nomap(paddr, size); - if (efi_enabled(EFI_DBG)) - pr_cont("*"); - } - - if (efi_enabled(EFI_DBG)) - pr_cont("\n"); - } - - set_bit(EFI_MEMMAP, &efi.flags); -} - -void __init efi_init(void) -{ - struct efi_fdt_params params; - - /* Grab UEFI information placed in FDT by stub */ - if (!efi_get_fdt_params(¶ms)) - return; - - efi_system_table = params.system_table; - - memmap.phys_map = params.mmap; - memmap.map = early_memremap(params.mmap, params.mmap_size); - if (memmap.map == NULL) { - /* - * If we are booting via UEFI, the UEFI memory map is the only - * description of memory we have, so there is little point in - * proceeding if we cannot access it. - */ - panic("Unable to map EFI memory map.\n"); - } - memmap.map_end = memmap.map + params.mmap_size; - memmap.desc_size = params.desc_size; - memmap.desc_version = params.desc_ver; - - if (uefi_init() < 0) - return; - - reserve_regions(); - early_memunmap(memmap.map, params.mmap_size); - memblock_mark_nomap(params.mmap & PAGE_MASK, - PAGE_ALIGN(params.mmap_size + - (params.mmap & ~PAGE_MASK))); -} - -static bool __init efi_virtmap_init(void) -{ - efi_memory_desc_t *md; - - init_new_context(NULL, &efi_mm); - - for_each_efi_memory_desc(&memmap, md) { - pgprot_t prot; - - if (!(md->attribute & EFI_MEMORY_RUNTIME)) - continue; - if (md->virt_addr == 0) - return false; - - pr_info(" EFI remap 0x%016llx => %p\n", - md->phys_addr, (void *)md->virt_addr); - - /* - * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be - * executable, everything else can be mapped with the XN bits - * set. 
- */ - if (!is_normal_ram(md)) - prot = __pgprot(PROT_DEVICE_nGnRE); - else if (md->type == EFI_RUNTIME_SERVICES_CODE || - !PAGE_ALIGNED(md->phys_addr)) - prot = PAGE_KERNEL_EXEC; - else - prot = PAGE_KERNEL; - - create_pgd_mapping(&efi_mm, md->phys_addr, md->virt_addr, - md->num_pages << EFI_PAGE_SHIFT, - __pgprot(pgprot_val(prot) | PTE_NG)); - } - return true; -} - -/* - * Enable the UEFI Runtime Services if all prerequisites are in place, i.e., - * non-early mapping of the UEFI system table and virtual mappings for all - * EFI_MEMORY_RUNTIME regions. - */ -static int __init arm64_enable_runtime_services(void) -{ - u64 mapsize; - - if (!efi_enabled(EFI_BOOT)) { - pr_info("EFI services will not be available.\n"); - return 0; - } - - if (efi_runtime_disabled()) { - pr_info("EFI runtime services will be disabled.\n"); - return 0; - } - - pr_info("Remapping and enabling EFI services.\n"); - - mapsize = memmap.map_end - memmap.map; - memmap.map = (__force void *)ioremap_cache(memmap.phys_map, - mapsize); - if (!memmap.map) { - pr_err("Failed to remap EFI memory map\n"); - return -ENOMEM; - } - memmap.map_end = memmap.map + mapsize; - efi.memmap = &memmap; - - efi.systab = (__force void *)ioremap_cache(efi_system_table, - sizeof(efi_system_table_t)); - if (!efi.systab) { - pr_err("Failed to remap EFI System Table\n"); - return -ENOMEM; - } - set_bit(EFI_SYSTEM_TABLES, &efi.flags); - - if (!efi_virtmap_init()) { - pr_err("No UEFI virtual mapping was installed -- runtime services will not be available\n"); - return -ENOMEM; - } - - /* Set up runtime services function pointers */ - efi_native_runtime_setup(); - set_bit(EFI_RUNTIME_SERVICES, &efi.flags); - - efi.runtime_version = efi.systab->hdr.revision; - - return 0; -} -early_initcall(arm64_enable_runtime_services); static int __init arm64_dmi_init(void) { @@ -338,23 +31,6 @@ static int __init arm64_dmi_init(void) } core_initcall(arm64_dmi_init); -static void efi_set_pgd(struct mm_struct *mm) -{ - switch_mm(NULL, mm, NULL); -} - -void efi_virtmap_load(void) -{ - preempt_disable(); - efi_set_pgd(&efi_mm); -} - -void efi_virtmap_unload(void) -{ - efi_set_pgd(current->active_mm); - preempt_enable(); -} - /* * UpdateCapsule() depends on the system being shutdown via * ResetSystem(). diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index ec379a4164cc..f292917b00e7 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -18,3 +18,6 @@ obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o obj-$(CONFIG_EFI_RUNTIME_WRAPPERS) += runtime-wrappers.o obj-$(CONFIG_EFI_STUB) += libstub/ obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_mem.o + +arm-obj-$(CONFIG_EFI) := arm-init.o arm-runtime.o +obj-$(CONFIG_ARM64) += $(arm-obj-y) diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c new file mode 100644 index 000000000000..ffdd76a51929 --- /dev/null +++ b/drivers/firmware/efi/arm-init.c @@ -0,0 +1,208 @@ +/* + * Extensible Firmware Interface + * + * Based on Extensible Firmware Interface Specification version 2.4 + * + * Copyright (C) 2013 - 2015 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include +#include +#include +#include +#include +#include + +#include + +struct efi_memory_map memmap; + +u64 efi_system_table; + +static int __init is_normal_ram(efi_memory_desc_t *md) +{ + if (md->attribute & EFI_MEMORY_WB) + return 1; + return 0; +} + +/* + * Translate a EFI virtual address into a physical address: this is necessary, + * as some data members of the EFI system table are virtually remapped after + * SetVirtualAddressMap() has been called. + */ +static phys_addr_t efi_to_phys(unsigned long addr) +{ + efi_memory_desc_t *md; + + for_each_efi_memory_desc(&memmap, md) { + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + if (md->virt_addr == 0) + /* no virtual mapping has been installed by the stub */ + break; + if (md->virt_addr <= addr && + (addr - md->virt_addr) < (md->num_pages << EFI_PAGE_SHIFT)) + return md->phys_addr + addr - md->virt_addr; + } + return addr; +} + +static int __init uefi_init(void) +{ + efi_char16_t *c16; + void *config_tables; + u64 table_size; + char vendor[100] = "unknown"; + int i, retval; + + efi.systab = early_memremap(efi_system_table, + sizeof(efi_system_table_t)); + if (efi.systab == NULL) { + pr_warn("Unable to map EFI system table.\n"); + return -ENOMEM; + } + + set_bit(EFI_BOOT, &efi.flags); + set_bit(EFI_64BIT, &efi.flags); + + /* + * Verify the EFI Table + */ + if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) { + pr_err("System table signature incorrect\n"); + retval = -EINVAL; + goto out; + } + if ((efi.systab->hdr.revision >> 16) < 2) + pr_warn("Warning: EFI system table version %d.%02d, expected 2.00 or greater\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff); + + /* Show what we know for posterity */ + c16 = early_memremap(efi_to_phys(efi.systab->fw_vendor), + sizeof(vendor) * sizeof(efi_char16_t)); + if (c16) { + for (i = 0; i < (int) sizeof(vendor) - 1 && *c16; ++i) + vendor[i] = c16[i]; + vendor[i] = '\0'; + early_memunmap(c16, sizeof(vendor) * sizeof(efi_char16_t)); + } + + pr_info("EFI v%u.%.02u by %s\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff, vendor); + + table_size = sizeof(efi_config_table_64_t) * efi.systab->nr_tables; + config_tables = early_memremap(efi_to_phys(efi.systab->tables), + table_size); + if (config_tables == NULL) { + pr_warn("Unable to map EFI config table array.\n"); + retval = -ENOMEM; + goto out; + } + retval = efi_config_parse_tables(config_tables, efi.systab->nr_tables, + sizeof(efi_config_table_64_t), NULL); + + early_memunmap(config_tables, table_size); +out: + early_memunmap(efi.systab, sizeof(efi_system_table_t)); + return retval; +} + +/* + * Return true for RAM regions we want to permanently reserve. 
+ */ +static __init int is_reserve_region(efi_memory_desc_t *md) +{ + switch (md->type) { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + case EFI_PERSISTENT_MEMORY: + return 0; + default: + break; + } + return is_normal_ram(md); +} + +static __init void reserve_regions(void) +{ + efi_memory_desc_t *md; + u64 paddr, npages, size; + + if (efi_enabled(EFI_DBG)) + pr_info("Processing EFI memory map:\n"); + + for_each_efi_memory_desc(&memmap, md) { + paddr = md->phys_addr; + npages = md->num_pages; + + if (efi_enabled(EFI_DBG)) { + char buf[64]; + + pr_info(" 0x%012llx-0x%012llx %s", + paddr, paddr + (npages << EFI_PAGE_SHIFT) - 1, + efi_md_typeattr_format(buf, sizeof(buf), md)); + } + + memrange_efi_to_native(&paddr, &npages); + size = npages << PAGE_SHIFT; + + if (is_normal_ram(md)) + early_init_dt_add_memory_arch(paddr, size); + + if (is_reserve_region(md)) { + memblock_mark_nomap(paddr, size); + if (efi_enabled(EFI_DBG)) + pr_cont("*"); + } + + if (efi_enabled(EFI_DBG)) + pr_cont("\n"); + } + + set_bit(EFI_MEMMAP, &efi.flags); +} + +void __init efi_init(void) +{ + struct efi_fdt_params params; + + /* Grab UEFI information placed in FDT by stub */ + if (!efi_get_fdt_params(¶ms)) + return; + + efi_system_table = params.system_table; + + memmap.phys_map = params.mmap; + memmap.map = early_memremap(params.mmap, params.mmap_size); + if (memmap.map == NULL) { + /* + * If we are booting via UEFI, the UEFI memory map is the only + * description of memory we have, so there is little point in + * proceeding if we cannot access it. + */ + panic("Unable to map EFI memory map.\n"); + } + memmap.map_end = memmap.map + params.mmap_size; + memmap.desc_size = params.desc_size; + memmap.desc_version = params.desc_ver; + + if (uefi_init() < 0) + return; + + reserve_regions(); + early_memunmap(memmap.map, params.mmap_size); + memblock_mark_nomap(params.mmap & PAGE_MASK, + PAGE_ALIGN(params.mmap_size + + (params.mmap & ~PAGE_MASK))); +} diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c new file mode 100644 index 000000000000..974743e13a4d --- /dev/null +++ b/drivers/firmware/efi/arm-runtime.c @@ -0,0 +1,151 @@ +/* + * Extensible Firmware Interface + * + * Based on Extensible Firmware Interface Specification version 2.4 + * + * Copyright (C) 2013, 2014 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static pgd_t efi_pgd[PTRS_PER_PGD] __page_aligned_bss; + +extern u64 efi_system_table; + +static struct mm_struct efi_mm = { + .mm_rb = RB_ROOT, + .pgd = efi_pgd, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), +}; + +static bool __init efi_virtmap_init(void) +{ + efi_memory_desc_t *md; + + init_new_context(NULL, &efi_mm); + + for_each_efi_memory_desc(&memmap, md) { + pgprot_t prot; + + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + if (md->virt_addr == 0) + return false; + + pr_info(" EFI remap 0x%016llx => %p\n", + md->phys_addr, (void *)md->virt_addr); + + /* + * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be + * executable, everything else can be mapped with the XN bits + * set. + */ + if ((md->attribute & EFI_MEMORY_WB) == 0) + prot = __pgprot(PROT_DEVICE_nGnRE); + else if (md->type == EFI_RUNTIME_SERVICES_CODE || + !PAGE_ALIGNED(md->phys_addr)) + prot = PAGE_KERNEL_EXEC; + else + prot = PAGE_KERNEL; + + create_pgd_mapping(&efi_mm, md->phys_addr, md->virt_addr, + md->num_pages << EFI_PAGE_SHIFT, + __pgprot(pgprot_val(prot) | PTE_NG)); + } + return true; +} + +/* + * Enable the UEFI Runtime Services if all prerequisites are in place, i.e., + * non-early mapping of the UEFI system table and virtual mappings for all + * EFI_MEMORY_RUNTIME regions. + */ +static int __init arm64_enable_runtime_services(void) +{ + u64 mapsize; + + if (!efi_enabled(EFI_BOOT)) { + pr_info("EFI services will not be available.\n"); + return 0; + } + + if (efi_runtime_disabled()) { + pr_info("EFI runtime services will be disabled.\n"); + return 0; + } + + pr_info("Remapping and enabling EFI services.\n"); + + mapsize = memmap.map_end - memmap.map; + memmap.map = (__force void *)ioremap_cache(memmap.phys_map, + mapsize); + if (!memmap.map) { + pr_err("Failed to remap EFI memory map\n"); + return -ENOMEM; + } + memmap.map_end = memmap.map + mapsize; + efi.memmap = &memmap; + + efi.systab = (__force void *)ioremap_cache(efi_system_table, + sizeof(efi_system_table_t)); + if (!efi.systab) { + pr_err("Failed to remap EFI System Table\n"); + return -ENOMEM; + } + set_bit(EFI_SYSTEM_TABLES, &efi.flags); + + if (!efi_virtmap_init()) { + pr_err("No UEFI virtual mapping was installed -- runtime services will not be available\n"); + return -ENOMEM; + } + + /* Set up runtime services function pointers */ + efi_native_runtime_setup(); + set_bit(EFI_RUNTIME_SERVICES, &efi.flags); + + efi.runtime_version = efi.systab->hdr.revision; + + return 0; +} +early_initcall(arm64_enable_runtime_services); + +static void efi_set_pgd(struct mm_struct *mm) +{ + switch_mm(NULL, mm, NULL); +} + +void efi_virtmap_load(void) +{ + preempt_disable(); + efi_set_pgd(&efi_mm); +} + +void efi_virtmap_unload(void) +{ + efi_set_pgd(current->active_mm); + preempt_enable(); +} From e5ef8437e39123daae78dbd9010c45226681d076 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Nov 2015 13:28:19 +0100 Subject: [PATCH 0473/3220] UPSTREAM: arm64/efi: refactor EFI init and runtime code for reuse by 32-bit ARM This refactors the EFI init and runtime code that will be shared between arm64 and ARM so that it can be built for both archs. 
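The heart of the refactor is that the per-region page-protection policy moves behind an arch hook; in outline (signature and call site taken from the diff below):

	/* arch code (arch/arm64/kernel/efi.c) chooses the protections: */
	int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md);

	/* generic arm-runtime.c just walks the map and delegates: */
	for_each_efi_memory_desc(&memmap, md) {
		if (!(md->attribute & EFI_MEMORY_RUNTIME))
			continue;
		if (efi_create_mapping(&efi_mm, md))
			return false;	/* arch could not map this region */
	}
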
Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: Ieee70bbe117170d2054a9c82c4f1a8143b7e302b (cherry picked from commit f7d924894265794f447ea799dd853400749b5a22) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/efi.h | 9 ++++++ arch/arm64/kernel/efi.c | 23 ++++++++++++++ drivers/firmware/efi/arm-init.c | 7 +++-- drivers/firmware/efi/arm-runtime.c | 48 ++++++++++-------------------- drivers/firmware/efi/efi.c | 2 ++ 5 files changed, 54 insertions(+), 35 deletions(-) diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index ef572206f1c3..8e88a696c9cb 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -2,7 +2,9 @@ #define _ASM_EFI_H #include +#include #include +#include #ifdef CONFIG_EFI extern void efi_init(void); @@ -10,6 +12,8 @@ extern void efi_init(void); #define efi_init() #endif +int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); + #define efi_call_virt(f, ...) \ ({ \ efi_##f##_t *__f; \ @@ -63,6 +67,11 @@ extern void efi_init(void); * Services are enabled and the EFI_RUNTIME_SERVICES bit set. */ +static inline void efi_set_pgd(struct mm_struct *mm) +{ + switch_mm(NULL, mm, NULL); +} + void efi_virtmap_load(void); void efi_virtmap_unload(void); diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index bd3b2f5adf0c..b6abc852f2a1 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -17,6 +17,29 @@ #include +int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) +{ + pteval_t prot_val; + + /* + * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be + * executable, everything else can be mapped with the XN bits + * set. + */ + if ((md->attribute & EFI_MEMORY_WB) == 0) + prot_val = PROT_DEVICE_nGnRE; + else if (md->type == EFI_RUNTIME_SERVICES_CODE || + !PAGE_ALIGNED(md->phys_addr)) + prot_val = pgprot_val(PAGE_KERNEL_EXEC); + else + prot_val = pgprot_val(PAGE_KERNEL); + + create_pgd_mapping(mm, md->phys_addr, md->virt_addr, + md->num_pages << EFI_PAGE_SHIFT, + __pgprot(prot_val | PTE_NG)); + return 0; +} + static int __init arm64_dmi_init(void) { /* diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index ffdd76a51929..9e15d571b53c 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -57,7 +57,7 @@ static int __init uefi_init(void) { efi_char16_t *c16; void *config_tables; - u64 table_size; + size_t table_size; char vendor[100] = "unknown"; int i, retval; @@ -69,7 +69,8 @@ static int __init uefi_init(void) } set_bit(EFI_BOOT, &efi.flags); - set_bit(EFI_64BIT, &efi.flags); + if (IS_ENABLED(CONFIG_64BIT)) + set_bit(EFI_64BIT, &efi.flags); /* * Verify the EFI Table @@ -107,7 +108,7 @@ static int __init uefi_init(void) goto out; } retval = efi_config_parse_tables(config_tables, efi.systab->nr_tables, - sizeof(efi_config_table_64_t), NULL); + sizeof(efi_config_table_t), NULL); early_memunmap(config_tables, table_size); out: diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 974743e13a4d..6ae21e41a429 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include @@ -23,18 +24,14 @@ #include #include -#include -#include #include +#include #include -static pgd_t efi_pgd[PTRS_PER_PGD] __page_aligned_bss; - extern u64 efi_system_table; static struct mm_struct efi_mm = { .mm_rb = RB_ROOT, - .pgd = efi_pgd, .mm_users = ATOMIC_INIT(2), 
 	.mm_count	= ATOMIC_INIT(1),
 	.mmap_sem	= __RWSEM_INITIALIZER(efi_mm.mmap_sem),
@@ -46,35 +43,27 @@ static bool __init efi_virtmap_init(void)
 {
 	efi_memory_desc_t *md;
 
+	efi_mm.pgd = pgd_alloc(&efi_mm);
 	init_new_context(NULL, &efi_mm);
 
 	for_each_efi_memory_desc(&memmap, md) {
-		pgprot_t prot;
+		phys_addr_t phys = md->phys_addr;
+		int ret;
 
 		if (!(md->attribute & EFI_MEMORY_RUNTIME))
 			continue;
 		if (md->virt_addr == 0)
 			return false;
 
-		pr_info("  EFI remap 0x%016llx => %p\n",
-			md->phys_addr, (void *)md->virt_addr);
-
-		/*
-		 * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be
-		 * executable, everything else can be mapped with the XN bits
-		 * set.
-		 */
-		if ((md->attribute & EFI_MEMORY_WB) == 0)
-			prot = __pgprot(PROT_DEVICE_nGnRE);
-		else if (md->type == EFI_RUNTIME_SERVICES_CODE ||
-			 !PAGE_ALIGNED(md->phys_addr))
-			prot = PAGE_KERNEL_EXEC;
-		else
-			prot = PAGE_KERNEL;
-
-		create_pgd_mapping(&efi_mm, md->phys_addr, md->virt_addr,
-				   md->num_pages << EFI_PAGE_SHIFT,
-				   __pgprot(pgprot_val(prot) | PTE_NG));
+		ret = efi_create_mapping(&efi_mm, md);
+		if (!ret) {
+			pr_info("  EFI remap %pa => %p\n",
+				&phys, (void *)(unsigned long)md->virt_addr);
+		} else {
+			pr_warn("  EFI remap %pa: failed to create mapping (%d)\n",
+				&phys, ret);
+			return false;
+		}
 	}
 	return true;
 }
@@ -84,7 +73,7 @@ static bool __init efi_virtmap_init(void)
  * non-early mapping of the UEFI system table and virtual mappings for all
  * EFI_MEMORY_RUNTIME regions.
  */
-static int __init arm64_enable_runtime_services(void)
+static int __init arm_enable_runtime_services(void)
 {
 	u64 mapsize;
 
@@ -131,12 +120,7 @@ static int __init arm64_enable_runtime_services(void)
 
 	return 0;
 }
-early_initcall(arm64_enable_runtime_services);
-
-static void efi_set_pgd(struct mm_struct *mm)
-{
-	switch_mm(NULL, mm, NULL);
-}
+early_initcall(arm_enable_runtime_services);
 
 void efi_virtmap_load(void)
 {
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 027ca212179f..cffa89b3317b 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -25,6 +25,8 @@
 #include
 #include
 
+#include
+
 struct efi __read_mostly efi = {
 	.mps = EFI_INVALID_TABLE_ADDR,
 	.acpi = EFI_INVALID_TABLE_ADDR,

From d5f46e4b247971c9c10f902151dcf7df8c750b1c Mon Sep 17 00:00:00 2001
From: Mark Rutland
Date: Thu, 5 Nov 2015 15:09:17 +0000
Subject: [PATCH 0474/3220] UPSTREAM: arm64: Add macros to read/write system
 registers

Rather than crafting custom macros for reading/writing each system
register, provide generic accessors, read_sysreg and write_sysreg, for
this purpose.
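
For example (an illustrative use, not part of this patch):

	u64 val = read_sysreg(cntkctl_el1);	/* emits: mrs %0, cntkctl_el1 */
	write_sysreg(val, cntkctl_el1);		/* emits: msr cntkctl_el1, %0 */

The register name is pasted into the mrs/msr instruction via
__stringify(), so a mistyped name is rejected by the assembler rather
than silently accepted at the C level.
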
Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Cc: Suzuki Poulose Cc: Will Deacon Signed-off-by: Marc Zyngier Change-Id: I1d6cf948bc6660dfd096ff5a18eba682941098c1 (cherry picked from commit 3600c2fdc09a43a30909743569e35a29121602ed) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/sysreg.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index b9fd8ec79033..1a78d6e2a78b 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -20,6 +20,8 @@ #ifndef __ASM_SYSREG_H #define __ASM_SYSREG_H +#include + #include /* @@ -215,6 +217,8 @@ #else +#include + asm( " .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" " .equ .L__reg_num_x\\num, \\num\n" @@ -239,6 +243,23 @@ static inline void config_sctlr_el1(u32 clear, u32 set) val |= set; asm volatile("msr sctlr_el1, %0" : : "r" (val)); } + +/* + * Unlike read_cpuid, calls to read_sysreg are never expected to be + * optimized away or replaced with synthetic values. + */ +#define read_sysreg(r) ({ \ + u64 __val; \ + asm volatile("mrs %0, " __stringify(r) : "=r" (__val)); \ + __val; \ +}) + +#define write_sysreg(v, r) do { \ + u64 __val = (u64)v; \ + asm volatile("msr " __stringify(r) ", %0" \ + : : "r" (__val)); \ +} while (0) + #endif #endif /* __ASM_SYSREG_H */ From 3e64a8612dfa8738b95ad904a454b859ec9d9174 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Wed, 24 Feb 2016 17:44:57 -0800 Subject: [PATCH 0475/3220] UPSTREAM: arm64: Add workaround for Cavium erratum 27456 On ThunderX T88 pass 1.x through 2.1 parts, broadcast TLBI instructions may cause the icache to become corrupted if it contains data for a non-current ASID. This patch implements the workaround (which invalidates the local icache when switching the mm) by using code patching. Signed-off-by: Andrew Pinski Signed-off-by: David Daney Reviewed-by: Will Deacon Signed-off-by: Catalin Marinas Change-Id: I60e6d17926b067a4e022d7b159e239114303a547 (cherry picked from commit 104a0c02e8b1936c049e18a6d4e4ab040fb61213) Signed-off-by: Sami Tolvanen --- Documentation/arm64/silicon-errata.txt | 1 + arch/arm64/Kconfig | 11 +++++++++++ arch/arm64/include/asm/cpufeature.h | 3 ++- arch/arm64/kernel/cpu_errata.c | 9 +++++++++ arch/arm64/mm/proc.S | 12 ++++++++++++ 5 files changed, 35 insertions(+), 1 deletion(-) diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt index 58b71ddf9b60..ba4b6acfc545 100644 --- a/Documentation/arm64/silicon-errata.txt +++ b/Documentation/arm64/silicon-errata.txt @@ -56,3 +56,4 @@ stable kernels. | | | | | | Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 | | Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 | +| Cavium | ThunderX Core | #27456 | CAVIUM_ERRATUM_27456 | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index bb92d3875791..bf24d67a1ff1 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -439,6 +439,17 @@ config CAVIUM_ERRATUM_23154 If unsure, say Y. +config CAVIUM_ERRATUM_27456 + bool "Cavium erratum 27456: Broadcast TLBI instructions may cause icache corruption" + default y + help + On ThunderX T88 pass 1.x through 2.1 parts, broadcast TLBI + instructions may cause the icache to become corrupted if it + contains data for a non-current ASID. The fix is to + invalidate the icache when changing the mm context. + + If unsure, say Y. 
+ endmenu diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 37a53fc6b384..727e594ac5c2 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -33,8 +33,9 @@ #define ARM64_HAS_NO_HW_PREFETCH 8 #define ARM64_HAS_UAO 9 #define ARM64_ALT_PAN_NOT_UAO 10 +#define ARM64_WORKAROUND_CAVIUM_27456 12 -#define ARM64_NCAPS 11 +#define ARM64_NCAPS 13 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index e6bc988e8dbf..06afd04e02c0 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -87,6 +87,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .capability = ARM64_WORKAROUND_CAVIUM_23154, MIDR_RANGE(MIDR_THUNDERX, 0x00, 0x01), }, +#endif +#ifdef CONFIG_CAVIUM_ERRATUM_27456 + { + /* Cavium ThunderX, T88 pass 1.x - 2.1 */ + .desc = "Cavium erratum 27456", + .capability = ARM64_WORKAROUND_CAVIUM_27456, + MIDR_RANGE(MIDR_THUNDERX, 0x00, + (1 << MIDR_VARIANT_SHIFT) | 1), + }, #endif { } diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 0c19534a901e..543f5198005a 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "proc-macros.S" @@ -137,7 +139,17 @@ ENTRY(cpu_do_switch_mm) bfi x0, x1, #48, #16 // set the ASID msr ttbr0_el1, x0 // set TTBR0 isb +alternative_if_not ARM64_WORKAROUND_CAVIUM_27456 ret + nop + nop + nop +alternative_else + ic iallu + dsb nsh + isb + ret +alternative_endif ENDPROC(cpu_do_switch_mm) .pushsection ".idmap.text", "ax" From b4ad81e8d05dc75ee527c56f6936ebc7fa3f46a4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Mar 2016 10:58:09 +0100 Subject: [PATCH 0476/3220] UPSTREAM: arm64/kernel: fix incorrect EL0 check in inv_entry macro The implementation of macro inv_entry refers to its 'el' argument without the required leading backslash, which results in an undefined symbol 'el' to be passed into the kernel_entry macro rather than the index of the exception level as intended. This undefined symbol strangely enough does not result in build failures, although it is visible in vmlinux: $ nm -n vmlinux |head U el 0000000000000000 A _kernel_flags_le_hi32 0000000000000000 A _kernel_offset_le_hi32 0000000000000000 A _kernel_size_le_hi32 000000000000000a A _kernel_flags_le_lo32 ..... However, it does result in incorrect code being generated for invalid exceptions taken from EL0, since the argument check in kernel_entry assumes EL1 if its argument does not equal '0'. 
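
Seen side by side (a sketch of the one-character difference):

	kernel_entry el, \regsize	// before: 'el' is an ordinary symbol reference
	kernel_entry \el, \regsize	// after: '\el' substitutes the macro argument

Only the second form passes the caller's exception level through; the
first hands kernel_entry the undefined symbol 'el', which compares
unequal to '0' and is therefore treated as EL1.
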
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Change-Id: I406c1207682a4dff3054a019c26fdf1310b08ed1 (cherry picked from commit b660950c60a7278f9d8deb7c32a162031207c758) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 1f7f5a2b61bf..12e8d2bcb3f9 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -277,7 +277,7 @@ END(vectors) * Invalid mode handlers */ .macro inv_entry, el, reason, regsize = 64 - kernel_entry el, \regsize + kernel_entry \el, \regsize mov x0, sp mov x1, #\reason mrs x2, esr_el1 From c1ee3c7d19d2735996d99f6c94a50ac77a9ea338 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 14:25:46 +0200 Subject: [PATCH 0477/3220] UPSTREAM: arm64/mm: ensure memstart_addr remains sufficiently aligned After choosing memstart_addr to be the highest multiple of ARM64_MEMSTART_ALIGN less than or equal to the first usable physical memory address, we clip the memblocks to the maximum size of the linear region. Since the kernel may be high up in memory, we take care not to clip the kernel itself, which means we have to clip some memory from the bottom if this occurs, to ensure that the distance between the first and the last usable physical memory address can be covered by the linear region. However, we fail to update memstart_addr if this clipping from the bottom occurs, which means that we may still end up with virtual addresses that wrap into the userland range. So increment memstart_addr as appropriate to prevent this from happening. Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: I72306207cc46a30b780f5e00b9ef23aa8409867e (cherry picked from commit 2958987f5da2ebcf6a237c5f154d7e3340e60945) Signed-off-by: Sami Tolvanen --- arch/arm64/mm/init.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 08aa45ff05b6..0db12e9e8765 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -192,8 +192,12 @@ void __init arm64_memblock_init(void) */ memblock_remove(max_t(u64, memstart_addr + linear_region_size, __pa(_end)), ULLONG_MAX); - if (memblock_end_of_DRAM() > linear_region_size) - memblock_remove(0, memblock_end_of_DRAM() - linear_region_size); + if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) { + /* ensure that memstart_addr remains sufficiently aligned */ + memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size, + ARM64_MEMSTART_ALIGN); + memblock_remove(0, memstart_addr); + } /* * Apply the memory limit if it was set. Since the kernel may be loaded From 9297d46fb6479dad23622f61760dd9fb1700442d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 14:25:47 +0200 Subject: [PATCH 0478/3220] UPSTREAM: arm64: choose memstart_addr based on minimum sparsemem section alignment This redefines ARM64_MEMSTART_ALIGN in terms of the minimal alignment required by sparsemem vmemmap. This comes down to using 1 GB for all translation granules if CONFIG_SPARSEMEM_VMEMMAP is enabled. 
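
Worked through for the definitions added below (assuming arm64's
SECTION_SIZE_BITS of 30 at this point in the tree):

	4K  pages: ARM64_MEMSTART_SHIFT = PUD_SHIFT     = 30  ->  1 GB
	16K pages: ARM64_MEMSTART_SHIFT = PMD_SHIFT + 5 = 30  ->  1 GB
	64K pages: ARM64_MEMSTART_SHIFT = PMD_SHIFT     = 29  ->  512 MB

With CONFIG_SPARSEMEM_VMEMMAP enabled, any shift below SECTION_SIZE_BITS
is rounded up to it, so ARM64_MEMSTART_ALIGN becomes 1 GB for all three
granules.
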
Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: I05b8bc6ab24f677f263b09d7c31fcce4f21269b1 (cherry picked from commit 06e9bf2fd9b372bc1c757995b6ca1cfab0720acb) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/kernel-pgtable.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 5c6375d8528b..7e51d1b57c0c 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -19,6 +19,7 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H +#include /* * The linear mapping and the start of memory are both 2M aligned (per @@ -86,10 +87,24 @@ * (64k granule), or a multiple that can be mapped using contiguous bits * in the page tables: 32 * PMD_SIZE (16k granule) */ -#ifdef CONFIG_ARM64_64K_PAGES -#define ARM64_MEMSTART_ALIGN SZ_512M +#if defined(CONFIG_ARM64_4K_PAGES) +#define ARM64_MEMSTART_SHIFT PUD_SHIFT +#elif defined(CONFIG_ARM64_16K_PAGES) +#define ARM64_MEMSTART_SHIFT (PMD_SHIFT + 5) #else -#define ARM64_MEMSTART_ALIGN SZ_1G +#define ARM64_MEMSTART_SHIFT PMD_SHIFT +#endif + +/* + * sparsemem vmemmap imposes an additional requirement on the alignment of + * memstart_addr, due to the fact that the base of the vmemmap region + * has a direct correspondence, and needs to appear sufficiently aligned + * in the virtual address space. + */ +#if defined(CONFIG_SPARSEMEM_VMEMMAP) && ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS +#define ARM64_MEMSTART_ALIGN (1UL << SECTION_SIZE_BITS) +#else +#define ARM64_MEMSTART_ALIGN (1UL << ARM64_MEMSTART_SHIFT) #endif #endif /* __ASM_KERNEL_PGTABLE_H */ From 29089dcc2c5272437ce9bbdae4ea249ed25e8281 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 13 Apr 2016 16:01:22 +0100 Subject: [PATCH 0479/3220] UPSTREAM: arm64: Implement ptep_set_access_flags() for hardware AF/DBM When hardware updates of the access and dirty states are enabled, the default ptep_set_access_flags() implementation based on calling set_pte_at() directly is potentially racy. This triggers the "racy dirty state clearing" warning in set_pte_at() because an existing writable PTE is overridden with a clean entry. There are two main scenarios for this situation: 1. The CPU getting an access fault does not support hardware updates of the access/dirty flags. However, a different agent in the system (e.g. SMMU) can do this, therefore overriding a writable entry with a clean one could potentially lose the automatically updated dirty status 2. A more complex situation is possible when all CPUs support hardware AF/DBM: a) Initial state: shareable + writable vma and pte_none(pte) b) Read fault taken by two threads of the same process on different CPUs c) CPU0 takes the mmap_sem and proceeds to handling the fault. It eventually reaches do_set_pte() which sets a writable + clean pte. CPU0 releases the mmap_sem d) CPU1 acquires the mmap_sem and proceeds to handle_pte_fault(). The pte entry it reads is present, writable and clean and it continues to pte_mkyoung() e) CPU1 calls ptep_set_access_flags() If between (d) and (e) the hardware (another CPU) updates the dirty state (clears PTE_RDONLY), CPU1 will override the PTR_RDONLY bit marking the entry clean again. This patch implements an arm64-specific ptep_set_access_flags() function to perform an atomic update of the PTE flags. 
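
In outline, the problem with the generic implementation is that it
performs a plain store (a sketch; the real arm64 replacement is the asm
sequence in the diff below):

	/* generic ptep_set_access_flags() boils down to: */
	set_pte_at(vma->vm_mm, address, ptep, entry);
	/* ...a plain store that can overwrite a dirty state the hardware
	 * sets between the read and the write; the arm64 version instead
	 * updates *ptep in an ldxr/stxr loop, so a concurrent hardware
	 * clearing of PTE_RDONLY is never lost */
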
Fixes: 2f4b829c625e ("arm64: Add support for hardware updates of the access and dirty pte bits") Signed-off-by: Catalin Marinas Reported-by: Ming Lei Tested-by: Julien Grall Cc: Will Deacon Cc: # 4.3+ [will: reworded comment] Signed-off-by: Will Deacon Change-Id: Id2a0b0d8eb6e7df6325ecb48b88b8401a5dd09e5 (cherry picked from commit 66dbd6e61a526ae7d11a208238ae2c17e5cacb6b) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/pgtable.h | 5 ++++ arch/arm64/mm/fault.c | 50 ++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d48ea8fc789d..0c82a1d9f31e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -594,6 +594,11 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) } #ifdef CONFIG_ARM64_HW_AFDBM +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty); + /* * Atomic pte/pmd modifications. */ diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 44e56de23f79..9bb45de46085 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -81,6 +81,56 @@ void show_pte(struct mm_struct *mm, unsigned long addr) printk("\n"); } +#ifdef CONFIG_ARM64_HW_AFDBM +/* + * This function sets the access flags (dirty, accessed), as well as write + * permission, and only to a more permissive setting. + * + * It needs to cope with hardware update of the accessed/dirty state by other + * agents in the system and can safely skip the __sync_icache_dcache() call as, + * like set_pte_at(), the PTE is never changed from no-exec to exec here. + * + * Returns whether or not the PTE actually changed. + */ +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + pteval_t old_pteval; + unsigned int tmp; + + if (pte_same(*ptep, entry)) + return 0; + + /* only preserve the access flags and write permission */ + pte_val(entry) &= PTE_AF | PTE_WRITE | PTE_DIRTY; + + /* + * PTE_RDONLY is cleared by default in the asm below, so set it in + * back if necessary (read-only or clean PTE). + */ + if (!pte_write(entry) || !dirty) + pte_val(entry) |= PTE_RDONLY; + + /* + * Setting the flags must be done atomically to avoid racing with the + * hardware update of the access/dirty state. + */ + asm volatile("// ptep_set_access_flags\n" + " prfm pstl1strm, %2\n" + "1: ldxr %0, %2\n" + " and %0, %0, %3 // clear PTE_RDONLY\n" + " orr %0, %0, %4 // set flags\n" + " stxr %w1, %0, %2\n" + " cbnz %w1, 1b\n" + : "=&r" (old_pteval), "=&r" (tmp), "+Q" (pte_val(*ptep)) + : "L" (~PTE_RDONLY), "r" (pte_val(entry))); + + flush_tlb_fix_spurious_fault(vma, address); + return 1; +} +#endif + /* * The kernel tried to access some page that wasn't present. 
*/ From 2515432e66e01f632e49069e7c04b969dc13c3d5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 18 Apr 2016 17:09:44 +0200 Subject: [PATCH 0480/3220] UPSTREAM: arm64: introduce mov_q macro to move a constant into a 64-bit register Implement a macro mov_q that can be used to move an immediate constant into a 64-bit register, using between 2 and 4 movz/movk instructions (depending on the operand) Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: I7e6b684e46cad5df79e6b8bc28d72b9e37daedd6 (cherry picked from commit 30b5ba5cf333cc650e474eaf2cc1ae91bc7cf89f) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 70f7b9e04598..9ea846ded55c 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -233,4 +233,24 @@ lr .req x30 // link register .long \sym\()_hi32 .endm + /* + * mov_q - move an immediate constant into a 64-bit register using + * between 2 and 4 movz/movk instructions (depending on the + * magnitude and sign of the operand) + */ + .macro mov_q, reg, val + .if (((\val) >> 31) == 0 || ((\val) >> 31) == 0x1ffffffff) + movz \reg, :abs_g1_s:\val + .else + .if (((\val) >> 47) == 0 || ((\val) >> 47) == 0x1ffff) + movz \reg, :abs_g2_s:\val + .else + movz \reg, :abs_g3:\val + movk \reg, :abs_g2_nc:\val + .endif + movk \reg, :abs_g1_nc:\val + .endif + movk \reg, :abs_g0_nc:\val + .endm + #endif /* __ASM_ASSEMBLER_H */ From 9c8a3a98f3c33eb7665dd5f101f28c46fedb4b75 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Wed, 27 Apr 2016 17:47:00 +0100 Subject: [PATCH 0481/3220] BACKPORT: arm64: Fold proc-macros.S into assembler.h To allow the assembler macros defined in arch/arm64/mm/proc-macros.S to be used outside the mm code move the contents of proc-macros.S to asm/assembler.h. Also, delete proc-macros.S, and fix up all references to proc-macros.S. Signed-off-by: Geoff Levand Acked-by: Pavel Machek [rebased, included dcache_by_line_op] Signed-off-by: James Morse Acked-by: Catalin Marinas Signed-off-by: Will Deacon Change-Id: I09e694442ffd25dcac864216d0369c9727ad0090 (cherry picked from commit 7b7293ae3dbd0a1965bf310b77fed5f9bb37bb93) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 82 ++++++++++++++++++++++++- arch/arm64/mm/cache.S | 2 - arch/arm64/mm/proc-macros.S | 98 ------------------------------ arch/arm64/mm/proc.S | 3 - 4 files changed, 81 insertions(+), 104 deletions(-) delete mode 100644 arch/arm64/mm/proc-macros.S diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 9ea846ded55c..33bbb7ba7fb5 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -1,5 +1,5 @@ /* - * Based on arch/arm/include/asm/assembler.h + * Based on arch/arm/include/asm/assembler.h, arch/arm/mm/proc-macros.S * * Copyright (C) 1996-2000 Russell King * Copyright (C) 2012 ARM Ltd. 
@@ -23,6 +23,8 @@ #ifndef __ASM_ASSEMBLER_H #define __ASM_ASSEMBLER_H +#include +#include #include #include @@ -211,6 +213,84 @@ lr .req x30 // link register add \reg, \reg, \tmp .endm +/* + * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm) + */ + .macro vma_vm_mm, rd, rn + ldr \rd, [\rn, #VMA_VM_MM] + .endm + +/* + * mmid - get context id from mm pointer (mm->context.id) + */ + .macro mmid, rd, rn + ldr \rd, [\rn, #MM_CONTEXT_ID] + .endm + +/* + * dcache_line_size - get the minimum D-cache line size from the CTR register. + */ + .macro dcache_line_size, reg, tmp + mrs \tmp, ctr_el0 // read CTR + ubfm \tmp, \tmp, #16, #19 // cache line size encoding + mov \reg, #4 // bytes per word + lsl \reg, \reg, \tmp // actual cache line size + .endm + +/* + * icache_line_size - get the minimum I-cache line size from the CTR register. + */ + .macro icache_line_size, reg, tmp + mrs \tmp, ctr_el0 // read CTR + and \tmp, \tmp, #0xf // cache line size encoding + mov \reg, #4 // bytes per word + lsl \reg, \reg, \tmp // actual cache line size + .endm + +/* + * tcr_set_idmap_t0sz - update TCR.T0SZ so that we can load the ID map + */ + .macro tcr_set_idmap_t0sz, valreg, tmpreg +#ifndef CONFIG_ARM64_VA_BITS_48 + ldr_l \tmpreg, idmap_t0sz + bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH +#endif + .endm + +/* + * Macro to perform a data cache maintenance for the interval + * [kaddr, kaddr + size) + * + * op: operation passed to dc instruction + * domain: domain used in dsb instruciton + * kaddr: starting virtual address of the region + * size: size of the region + * Corrupts: kaddr, size, tmp1, tmp2 + */ + .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 + dcache_line_size \tmp1, \tmp2 + add \size, \kaddr, \size + sub \tmp2, \tmp1, #1 + bic \kaddr, \kaddr, \tmp2 +9998: dc \op, \kaddr + add \kaddr, \kaddr, \tmp1 + cmp \kaddr, \size + b.lo 9998b + dsb \domain + .endm + +/* + * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present + */ + .macro reset_pmuserenr_el0, tmpreg + mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer + sbfx \tmpreg, \tmpreg, #8, #4 + cmp \tmpreg, #1 // Skip if no PMU present + b.lt 9000f + msr pmuserenr_el0, xzr // Disable PMU access from EL0 +9000: + .endm + /* * Annotate a function as position independent, i.e., safe to be called before * the kernel virtual mapping is activated. diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 6df07069a025..50ff9ba3a236 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -24,8 +24,6 @@ #include #include -#include "proc-macros.S" - /* * flush_icache_range(start,end) * diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S deleted file mode 100644 index 984edcda1850..000000000000 --- a/arch/arm64/mm/proc-macros.S +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Based on arch/arm/mm/proc-macros.S - * - * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- */ - -#include -#include - -/* - * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm) - */ - .macro vma_vm_mm, rd, rn - ldr \rd, [\rn, #VMA_VM_MM] - .endm - -/* - * mmid - get context id from mm pointer (mm->context.id) - */ - .macro mmid, rd, rn - ldr \rd, [\rn, #MM_CONTEXT_ID] - .endm - -/* - * dcache_line_size - get the minimum D-cache line size from the CTR register. - */ - .macro dcache_line_size, reg, tmp - mrs \tmp, ctr_el0 // read CTR - ubfm \tmp, \tmp, #16, #19 // cache line size encoding - mov \reg, #4 // bytes per word - lsl \reg, \reg, \tmp // actual cache line size - .endm - -/* - * icache_line_size - get the minimum I-cache line size from the CTR register. - */ - .macro icache_line_size, reg, tmp - mrs \tmp, ctr_el0 // read CTR - and \tmp, \tmp, #0xf // cache line size encoding - mov \reg, #4 // bytes per word - lsl \reg, \reg, \tmp // actual cache line size - .endm - -/* - * tcr_set_idmap_t0sz - update TCR.T0SZ so that we can load the ID map - */ - .macro tcr_set_idmap_t0sz, valreg, tmpreg -#ifndef CONFIG_ARM64_VA_BITS_48 - ldr_l \tmpreg, idmap_t0sz - bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH -#endif - .endm - -/* - * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present - */ - .macro reset_pmuserenr_el0, tmpreg - mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer - sbfx \tmpreg, \tmpreg, #8, #4 - cmp \tmpreg, #1 // Skip if no PMU present - b.lt 9000f - msr pmuserenr_el0, xzr // Disable PMU access from EL0 -9000: - .endm - -/* - * Macro to perform a data cache maintenance for the interval - * [kaddr, kaddr + size) - * - * op: operation passed to dc instruction - * domain: domain used in dsb instruciton - * kaddr: starting virtual address of the region - * size: size of the region - * Corrupts: kaddr, size, tmp1, tmp2 - */ - .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 - dcache_line_size \tmp1, \tmp2 - add \size, \kaddr, \size - sub \tmp2, \tmp1, #1 - bic \kaddr, \kaddr, \tmp2 -9998: dc \op, \kaddr - add \kaddr, \kaddr, \tmp1 - cmp \kaddr, \size - b.lo 9998b - dsb \domain - .endm diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 543f5198005a..9f6deacf41d2 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -23,13 +23,10 @@ #include #include #include -#include #include #include #include -#include "proc-macros.S" - #ifdef CONFIG_ARM64_64K_PAGES #define TCR_TG_FLAGS TCR_TG0_64K | TCR_TG1_64K #elif defined(CONFIG_ARM64_16K_PAGES) From fdc4c4805c5afb51fdd0e03ee55281d9b10f25c1 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 13 Jun 2016 11:15:14 +0100 Subject: [PATCH 0482/3220] UPSTREAM: arm64: fix dump_instr when PAN and UAO are in use If the kernel is set to show unhandled signals, and a user task does not handle a SIGILL as a result of an instruction abort, we will attempt to log the offending instruction with dump_instr before killing the task. We use dump_instr to log the encoding of the offending userspace instruction. However, dump_instr is also used to dump instructions from kernel space, and internally always switches to KERNEL_DS before dumping the instruction with get_user. When both PAN and UAO are in use, reading a user instruction via get_user while in KERNEL_DS will result in a permission fault, which leads to an Oops. As we have regs corresponding to the context of the original instruction abort, we can inspect this and only flip to KERNEL_DS if the original abort was taken from the kernel, avoiding this issue. 
At the same time, remove the redundant (and incorrect) comments regarding the order dump_mem and dump_instr are called in. Cc: Catalin Marinas Cc: James Morse Cc: Robin Murphy Cc: #4.6+ Signed-off-by: Mark Rutland Reported-by: Vladimir Murzin Tested-by: Vladimir Murzin Fixes: 57f4959bad0a154a ("arm64: kernel: Add support for User Access Override") Signed-off-by: Will Deacon Change-Id: I54c00f3598d227a7e2767b357cb453075dcce7bd (cherry picked from commit c5cea06be060f38e5400d796e61cfc8c36e52924) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/traps.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index c5392081b49b..58651a9dfcf8 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -64,8 +64,7 @@ static void dump_mem(const char *lvl, const char *str, unsigned long bottom, /* * We need to switch to kernel mode so that we can use __get_user - * to safely read from kernel space. Note that we now dump the - * code first, just in case the backtrace kills us. + * to safely read from kernel space. */ fs = get_fs(); set_fs(KERNEL_DS); @@ -111,21 +110,12 @@ static void dump_backtrace_entry(unsigned long where) print_ip_sym(where); } -static void dump_instr(const char *lvl, struct pt_regs *regs) +static void __dump_instr(const char *lvl, struct pt_regs *regs) { unsigned long addr = instruction_pointer(regs); - mm_segment_t fs; char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; int i; - /* - * We need to switch to kernel mode so that we can use __get_user - * to safely read from kernel space. Note that we now dump the - * code first, just in case the backtrace kills us. - */ - fs = get_fs(); - set_fs(KERNEL_DS); - for (i = -4; i < 1; i++) { unsigned int val, bad; @@ -139,8 +129,18 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) } } printk("%sCode: %s\n", lvl, str); +} - set_fs(fs); +static void dump_instr(const char *lvl, struct pt_regs *regs) +{ + if (!user_mode(regs)) { + mm_segment_t fs = get_fs(); + set_fs(KERNEL_DS); + __dump_instr(lvl, regs); + set_fs(fs); + } else { + __dump_instr(lvl, regs); + } } static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) From 916a99af4c1bf19207dd4589b1704ab144407c77 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 13 Jun 2016 17:57:02 +0100 Subject: [PATCH 0483/3220] UPSTREAM: arm64: mm: mark fault_info table const Unlike the debug_fault_info table, we never intentionally alter the fault_info table at runtime, and all derived pointers are treated as const currently. Make the table const so that it can be placed in .rodata and protected from unintentional writes, as we do for the syscall tables. 
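
The effect in miniature (a standalone sketch, not the kernel's actual
table):

	struct fault_info { int sig; const char *name; };

	/* 'const' plus static initialisation lets the compiler emit the
	 * table into .rodata, so a stray write through a dangling pointer
	 * faults instead of silently corrupting fault dispatch */
	static const struct fault_info fault_info[] = {
		{ 11, "page fault" },
	};
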
Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Will Deacon Change-Id: I3fb0bb55427835c165cc377d8dc2a3fa9e6e950d (cherry picked from commit bbb1681ee3653bdcfc6a4ba31902738118311fd4) Signed-off-by: Sami Tolvanen --- arch/arm64/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 9bb45de46085..b5773944d1d3 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -438,7 +438,7 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) return 1; } -static struct fault_info { +static const struct fault_info { int (*fn)(unsigned long addr, unsigned int esr, struct pt_regs *regs); int sig; int code; From 32054524cb03d884215a6051ad8c21077c241250 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 31 May 2016 12:33:01 +0100 Subject: [PATCH 0484/3220] UPSTREAM: arm64: add macro to extract ESR_ELx.EC Several places open-code extraction of the EC field from an ESR_ELx value, in subtly different ways. This is unfortunate duplication and variation, and the precise logic used to extract the field is a distraction. This patch adds a new macro, ESR_ELx_EC(), to extract the EC field from an ESR_ELx value in a consistent fashion. Existing open-coded extractions in core arm64 code are moved over to the new helper. KVM code is left as-is for the moment. Signed-off-by: Mark Rutland Tested-by: Huang Shijie Cc: Dave P Martin Cc: James Morse Cc: Marc Zyngier Cc: Will Deacon Signed-off-by: Catalin Marinas Change-Id: Ib634a4795277d243fce5dd30b139e2ec1465bee9 (cherry picked from commit 275f344bec51e9100bae81f3cc8c6940bbfb24c0) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/esr.h | 1 + arch/arm64/kernel/traps.c | 2 +- arch/arm64/mm/fault.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 77eeb2cc648f..f772e15c4766 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -74,6 +74,7 @@ #define ESR_ELx_EC_SHIFT (26) #define ESR_ELx_EC_MASK (UL(0x3F) << ESR_ELx_EC_SHIFT) +#define ESR_ELx_EC(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) #define ESR_ELx_IL (UL(1) << 25) #define ESR_ELx_ISS_MASK (ESR_ELx_IL - 1) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 58651a9dfcf8..29d7b68f63f0 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -465,7 +465,7 @@ static const char *esr_class_str[] = { const char *esr_get_class_string(u32 esr) { - return esr_class_str[esr >> ESR_ELx_EC_SHIFT]; + return esr_class_str[ESR_ELx_EC(esr)]; } /* diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index b5773944d1d3..59b8e1db6338 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -244,7 +244,7 @@ out: static inline int permission_fault(unsigned int esr) { - unsigned int ec = (esr & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT; + unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM); From b8f39d900e1581dd99788c1e38989b93d114dced Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 31 May 2016 12:33:03 +0100 Subject: [PATCH 0485/3220] UPSTREAM: arm64: kill ESR_LNX_EXEC Currently we treat ESR_EL1 bit 24 as software-defined for distinguishing instruction aborts from data aborts, but this bit is architecturally RES0 for instruction aborts, and could be allocated for an arbitrary purpose in future. 
Additionally, we hard-code the value in entry.S without the mnemonic, making the code difficult to understand. Instead, remove ESR_LNX_EXEC, and distinguish aborts based on the esr, which we already pass to the sole use of ESR_LNX_EXEC. A new helper, is_el0_instruction_abort() is added to make the logic clear. Any instruction aborts taken from EL1 will already have been handled by bad_mode, so we need not handle that case in the helper. For consistency, the existing permission_fault helper is renamed to is_permission_fault, and the return type is changed to bool. There should be no functional changes as the return value was a boolean expression, and the result is only used in another boolean expression. Signed-off-by: Mark Rutland Cc: Dave P Martin Cc: Huang Shijie Cc: James Morse Cc: Marc Zyngier Cc: Will Deacon Signed-off-by: Catalin Marinas Change-Id: Iaf66fa5f3b13cf985b11a3b0a40c4333fe9ef833 (cherry picked from commit 541ec870ef31433018d245614254bd9d810a9ac3) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/entry.S | 2 +- arch/arm64/mm/fault.c | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 12e8d2bcb3f9..eefffa81c6df 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -532,7 +532,7 @@ el0_ia: enable_dbg_and_irq ct_user_exit mov x0, x26 - orr x1, x25, #1 << 24 // use reserved ISS bit for instruction aborts + mov x1, x25 mov x2, sp bl do_mem_abort b ret_to_user diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 59b8e1db6338..56c6bfbd19b4 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -202,8 +202,6 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re #define VM_FAULT_BADMAP 0x010000 #define VM_FAULT_BADACCESS 0x020000 -#define ESR_LNX_EXEC (1 << 24) - static int __do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int mm_flags, unsigned long vm_flags, struct task_struct *tsk) @@ -242,7 +240,7 @@ out: return fault; } -static inline int permission_fault(unsigned int esr) +static inline bool is_permission_fault(unsigned int esr) { unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; @@ -250,6 +248,11 @@ static inline int permission_fault(unsigned int esr) return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM); } +static bool is_el0_instruction_abort(unsigned int esr) +{ + return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW; +} + static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs) { @@ -276,14 +279,14 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (user_mode(regs)) mm_flags |= FAULT_FLAG_USER; - if (esr & ESR_LNX_EXEC) { + if (is_el0_instruction_abort(esr)) { vm_flags = VM_EXEC; } else if ((esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM)) { vm_flags = VM_WRITE; mm_flags |= FAULT_FLAG_WRITE; } - if (permission_fault(esr) && (addr < USER_DS)) { + if (is_permission_fault(esr) && (addr < USER_DS)) { if (get_fs() == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); From 9887915a4ecf3697d056f01f8e7fe09c853f93bb Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Wed, 27 Apr 2016 17:47:10 +0100 Subject: [PATCH 0486/3220] UPSTREAM: arm64: Add new asm macro copy_page Kexec and hibernate need to copy pages of memory, but may not have all of the kernel mapped, and are unable to call copy_page(). Add a simplistic copy_page() macro, that can be inlined in these situations. 
lib/copy_page.S provides a bigger better version, but uses more registers. Signed-off-by: Geoff Levand [Changed asm label to 9998, added commit message] Signed-off-by: James Morse Acked-by: Catalin Marinas Signed-off-by: Will Deacon Change-Id: If23a454e211b1f57f8ba1a2a00b44dabf4b82932 (cherry picked from commit 5003dbde45961dd7ab3d8a09ab9ad8bcb604db40) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 33bbb7ba7fb5..290e13428f4a 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -24,6 +24,7 @@ #define __ASM_ASSEMBLER_H #include +#include #include #include #include @@ -291,6 +292,24 @@ lr .req x30 // link register 9000: .endm +/* + * copy_page - copy src to dest using temp registers t1-t8 + */ + .macro copy_page dest:req src:req t1:req t2:req t3:req t4:req t5:req t6:req t7:req t8:req +9998: ldp \t1, \t2, [\src] + ldp \t3, \t4, [\src, #16] + ldp \t5, \t6, [\src, #32] + ldp \t7, \t8, [\src, #48] + add \src, \src, #64 + stnp \t1, \t2, [\dest] + stnp \t3, \t4, [\dest, #16] + stnp \t5, \t6, [\dest, #32] + stnp \t7, \t8, [\dest, #48] + add \dest, \dest, #64 + tst \src, #(PAGE_SIZE - 1) + b.ne 9998b + .endm + /* * Annotate a function as position independent, i.e., safe to be called before * the kernel virtual mapping is activated. From 1d8922da229f3b2b4b6a9e75ad7dd75f862462bb Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Tue, 28 Jun 2016 18:07:27 +0100 Subject: [PATCH 0487/3220] UPSTREAM: Revert "arm64: alternatives: add enable parameter to conditional asm macros" Commit 77ee306c0aea9 ("arm64: alternatives: add enable parameter to conditional asm macros") extended the alternative assembly macros. Unfortunately this does not really work as one would expect, as the enable parameter in fact correctly protects the alternative section magic, but not the actual code sequences. This results in having both the original instruction(s) _and_ the alternative ones, if enable if false. Since there is no user of this macros anyway, just revert it. This reverts commit 77ee306c0aea9a219daec256ad25982944affef8. Signed-off-by: Andre Przywara Signed-off-by: Catalin Marinas Change-Id: I608104891335dfa2dacdb364754ae2658088ddf2 (cherry picked from commit b82bfa4793cd0f8fde49b85e0ad66906682e7447) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/alternative.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index beccbdefa106..7288071195e1 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -95,13 +95,11 @@ void apply_alternatives(void *start, size_t length); * The code that follows this macro will be assembled and linked as * normal. There are no restrictions on this code. */ -.macro alternative_if_not cap, enable = 1 - .if \enable +.macro alternative_if_not cap .pushsection .altinstructions, "a" altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f .popsection 661: - .endif .endm /* @@ -118,22 +116,18 @@ void apply_alternatives(void *start, size_t length); * alternative sequence it is defined in (branches into an * alternative sequence are not fixed up). */ -.macro alternative_else, enable = 1 - .if \enable +.macro alternative_else 662: .pushsection .altinstr_replacement, "ax" 663: - .endif .endm /* * Complete an alternative code sequence. 
  */
-.macro alternative_endif, enable = 1
-	.if \enable
+.macro alternative_endif
 664:	.popsection
 	.org	. - (664b-663b) + (662b-661b)
 	.org	. - (662b-661b) + (664b-663b)
-	.endif
 	.endm
 
 #define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...)	\

From 405893f77e25abf7584f8ea65567c032849e41dd Mon Sep 17 00:00:00 2001
From: Andre Przywara
Date: Tue, 28 Jun 2016 18:07:28 +0100
Subject: [PATCH 0488/3220] UPSTREAM: arm64: fix "dc cvau" cache operation on
 errata-affected core

The ARM errata 819472, 826319, 827319 and 824069 for affected
Cortex-A53 cores require promoting "dc cvau" instructions to
"dc civac" as well.
Attribute the usage of the instruction in __flush_cache_user_range
to also be covered by our alternative patching efforts.
For that we introduce an assembly macro which deals with alternatives
while still tagging the instructions as USER.

Signed-off-by: Andre Przywara
Signed-off-by: Catalin Marinas
Change-Id: If5e7933ba32331b2aa28fc5d9e019649452f0f6c
(cherry picked from commit 290622efc76ece22ef76a30bf117755891ab27f6)
Signed-off-by: Sami Tolvanen
---
 arch/arm64/include/asm/alternative.h | 4 ++++
 arch/arm64/mm/cache.S                | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
index 7288071195e1..8746ff6abd77 100644
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -133,6 +133,10 @@ void apply_alternatives(void *start, size_t length);
 #define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...)	\
 	alternative_insn insn1, insn2, cap, IS_ENABLED(cfg)
 
+.macro user_alt, label, oldinstr, newinstr, cond
+9999:	alternative_insn "\oldinstr", "\newinstr", \cond
+	_ASM_EXTABLE 9999b, \label
+.endm
 
 /*
  * Generate the assembly for UAO alternatives with exception table entries.
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 50ff9ba3a236..07d7352d7c38 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -52,7 +52,7 @@ ENTRY(__flush_cache_user_range)
 	sub	x3, x2, #1
 	bic	x4, x0, x3
 1:
-USER(9f, dc	cvau, x4	)	// clean D line to PoU
+user_alt 9f, "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE
 	add	x4, x4, x2
 	cmp	x4, x1
 	b.lo	1b

From 76440ecb4dfc5f76582199a32b6c33ee9f0aa221 Mon Sep 17 00:00:00 2001
From: Andre Przywara
Date: Tue, 28 Jun 2016 18:07:29 +0100
Subject: [PATCH 0489/3220] UPSTREAM: arm64: include alternative handling in
 dcache_by_line_op

The newly introduced dcache_by_line_op macro is used in at least one
place at the moment to issue a "dc cvau" instruction, which is affected
by ARM errata 819472, 826319, 827319 and 824069.
Change the macro to allow for alternative patching in there to protect
affected Cortex-A53 cores.
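
For a dcache_by_line_op user the net effect is (a sketch; the operand
registers are illustrative):

	dcache_by_line_op cvau, ish, x0, x1, x2, x3
	// unaffected cores execute the default:   dc cvau, x0
	// affected Cortex-A53s are patched to:    dc civac, x0
	// selected at boot when the ARM64_WORKAROUND_CLEAN_CACHE
	// alternative is applied
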
Signed-off-by: Andre Przywara [catalin.marinas@arm.com: indentation fixups] Signed-off-by: Catalin Marinas Change-Id: I450594dc311b09b6b832b707a9abb357608cc6e4 (cherry picked from commit 823066d9edcdfe4cedb06216c2b1f91efaf68a87) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 290e13428f4a..9e8ac1e73457 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -24,6 +24,7 @@ #define __ASM_ASSEMBLER_H #include +#include #include #include #include @@ -273,7 +274,16 @@ lr .req x30 // link register add \size, \kaddr, \size sub \tmp2, \tmp1, #1 bic \kaddr, \kaddr, \tmp2 -9998: dc \op, \kaddr +9998: + .if (\op == cvau || \op == cvac) +alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE + dc \op, \kaddr +alternative_else + dc civac, \kaddr +alternative_endif + .else + dc \op, \kaddr + .endif add \kaddr, \kaddr, \tmp1 cmp \kaddr, \size b.lo 9998b From 4caec81662608819e904586b172abc85945b4d65 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 20 Jun 2016 18:28:01 +0100 Subject: [PATCH 0490/3220] BACKPORT: arm64: kernel: Save and restore UAO and addr_limit on exception entry If we take an exception while at EL1, the exception handler inherits the original context's addr_limit and PSTATE.UAO values. To be consistent always reset addr_limit and PSTATE.UAO on (re-)entry to EL1. This prevents accidental re-use of the original context's addr_limit. Based on a similar patch for arm from Russell King. Cc: # 4.6- Acked-by: Will Deacon Reviewed-by: Mark Rutland Signed-off-by: James Morse Signed-off-by: Will Deacon Change-Id: Iab453201c6e08bc6e22500b7c5570dd0fe2d1b74 (cherry picked from commit e19a6ee2460bdd0d0055a6029383422773f9999a) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/ptrace.h | 2 ++ arch/arm64/kernel/asm-offsets.c | 1 + arch/arm64/kernel/entry.S | 19 +++++++++++++++++-- arch/arm64/mm/fault.c | 3 ++- 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index e9e5467e0bf4..200d5c32b38f 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -116,6 +116,8 @@ struct pt_regs { }; u64 orig_x0; u64 syscallno; + u64 orig_addr_limit; + u64 unused; // maintain 16 byte alignment }; #define arch_has_single_step() (1) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 25de8b244961..087cf9a65359 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -58,6 +58,7 @@ int main(void) DEFINE(S_PC, offsetof(struct pt_regs, pc)); DEFINE(S_ORIG_X0, offsetof(struct pt_regs, orig_x0)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); + DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit)); DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); BLANK(); DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index eefffa81c6df..9fabed5dd2bf 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -97,7 +98,14 @@ mov x29, xzr // fp pointed to user-space .else add x21, sp, #S_FRAME_SIZE - .endif + get_thread_info tsk + /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */ + ldr x20, [tsk, #TI_ADDR_LIMIT] + str x20, [sp, #S_ORIG_ADDR_LIMIT] 
+ mov x20, #TASK_SIZE_64 + str x20, [tsk, #TI_ADDR_LIMIT] + ALTERNATIVE(nop, SET_PSTATE_UAO(0), ARM64_HAS_UAO, CONFIG_ARM64_UAO) + .endif /* \el == 0 */ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] @@ -128,6 +136,14 @@ .endm .macro kernel_exit, el + .if \el != 0 + /* Restore the task's original addr_limit. */ + ldr x20, [sp, #S_ORIG_ADDR_LIMIT] + str x20, [tsk, #TI_ADDR_LIMIT] + + /* No need to restore UAO, it will be restored from SPSR_EL1 */ + .endif + ldp x21, x22, [sp, #S_PC] // load ELR, SPSR .if \el == 0 ct_user_enter @@ -406,7 +422,6 @@ el1_irq: bl trace_hardirqs_off #endif - get_thread_info tsk irq_handler #ifdef CONFIG_PREEMPT diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 56c6bfbd19b4..03d44388ae7b 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -287,7 +287,8 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, } if (is_permission_fault(esr) && (addr < USER_DS)) { - if (get_fs() == KERNEL_DS) + /* regs->orig_addr_limit may be 0 if we entered from EL0 */ + if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); if (!search_exception_tables(regs->pc)) From d907eb252977469ed9bded38bd76c2212065c2cf Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 9 Aug 2016 18:25:26 -0700 Subject: [PATCH 0491/3220] UPSTREAM: arm64: Handle el1 synchronous instruction aborts cleanly Executing from a non-executable area gives an ugly message: lkdtm: Performing direct entry EXEC_RODATA lkdtm: attempting ok execution at ffff0000084c0e08 lkdtm: attempting bad execution at ffff000008880700 Bad mode in Synchronous Abort handler detected on CPU2, code 0x8400000e -- IABT (current EL) CPU: 2 PID: 998 Comm: sh Not tainted 4.7.0-rc2+ #13 Hardware name: linux,dummy-virt (DT) task: ffff800077e35780 ti: ffff800077970000 task.ti: ffff800077970000 PC is at lkdtm_rodata_do_nothing+0x0/0x8 LR is at execute_location+0x74/0x88 The 'IABT (current EL)' indicates the error but it's a bit cryptic without knowledge of the ARM ARM. There is also no indication of the specific address which triggered the fault. The increase in kernel page permissions makes hitting this case more likely as well. 
Handling the case in the vectors gives a much more familiar looking error message: lkdtm: Performing direct entry EXEC_RODATA lkdtm: attempting ok execution at ffff0000084c0840 lkdtm: attempting bad execution at ffff000008880680 Unable to handle kernel paging request at virtual address ffff000008880680 pgd = ffff8000089b2000 [ffff000008880680] *pgd=00000000489b4003, *pud=0000000048904003, *pmd=0000000000000000 Internal error: Oops: 8400000e [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 997 Comm: sh Not tainted 4.7.0-rc1+ #24 Hardware name: linux,dummy-virt (DT) task: ffff800077f9f080 ti: ffff800008a1c000 task.ti: ffff800008a1c000 PC is at lkdtm_rodata_do_nothing+0x0/0x8 LR is at execute_location+0x74/0x88 Acked-by: Mark Rutland Signed-off-by: Laura Abbott Signed-off-by: Catalin Marinas Change-Id: Ifba74589ba2cf05b28335d4fd3e3140ef73668db (cherry picked from commit 9adeb8e72dbfe976709df01e259ed556ee60e779) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/entry.S | 7 +++++++ arch/arm64/mm/fault.c | 14 ++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 9fabed5dd2bf..533e1c9fd5a6 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -352,6 +352,8 @@ el1_sync: lsr x24, x1, #ESR_ELx_EC_SHIFT // exception class cmp x24, #ESR_ELx_EC_DABT_CUR // data abort in EL1 b.eq el1_da + cmp x24, #ESR_ELx_EC_IABT_CUR // instruction abort in EL1 + b.eq el1_ia cmp x24, #ESR_ELx_EC_SYS64 // configurable trap b.eq el1_undef cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception @@ -363,6 +365,11 @@ el1_sync: cmp x24, #ESR_ELx_EC_BREAKPT_CUR // debug exception in EL1 b.ge el1_dbg b el1_inv + +el1_ia: + /* + * Fall through to the Data abort case + */ el1_da: /* * Data abort handling diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 03d44388ae7b..e328430dbcec 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -131,6 +131,11 @@ int ptep_set_access_flags(struct vm_area_struct *vma, } #endif +static bool is_el1_instruction_abort(unsigned int esr) +{ + return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR; +} + /* * The kernel tried to access some page that wasn't present. */ @@ -139,8 +144,9 @@ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, { /* * Are we prepared to handle this kernel fault? + * We are almost certainly not prepared to handle instruction faults. 
*/ - if (fixup_exception(regs)) + if (!is_el1_instruction_abort(esr) && fixup_exception(regs)) return; /* @@ -245,7 +251,8 @@ static inline bool is_permission_fault(unsigned int esr) unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM); + return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) || + (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM); } static bool is_el0_instruction_abort(unsigned int esr) @@ -291,6 +298,9 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); + if (is_el1_instruction_abort(esr)) + die("Attempting to execute userspace memory", regs, esr); + if (!search_exception_tables(regs->pc)) die("Accessing user space memory outside uaccess.h routines", regs, esr); } From 23368b642deb01ac6ce668ec1dedfcc0cab25c71 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 14:58:21 +0100 Subject: [PATCH 0492/3220] FROMLIST: arm64: Factor out PAN enabling/disabling into separate uaccess_* macros This patch moves the directly coded alternatives for turning PAN on/off into separate uaccess_{enable,disable} macros or functions. The asm macros take a few arguments which will be used in subsequent patches. Note that any (unlikely) access that the compiler might generate between uaccess_enable() and uaccess_disable(), other than those explicitly specified by the user access code, will not be protected by PAN. Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: Ic3fddd706400c8798f57456c56361d84d234f6ef (cherry picked from commit a4820644c627b82cbc865f2425bb788c94743b16) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/futex.h | 14 ++--- arch/arm64/include/asm/uaccess.h | 79 +++++++++++++++++++++++++--- arch/arm64/kernel/armv8_deprecated.c | 10 ++-- arch/arm64/lib/clear_user.S | 8 ++- arch/arm64/lib/copy_from_user.S | 8 ++- arch/arm64/lib/copy_in_user.S | 8 ++- arch/arm64/lib/copy_to_user.S | 8 ++- 7 files changed, 95 insertions(+), 40 deletions(-) diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index f2585cdd32c2..71dfa3b42313 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -27,9 +27,9 @@ #include #define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \ +do { \ + uaccess_enable(); \ asm volatile( \ - ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ " prfm pstl1strm, %2\n" \ "1: ldxr %w1, %2\n" \ insn "\n" \ @@ -44,11 +44,11 @@ " .popsection\n" \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ - ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \ : "r" (oparg), "Ir" (-EFAULT) \ - : "memory") + : "memory"); \ + uaccess_disable(); \ +} while (0) static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) @@ -118,8 +118,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; + uaccess_enable(); asm volatile("// futex_atomic_cmpxchg_inatomic\n" -ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" " sub %w3, %w1, %w4\n" @@ -134,10 +134,10 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " .popsection\n" _ASM_EXTABLE(1b, 
4b) _ASM_EXTABLE(2b, 4b) -ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) : "memory"); + uaccess_disable(); *uval = val; return ret; diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index c3d445b42351..c8ef22a9a83b 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -18,6 +18,8 @@ #ifndef __ASM_UACCESS_H #define __ASM_UACCESS_H +#ifndef __ASSEMBLY__ + /* * User space memory access functions */ @@ -123,6 +125,44 @@ static inline void set_fs(mm_segment_t fs) " .long (" #from " - .), (" #to " - .)\n" \ " .popsection\n" +/* + * User access enabling/disabling. + */ +#define __uaccess_disable(alt) \ +do { \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ + CONFIG_ARM64_PAN)); \ +} while (0) + +#define __uaccess_enable(alt) \ +do { \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ + CONFIG_ARM64_PAN)); \ +} while (0) + +static inline void uaccess_disable(void) +{ + __uaccess_disable(ARM64_HAS_PAN); +} + +static inline void uaccess_enable(void) +{ + __uaccess_enable(ARM64_HAS_PAN); +} + +/* + * These functions are no-ops when UAO is present. + */ +static inline void uaccess_disable_not_uao(void) +{ + __uaccess_disable(ARM64_ALT_PAN_NOT_UAO); +} + +static inline void uaccess_enable_not_uao(void) +{ + __uaccess_enable(ARM64_ALT_PAN_NOT_UAO); +} + /* * The "__xxx" versions of the user access functions do not verify the address * space - it must have been done previously with a separate "access_ok()" @@ -150,8 +190,7 @@ static inline void set_fs(mm_segment_t fs) do { \ unsigned long __gu_val; \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ @@ -172,9 +211,8 @@ do { \ default: \ BUILD_BUG(); \ } \ + uaccess_disable_not_uao(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ } while (0) #define __get_user(x, ptr) \ @@ -219,8 +257,7 @@ do { \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ @@ -241,8 +278,7 @@ do { \ default: \ BUILD_BUG(); \ } \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_disable_not_uao(); \ } while (0) #define __put_user(x, ptr) \ @@ -327,4 +363,31 @@ extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strlen_user(const char __user *str); extern __must_check long strnlen_user(const char __user *str, long n); +#else /* __ASSEMBLY__ */ + +#include +#include + +/* + * User access enabling/disabling macros. These are no-ops when UAO is + * present. 
+ */ + .macro uaccess_disable_not_uao, tmp1 +alternative_if_not ARM64_ALT_PAN_NOT_UAO + nop +alternative_else + SET_PSTATE_PAN(1) +alternative_endif + .endm + + .macro uaccess_enable_not_uao, tmp1, tmp2 +alternative_if_not ARM64_ALT_PAN_NOT_UAO + nop +alternative_else + SET_PSTATE_PAN(0) +alternative_endif + .endm + +#endif /* __ASSEMBLY__ */ + #endif /* __ASM_UACCESS_H */ diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index c37202c0c838..562f7ddb158b 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -281,9 +281,9 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) * Error-checking SWP macros implemented using ldxr{b}/stxr{b} */ #define __user_swpX_asm(data, addr, res, temp, B) \ +do { \ + uaccess_enable(); \ __asm__ __volatile__( \ - ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ "0: ldxr"B" %w2, [%3]\n" \ "1: stxr"B" %w0, %w1, [%3]\n" \ " cbz %w0, 2f\n" \ @@ -299,11 +299,11 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) " .popsection" \ _ASM_EXTABLE(0b, 4b) \ _ASM_EXTABLE(1b, 4b) \ - ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ : "=&r" (res), "+r" (data), "=&r" (temp) \ : "r" (addr), "i" (-EAGAIN), "i" (-EFAULT) \ - : "memory") + : "memory"); \ + uaccess_disable(); \ +} while (0) #define __user_swp_asm(data, addr, res, temp) \ __user_swpX_asm(data, addr, res, temp, "") diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 5d1cad3ce6d6..08b5f18ba604 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -17,10 +17,10 @@ */ #include -#include #include #include #include +#include .text @@ -33,8 +33,7 @@ * Alignment fixed up by hardware. 
*/ ENTRY(__clear_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x2, x3 mov x2, x1 // save the size for fixup return subs x1, x1, #8 b.mi 2f @@ -54,8 +53,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 b.mi 5f uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x2 ret ENDPROC(__clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 0b90497d4424..6505ec81f1da 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -16,11 +16,11 @@ #include -#include #include #include #include #include +#include /* * Copy from user space to a kernel buffer (alignment handled by the hardware) @@ -67,12 +67,10 @@ end .req x5 ENTRY(__arch_copy_from_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 // Nothing to copy ret ENDPROC(__arch_copy_from_user) diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index f7292dd08c84..9b04ff3ab610 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -18,11 +18,11 @@ #include -#include #include #include #include #include +#include /* * Copy from user space to user space (alignment handled by the hardware) @@ -68,12 +68,10 @@ end .req x5 ENTRY(__copy_in_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 ret ENDPROC(__copy_in_user) diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 7a7efe255034..8077e4f34d56 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -16,11 +16,11 @@ #include -#include #include #include #include #include +#include /* * Copy to user space from a kernel buffer (alignment handled by the hardware) @@ -66,12 +66,10 @@ end .req x5 ENTRY(__arch_copy_to_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 ret ENDPROC(__arch_copy_to_user) From 3b66929169de053042d47e482dd5748794756153 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 15:48:55 +0100 Subject: [PATCH 0493/3220] FROMLIST: arm64: Factor out TTBR0_EL1 post-update workaround into a specific asm macro This patch takes the errata workaround code out of cpu_do_switch_mm into a dedicated post_ttbr0_update_workaround macro which will be reused in a subsequent patch. 
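For orientation, the sequence the new macro encapsulates looks as follows as a C-level sketch. This is illustrative only: the real implementation is the assembly alternative in the diff below, and cpus_have_cap() stands in here for the boot-time alternatives patching.

/*
 * Sketch of post_ttbr0_update_workaround; names follow the diff,
 * but this function is not part of the patch.
 */
static inline void post_ttbr0_update_workaround_sketch(void)
{
	if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456) &&
	    cpus_have_cap(ARM64_WORKAROUND_CAVIUM_27456))
		asm volatile(
		"	ic	iallu\n"	/* invalidate the I-cache */
		"	dsb	nsh\n"		/* wait for the invalidation */
		"	isb\n"			/* resynchronize the pipeline */
		::: "memory");
}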
Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: I69f94e4c41046bd52ca9340b72d97bfcf955b586 (cherry picked from commit 4398e6a1644373a4c2f535f4153c8378d0914630) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 17 +++++++++++++++++ arch/arm64/mm/proc.S | 11 +---------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 9e8ac1e73457..9d3e77a5cf07 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -362,4 +362,21 @@ alternative_endif movk \reg, :abs_g0_nc:\val .endm +/* + * Errata workaround post TTBR0_EL1 update. + */ + .macro post_ttbr0_update_workaround +#ifdef CONFIG_CAVIUM_ERRATUM_27456 +alternative_if_not ARM64_WORKAROUND_CAVIUM_27456 + nop + nop + nop +alternative_else + ic iallu + dsb nsh + isb +alternative_endif +#endif + .endm + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 9f6deacf41d2..765713702625 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -136,17 +136,8 @@ ENTRY(cpu_do_switch_mm) bfi x0, x1, #48, #16 // set the ASID msr ttbr0_el1, x0 // set TTBR0 isb -alternative_if_not ARM64_WORKAROUND_CAVIUM_27456 + post_ttbr0_update_workaround ret - nop - nop - nop -alternative_else - ic iallu - dsb nsh - isb - ret -alternative_endif ENDPROC(cpu_do_switch_mm) .pushsection ".idmap.text", "ax" From 1911d36b27ba58ee18592df25b7ee636d4d4c41d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 16:53:00 +0100 Subject: [PATCH 0494/3220] FROMLIST: arm64: Introduce uaccess_{disable,enable} functionality based on TTBR0_EL1 This patch adds the uaccess macros/functions to disable access to user space by setting TTBR0_EL1 to a reserved zeroed page. Since the value written to TTBR0_EL1 must be a physical address, for simplicity this patch introduces a reserved_ttbr0 page at a constant offset from swapper_pg_dir. The uaccess_disable code uses the ttbr1_el1 value adjusted by the reserved_ttbr0 offset. Enabling access to user is done by restoring TTBR0_EL1 with the value from the struct thread_info ttbr0 variable. Interrupts must be disabled during the uaccess_ttbr0_enable code to ensure the atomicity of the thread_info.ttbr0 read and TTBR0_EL1 write. This patch also moves the get_thread_info asm macro from entry.S to assembler.h for reuse in the uaccess_ttbr0_* macros. 
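The interrupt-masking requirement above is easiest to see from a deliberately racy C sketch (hypothetical code, written only to show the hazard; the patch's uaccess_ttbr0_enable() masks IRQs around exactly these two steps):

/* Hypothetical, racy variant -- NOT what the patch implements. */
static inline void uaccess_ttbr0_enable_racy(void)
{
	u64 ttbr = current_thread_info()->ttbr0;

	/*
	 * A context switch here could trigger an ASID roll-over and
	 * rewrite thread_info.ttbr0; the write below would then
	 * install a stale pgd/ASID pair in TTBR0_EL1.
	 */
	write_sysreg(ttbr, ttbr0_el1);
	isb();
}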
Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: Idf09a870b8612dce23215bce90d88781f0c0c3aa (cherry picked from commit 940d37234182d2675ab8ab46084840212d735018) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 16 +++++ arch/arm64/include/asm/cpufeature.h | 6 ++ arch/arm64/include/asm/kernel-pgtable.h | 7 ++ arch/arm64/include/asm/thread_info.h | 3 + arch/arm64/include/asm/uaccess.h | 96 +++++++++++++++++++++++-- arch/arm64/kernel/asm-offsets.c | 3 + arch/arm64/kernel/cpufeature.c | 1 + arch/arm64/kernel/entry.S | 4 -- arch/arm64/kernel/head.S | 6 +- arch/arm64/kernel/vmlinux.lds.S | 5 ++ 10 files changed, 134 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 9d3e77a5cf07..aeb4554b3af3 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -53,6 +53,15 @@ msr daifclr, #2 .endm + .macro save_and_disable_irq, flags + mrs \flags, daif + msr daifset, #2 + .endm + + .macro restore_irq, flags + msr daif, \flags + .endm + /* * Enable and disable debug exceptions. */ @@ -362,6 +371,13 @@ alternative_endif movk \reg, :abs_g0_nc:\val .endm +/* + * Return the current thread_info. + */ + .macro get_thread_info, rd + mrs \rd, sp_el0 + .endm + /* * Errata workaround post TTBR0_EL1 update. */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 727e594ac5c2..f125c03ab2e1 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -188,6 +188,12 @@ static inline bool system_supports_mixed_endian_el0(void) return id_aa64mmfr0_mixed_endian_el0(read_system_reg(SYS_ID_AA64MMFR0_EL1)); } +static inline bool system_uses_ttbr0_pan(void) +{ + return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) && + !cpus_have_cap(ARM64_HAS_PAN); +} + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 7e51d1b57c0c..7803343e5881 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -19,6 +19,7 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H +#include #include /* @@ -54,6 +55,12 @@ #define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE) #define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE) +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +#define RESERVED_TTBR0_SIZE (PAGE_SIZE) +#else +#define RESERVED_TTBR0_SIZE (0) +#endif + /* Initial memory map size */ #if ARM64_SWAPPER_USES_SECTION_MAPS #define SWAPPER_BLOCK_SHIFT SECTION_SHIFT diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index abd64bd1f6d9..b3325a9cb90f 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -47,6 +47,9 @@ typedef unsigned long mm_segment_t; struct thread_info { unsigned long flags; /* low level flags */ mm_segment_t addr_limit; /* address limit */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + u64 ttbr0; /* saved TTBR0_EL1 */ +#endif struct task_struct *task; /* main task structure */ int preempt_count; /* 0 => preemptable, <0 => bug */ int cpu; /* cpu */ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index c8ef22a9a83b..c37c064d7cdd 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -128,16 +129,57 @@ static inline void set_fs(mm_segment_t fs) /* * User 
access enabling/disabling. */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +static inline void uaccess_ttbr0_disable(void) +{ + unsigned long ttbr; + + /* reserved_ttbr0 placed at the end of swapper_pg_dir */ + ttbr = read_sysreg(ttbr1_el1) + SWAPPER_DIR_SIZE; + write_sysreg(ttbr, ttbr0_el1); + isb(); +} + +static inline void uaccess_ttbr0_enable(void) +{ + unsigned long flags; + + /* + * Disable interrupts to avoid preemption between reading the 'ttbr0' + * variable and the MSR. A context switch could trigger an ASID + * roll-over and an update of 'ttbr0'. + */ + local_irq_save(flags); + write_sysreg(current_thread_info()->ttbr0, ttbr0_el1); + isb(); + local_irq_restore(flags); +} +#else +static inline void uaccess_ttbr0_disable(void) +{ +} + +static inline void uaccess_ttbr0_enable(void) +{ +} +#endif + #define __uaccess_disable(alt) \ do { \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ - CONFIG_ARM64_PAN)); \ + if (system_uses_ttbr0_pan()) \ + uaccess_ttbr0_disable(); \ + else \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ + CONFIG_ARM64_PAN)); \ } while (0) #define __uaccess_enable(alt) \ do { \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ - CONFIG_ARM64_PAN)); \ + if (system_uses_ttbr0_pan()) \ + uaccess_ttbr0_enable(); \ + else \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ + CONFIG_ARM64_PAN)); \ } while (0) static inline void uaccess_disable(void) @@ -367,12 +409,39 @@ extern __must_check long strnlen_user(const char __user *str, long n); #include #include +#include /* - * User access enabling/disabling macros. These are no-ops when UAO is - * present. + * User access enabling/disabling macros. + */ + .macro uaccess_ttbr0_disable, tmp1 + mrs \tmp1, ttbr1_el1 // swapper_pg_dir + add \tmp1, \tmp1, #SWAPPER_DIR_SIZE // reserved_ttbr0 at the end of swapper_pg_dir + msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 + isb + .endm + + .macro uaccess_ttbr0_enable, tmp1 + get_thread_info \tmp1 + ldr \tmp1, [\tmp1, #TI_TTBR0] // load saved TTBR0_EL1 + msr ttbr0_el1, \tmp1 // set the non-PAN TTBR0_EL1 + isb + .endm + +/* + * These macros are no-ops when UAO is present. 
*/ .macro uaccess_disable_not_uao, tmp1 +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +alternative_if_not ARM64_HAS_PAN + uaccess_ttbr0_disable \tmp1 +alternative_else + nop + nop + nop + nop +alternative_endif +#endif alternative_if_not ARM64_ALT_PAN_NOT_UAO nop alternative_else @@ -381,6 +450,21 @@ alternative_endif .endm .macro uaccess_enable_not_uao, tmp1, tmp2 +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +alternative_if_not ARM64_HAS_PAN + save_and_disable_irq \tmp2 // avoid preemption + uaccess_ttbr0_enable \tmp1 + restore_irq \tmp2 +alternative_else + nop + nop + nop + nop + nop + nop + nop +alternative_endif +#endif alternative_if_not ARM64_ALT_PAN_NOT_UAO nop alternative_else diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 087cf9a65359..d0ec987dba5b 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -36,6 +36,9 @@ int main(void) DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + DEFINE(TI_TTBR0, offsetof(struct thread_info, ttbr0)); +#endif DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); BLANK(); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 7566cad9fa1d..40ee3f2933e7 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -43,6 +43,7 @@ unsigned int compat_elf_hwcap2 __read_mostly; #endif DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); +EXPORT_SYMBOL(cpu_hwcaps); #define __ARM64_FTR_BITS(SIGNED, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \ { \ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 533e1c9fd5a6..8eb8eb085036 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -190,10 +190,6 @@ alternative_endif eret // return to kernel .endm - .macro get_thread_info, rd - mrs \rd, sp_el0 - .endm - .macro irq_stack_entry mov x19, sp // preserve the original sp diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 54fd1e2590a2..9bfa58fea8ce 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -321,14 +321,14 @@ __create_page_tables: * dirty cache lines being evicted. */ mov x0, x25 - add x1, x26, #SWAPPER_DIR_SIZE + add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE bl __inval_cache_range /* * Clear the idmap and swapper page tables. */ mov x0, x25 - add x6, x26, #SWAPPER_DIR_SIZE + add x6, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE 1: stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 @@ -406,7 +406,7 @@ __create_page_tables: * tables again to remove any speculatively loaded cache lines. */ mov x0, x25 - add x1, x26, #SWAPPER_DIR_SIZE + add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE dmb sy bl __inval_cache_range diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 14813a6ca13b..87fd0556a1bd 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -185,6 +185,11 @@ SECTIONS swapper_pg_dir = .; . += SWAPPER_DIR_SIZE; +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + reserved_ttbr0 = .; + . 
+= RESERVED_TTBR0_SIZE; +#endif + _end = .; STABS_DEBUG From 5775ca34829caf0664c8ccc02fd0e93cb6022e0f Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 2 Sep 2016 14:54:03 +0100 Subject: [PATCH 0495/3220] FROMLIST: arm64: Disable TTBR0_EL1 during normal kernel execution When the TTBR0 PAN feature is enabled, the kernel entry points need to disable access to TTBR0_EL1. The PAN status of the interrupted context is stored as part of the saved pstate, reusing the PSR_PAN_BIT (22). Restoring access to TTBR0_PAN is done on exception return if returning to user or returning to a context where PAN was disabled. Context switching via switch_mm() must defer the update of TTBR0_EL1 until a return to user or an explicit uaccess_enable() call. Special care needs to be taken for two cases where TTBR0_EL1 is set outside the normal kernel context switch operation: EFI run-time services (via efi_set_pgd) and CPU suspend (via cpu_(un)install_idmap). Code has been added to avoid deferred TTBR0_EL1 switching as in switch_mm() and restore the reserved TTBR0_EL1 when uninstalling the special TTBR0_EL1. This patch also removes a stale comment on the switch_mm() function. Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: Id1198cf1cde022fad10a94f95d698fae91d742aa (cherry picked from commit d26cfd64c973b31f73091c882e07350e14fdd6c9) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/efi.h | 26 ++++++++++- arch/arm64/include/asm/mmu_context.h | 51 +++++++++++++++------ arch/arm64/include/asm/ptrace.h | 2 + arch/arm64/kernel/entry.S | 67 ++++++++++++++++++++++++++++ arch/arm64/kernel/setup.c | 9 ++++ arch/arm64/mm/context.c | 7 ++- 6 files changed, 146 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 8e88a696c9cb..932f5a56d1a6 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -1,6 +1,7 @@ #ifndef _ASM_EFI_H #define _ASM_EFI_H +#include #include #include #include @@ -69,7 +70,30 @@ int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); static inline void efi_set_pgd(struct mm_struct *mm) { - switch_mm(NULL, mm, NULL); + __switch_mm(mm); + + if (system_uses_ttbr0_pan()) { + if (mm != current->active_mm) { + /* + * Update the current thread's saved ttbr0 since it is + * restored as part of a return from exception. Set + * the hardware TTBR0_EL1 using cpu_switch_mm() + * directly to enable potential errata workarounds. + */ + update_saved_ttbr0(current, mm); + cpu_switch_mm(mm->pgd, mm); + } else { + /* + * Defer the switch to the current thread's TTBR0_EL1 + * until uaccess_enable(). Restore the current + * thread's saved ttbr0 corresponding to its active_mm + * (if different from init_mm). 
+ */ + cpu_set_reserved_ttbr0(); + if (current->active_mm != &init_mm) + update_saved_ttbr0(current, current->active_mm); + } + } } void efi_virtmap_load(void); diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index a00f7cf35bbd..4a32fd5f101d 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -113,7 +114,7 @@ static inline void cpu_uninstall_idmap(void) local_flush_tlb_all(); cpu_set_default_tcr_t0sz(); - if (mm != &init_mm) + if (mm != &init_mm && !system_uses_ttbr0_pan()) cpu_switch_mm(mm->pgd, mm); } @@ -173,21 +174,27 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { } -/* - * This is the actual mm switch as far as the scheduler - * is concerned. No registers are touched. We avoid - * calling the CPU specific function when the mm hasn't - * actually changed. - */ -static inline void -switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +static inline void update_saved_ttbr0(struct task_struct *tsk, + struct mm_struct *mm) +{ + if (system_uses_ttbr0_pan()) { + BUG_ON(mm->pgd == swapper_pg_dir); + task_thread_info(tsk)->ttbr0 = + virt_to_phys(mm->pgd) | ASID(mm) << 48; + } +} +#else +static inline void update_saved_ttbr0(struct task_struct *tsk, + struct mm_struct *mm) +{ +} +#endif + +static inline void __switch_mm(struct mm_struct *next) { unsigned int cpu = smp_processor_id(); - if (prev == next) - return; - /* * init_mm.pgd does not contain any user mappings and it is always * active for kernel addresses in TTBR1. Just set the reserved TTBR0. @@ -200,7 +207,23 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, check_and_switch_context(next, cpu); } +static inline void +switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + if (prev != next) + __switch_mm(next); + + /* + * Update the saved TTBR0_EL1 of the scheduled-in task as the previous + * value may have not been initialised yet (activate_mm caller) or the + * ASID has changed since the last run (following the context switch + * of another thread of the same process). + */ + update_saved_ttbr0(tsk, next); +} + #define deactivate_mm(tsk,mm) do { } while (0) -#define activate_mm(prev,next) switch_mm(prev, next, NULL) +#define activate_mm(prev,next) switch_mm(prev, next, current) #endif diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 200d5c32b38f..dd4257e286b1 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -21,6 +21,8 @@ #include +#define _PSR_PAN_BIT 22 + /* Current Exception Level values, as contained in CurrentEL */ #define CurrentEL_EL1 (1 << 2) #define CurrentEL_EL2 (2 << 2) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8eb8eb085036..eb185c93dfc6 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -29,7 +29,9 @@ #include #include #include +#include #include +#include #include /* @@ -109,6 +111,34 @@ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Set the TTBR0 PAN bit in SPSR. When the exception is taken from + * EL0, there is no need to check the state of TTBR0_EL1 since + * accesses are always enabled. + * Note that the meaning of this bit differs from the ARMv8.1 PAN + * feature as all TTBR0_EL1 accesses are disabled, not just those to + * user mappings. 
+ */ +alternative_if_not ARM64_HAS_PAN + nop +alternative_else + b 1f // skip TTBR0 PAN +alternative_endif + + .if \el != 0 + mrs x21, ttbr0_el1 + tst x21, #0xffff << 48 // Check for the reserved ASID + orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR + b.eq 1f // TTBR0 access already disabled + and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR + .endif + + uaccess_ttbr0_disable x21 +1: +#endif + stp x22, x23, [sp, #S_PC] /* @@ -147,6 +177,42 @@ ldp x21, x22, [sp, #S_PC] // load ELR, SPSR .if \el == 0 ct_user_enter + .endif + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR + * PAN bit checking. + */ +alternative_if_not ARM64_HAS_PAN + nop +alternative_else + b 2f // skip TTBR0 PAN +alternative_endif + + .if \el != 0 + tbnz x22, #_PSR_PAN_BIT, 1f // Skip re-enabling TTBR0 access if previously disabled + .endif + + uaccess_ttbr0_enable x0 + + .if \el == 0 + /* + * Enable errata workarounds only if returning to user. The only + * workaround currently required for TTBR0_EL1 changes are for the + * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache + * corruption). + */ + post_ttbr0_update_workaround + .endif +1: + .if \el != 0 + and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit + .endif +2: +#endif + + .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 #ifdef CONFIG_ARM64_ERRATUM_845719 @@ -168,6 +234,7 @@ alternative_else alternative_endif #endif .endif + msr elr_el1, x21 // set up the return data msr spsr_el1, x22 ldp x0, x1, [sp, #16 * 0] diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 29b8c247d56f..6591bf23422b 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -347,6 +347,15 @@ void __init setup_arch(char **cmdline_p) smp_init_cpus(); smp_build_mpidr_hash(); +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Make sure init_thread_info.ttbr0 always generates translation + * faults in case uaccess_enable() is inadvertently called by the init + * thread. + */ + init_thread_info.ttbr0 = virt_to_phys(empty_zero_page); +#endif + #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) conswitchp = &vga_con; diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 7275628ba59f..25128089c386 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -182,7 +182,12 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: - cpu_switch_mm(mm->pgd, mm); + /* + * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when + * emulating PAN. + */ + if (!system_uses_ttbr0_pan()) + cpu_switch_mm(mm->pgd, mm); } static int asids_init(void) From 5dc2b7c7bb33138270ff9494be6cf334bd3d20e1 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 18:22:39 +0100 Subject: [PATCH 0496/3220] FROMLIST: arm64: Handle faults caused by inadvertent user access with PAN enabled When TTBR0_EL1 is set to the reserved page, an erroneous kernel access to user space would generate a translation fault. This patch adds the checks for the software-set PSR_PAN_BIT to emulate a permission fault and report it accordingly. This patch also updates the description of the synchronous external aborts on translation table walks. 
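Condensed from the fault.c hunk below, the new classification amounts to the following predicate (sketch; the helper name is hypothetical, the identifiers are from the diff):

/*
 * With SW PAN, an inadvertent kernel access through the reserved
 * TTBR0 page faults at translation rather than at permission level,
 * so a translation fault taken with the emulated PAN bit set is
 * reported as a permission fault.
 */
static bool sw_pan_permission_fault(unsigned int fsc_type,
				    struct pt_regs *regs)
{
	return fsc_type == ESR_ELx_FSC_FAULT &&	/* translation fault */
	       (regs->pstate & PSR_PAN_BIT);	/* emulated PAN was set */
}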
Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: I623113fc8bf6d5f023aeec7a0640b62a25ef8420 (cherry picked from commit be3db9340c8011d22f06715339b66bcbbd4893bd) Signed-off-by: Sami Tolvanen --- arch/arm64/mm/fault.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index e328430dbcec..f0c75fd6a3fa 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -246,13 +246,19 @@ out: return fault; } -static inline bool is_permission_fault(unsigned int esr) +static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs) { unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) || - (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM); + if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR) + return false; + + if (system_uses_ttbr0_pan()) + return fsc_type == ESR_ELx_FSC_FAULT && + (regs->pstate & PSR_PAN_BIT); + else + return fsc_type == ESR_ELx_FSC_PERM; } static bool is_el0_instruction_abort(unsigned int esr) @@ -293,7 +299,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - if (is_permission_fault(esr) && (addr < USER_DS)) { + if (addr < USER_DS && is_permission_fault(esr, regs)) { /* regs->orig_addr_limit may be 0 if we entered from EL0 */ if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); @@ -478,10 +484,10 @@ static const struct fault_info { { do_bad, SIGBUS, 0, "unknown 17" }, { do_bad, SIGBUS, 0, "unknown 18" }, { do_bad, SIGBUS, 0, "unknown 19" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, { do_bad, SIGBUS, 0, "synchronous parity error" }, { do_bad, SIGBUS, 0, "unknown 25" }, { do_bad, SIGBUS, 0, "unknown 26" }, From 4dbc88bd2b6a74fd33483ee2593dcf2bd858eabe Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 5 Jul 2016 12:25:15 +0100 Subject: [PATCH 0497/3220] FROMLIST: arm64: xen: Enable user access before a privcmd hvc call Privcmd calls are issued by the userspace. The kernel needs to enable access to TTBR0_EL1 as the hypervisor would issue stage 1 translations to user memory via AT instructions. Since AT instructions are not affected by the PAN bit (ARMv8.1), we only need the explicit uaccess_enable/disable if the TTBR0 PAN option is enabled. 
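As C-style pseudocode, the change below has this shape (do_xen_hvc() is a hypothetical stand-in for the 'hvc XEN_IMM' instruction; the real change stays in assembly):

/* Sketch of the guarded hypercall; illustrative only. */
static long privcmd_call_sketch(unsigned long op, unsigned long arg)
{
	long ret;

	uaccess_enable_not_uao();	/* let the hypervisor's stage 1 AT walks resolve user VAs */
	ret = do_xen_hvc(op, arg);	/* hypothetical wrapper for 'hvc XEN_IMM' */
	uaccess_disable_not_uao();	/* drop user access before returning */

	return ret;
}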
Reviewed-by: Julien Grall Acked-by: Stefano Stabellini Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: I927f14076ba94c83e609b19f46dd373287e11fc4 (cherry picked from commit 8cc1f33d2c9f206b6505bedba41aed2b33c203c0) Signed-off-by: Sami Tolvanen --- arch/arm64/xen/hypercall.S | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index 8bbe9401f4f0..6d6e4af1a4bf 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -49,6 +49,7 @@ #include #include +#include #include @@ -89,6 +90,24 @@ ENTRY(privcmd_call) mov x2, x3 mov x3, x4 mov x4, x5 +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Privcmd calls are issued by the userspace. The kernel needs to + * enable access to TTBR0_EL1 as the hypervisor would issue stage 1 + * translations to user memory via AT instructions. Since AT + * instructions are not affected by the PAN bit (ARMv8.1), we only + * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation + * is enabled (it implies that hardware UAO and PAN are disabled). + */ + uaccess_enable_not_uao x6, x7 +#endif hvc XEN_IMM + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Disable userspace access from kernel once the hyp call completed. + */ + uaccess_disable_not_uao x6 +#endif ret ENDPROC(privcmd_call); From 67cd3bda54dadba4f8892105adf9c2f3982bfa0a Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 18:25:31 +0100 Subject: [PATCH 0498/3220] FROMLIST: arm64: Enable CONFIG_ARM64_SW_TTBR0_PAN This patch adds the Kconfig option to enable support for TTBR0 PAN emulation. The option is default off because of a slight performance hit when enabled, caused by the additional TTBR0_EL1 switching during user access operations or exception entry/exit code. Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: Id00a8ad4169d6eb6176c468d953436eb4ae887ae (cherry picked from commit 6a2d7bad43474c48b68394d455b84a16b7d7dc3f) Signed-off-by: Sami Tolvanen --- arch/arm64/Kconfig | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index bf24d67a1ff1..488fed2d5e5e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -706,6 +706,14 @@ config SETEND_EMULATION If unsure, say Y endif +config ARM64_SW_TTBR0_PAN + bool "Emulate Privileged Access Never using TTBR0_EL1 switching" + help + Enabling this option prevents the kernel from accessing + user-space memory directly by pointing TTBR0_EL1 to a reserved + zeroed area and reserved ASID. The user access routines + restore the valid TTBR0_EL1 temporarily. + menu "ARMv8.1 architectural features" config ARM64_HW_AFDBM From d625b6286ac7af9bc19811c1a625867a71c631bf Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 12 Jan 2016 14:22:45 +0100 Subject: [PATCH 0499/3220] UPSTREAM: ia64: split off early_ioremap() declarations into asm/early_ioremap.h Unlike x86, arm64 and ARM, ia64 does not declare its implementations of early_ioremap/early_iounmap/early_memremap/early_memunmap in a header file called asm/early_ioremap.h. This complicates the use of these functions in generic code, since the header cannot be included directly, and we have to rely on transitive includes, which is fragile. So create an asm/early_ioremap.h for ia64, and move the existing definitions into it.
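The kind of generic early-boot code this split unblocks looks like the following sketch (probe_fw_table() is hypothetical):

#include <asm/early_ioremap.h>	/* now safe to include directly on ia64 too */

static int __init probe_fw_table(phys_addr_t pa, size_t len)
{
	void *tbl = early_memremap(pa, len);

	if (!tbl)
		return -ENOMEM;
	/* ... parse the firmware table at tbl ... */
	early_memunmap(tbl, len);
	return 0;
}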
Signed-off-by: Ard Biesheuvel Signed-off-by: Tony Luck Change-Id: I31eb55a9d57596faa40aec64bd26ce3ec21b0b4d (cherry picked from commit 809267708557ed5575831282f719ca644698084b) Signed-off-by: Sami Tolvanen --- arch/ia64/include/asm/early_ioremap.h | 10 ++++++++++ arch/ia64/include/asm/io.h | 5 +---- 2 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 arch/ia64/include/asm/early_ioremap.h diff --git a/arch/ia64/include/asm/early_ioremap.h b/arch/ia64/include/asm/early_ioremap.h new file mode 100644 index 000000000000..eec9e1d1b833 --- /dev/null +++ b/arch/ia64/include/asm/early_ioremap.h @@ -0,0 +1,10 @@ +#ifndef _ASM_IA64_EARLY_IOREMAP_H +#define _ASM_IA64_EARLY_IOREMAP_H + +extern void __iomem * early_ioremap (unsigned long phys_addr, unsigned long size); +#define early_memremap(phys_addr, size) early_ioremap(phys_addr, size) + +extern void early_iounmap (volatile void __iomem *addr, unsigned long size); +#define early_memunmap(addr, size) early_iounmap(addr, size) + +#endif diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h index 9041bbe2b7b4..a865d2a04f75 100644 --- a/arch/ia64/include/asm/io.h +++ b/arch/ia64/include/asm/io.h @@ -20,6 +20,7 @@ */ #include +#include /* We don't use IO slowdowns on the ia64, but.. */ #define __SLOW_DOWN_IO do { } while (0) @@ -427,10 +428,6 @@ __writeq (unsigned long val, volatile void __iomem *addr) extern void __iomem * ioremap(unsigned long offset, unsigned long size); extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size); extern void iounmap (volatile void __iomem *addr); -extern void __iomem * early_ioremap (unsigned long phys_addr, unsigned long size); -#define early_memremap(phys_addr, size) early_ioremap(phys_addr, size) -extern void early_iounmap (volatile void __iomem *addr, unsigned long size); -#define early_memunmap(addr, size) early_iounmap(addr, size) static inline void __iomem * ioremap_cache (unsigned long phys_addr, unsigned long size) { return ioremap(phys_addr, size); From a92cff09551d9ef0605af1cf6a7d9ee087d036c3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 12 Jan 2016 14:22:46 +0100 Subject: [PATCH 0500/3220] UPSTREAM: efi: include asm/early_ioremap.h not asm/efi.h to get early_memremap The code in efi.c uses early_memremap(), but relies on a transitive include rather than including asm/early_ioremap.h directly, since this header did not exist on ia64. Commit f7d924894265 ("arm64/efi: refactor EFI init and runtime code for reuse by 32-bit ARM") attempted to work around this by including asm/efi.h, which transitively includes asm/early_ioremap.h on most architectures. However, since asm/efi.h does not exist on ia64 either, this is not much of an improvement. Now that we have created an asm/early_ioremap.h for ia64, we can just include it directly. 
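The substantive change is a one-line include swap in drivers/firmware/efi/efi.c. Since the diff below loses the header names, they are restored here from the subject line:

-#include <asm/efi.h>
+#include <asm/early_ioremap.h>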
Reported-by: Guenter Roeck Signed-off-by: Ard Biesheuvel Signed-off-by: Tony Luck Change-Id: Ifa3e69e0b4078bac1e1d29bfe56861eb394e865b (cherry picked from commit 0f7f2f0c0fcbe5e2bcba707a628ebaedfe2be4b4) Signed-off-by: Sami Tolvanen --- drivers/firmware/efi/efi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index cffa89b3317b..2cd37dad67a6 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -25,7 +25,7 @@ #include #include -#include +#include struct efi __read_mostly efi = { .mps = EFI_INVALID_TABLE_ADDR, From 8fd31d5a2c06726a6060ea355bb5c90d44b132c4 Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Mon, 3 Oct 2016 16:17:34 -0700 Subject: [PATCH 0501/3220] Fix a build breakage in IO latency hist code. Fix a build breakage where MMC is enabled, but BLOCK is not. Change-Id: I0eb422d12264f0371f3368ae7c37342ef9efabaa Signed-off-by: Mohan Srinivasan --- drivers/mmc/core/core.c | 6 ++++++ drivers/mmc/core/host.c | 4 ++++ include/linux/mmc/core.h | 2 ++ 3 files changed, 12 insertions(+) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index b503249950df..f9d928cbc214 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -183,6 +183,7 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) pr_debug("%s: %d bytes transferred: %d\n", mmc_hostname(host), mrq->data->bytes_xfered, mrq->data->error); +#ifdef CONFIG_BLOCK if (mrq->lat_hist_enabled) { ktime_t completion; u_int64_t delta_us; @@ -194,6 +195,7 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) (mrq->data->flags & MMC_DATA_READ), delta_us); } +#endif trace_mmc_blk_rw_end(cmd->opcode, cmd->arg, mrq->data); } @@ -638,11 +640,13 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host, } if (!err && areq) { +#ifdef CONFIG_BLOCK if (host->latency_hist_enabled) { areq->mrq->io_start = ktime_get(); areq->mrq->lat_hist_enabled = 1; } else areq->mrq->lat_hist_enabled = 0; +#endif trace_mmc_blk_rw_start(areq->mrq->cmd->opcode, areq->mrq->cmd->arg, areq->mrq->data); @@ -2923,6 +2927,7 @@ static void __exit mmc_exit(void) destroy_workqueue(workqueue); } +#ifdef CONFIG_BLOCK static ssize_t latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -2970,6 +2975,7 @@ mmc_latency_hist_sysfs_exit(struct mmc_host *host) { device_remove_file(&host->class_dev, &dev_attr_latency_hist); } +#endif subsys_initcall(mmc_init); module_exit(mmc_exit); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 17068839c74b..443fdfc22d8a 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -392,7 +392,9 @@ int mmc_add_host(struct mmc_host *host) mmc_add_host_debugfs(host); #endif +#ifdef CONFIG_BLOCK mmc_latency_hist_sysfs_init(host); +#endif mmc_start_host(host); if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY)) @@ -422,7 +424,9 @@ void mmc_remove_host(struct mmc_host *host) mmc_remove_host_debugfs(host); #endif +#ifdef CONFIG_BLOCK mmc_latency_hist_sysfs_exit(host); +#endif device_del(&host->class_dev); diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 3349f0676acb..0860efd6e1be 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -137,7 +137,9 @@ struct mmc_request { void (*done)(struct mmc_request *);/* completion function */ struct mmc_host *host; ktime_t io_start; +#ifdef CONFIG_BLOCK int lat_hist_enabled; +#endif }; struct mmc_card; From 1a6c07cf6f2671e1a8a561745bdcc295a7cfeb17 Mon Sep 17 00:00:00 
2001 From: Jeremy Linton Date: Fri, 19 Feb 2016 11:50:32 -0600 Subject: [PATCH 0502/3220] BACKPORT: arm64: mm: Mark .rodata as RO Currently the .rodata section is actually still executable when DEBUG_RODATA is enabled. This changes that so the .rodata is actually read only, no execute. It also adds the .rodata section to the mem_init banner. Signed-off-by: Jeremy Linton Reviewed-by: Kees Cook Acked-by: Mark Rutland [catalin.marinas@arm.com: added vm_struct vmlinux_rodata in map_kernel()] Signed-off-by: Catalin Marinas Bug: 31374226 Change-Id: I6fd95beaf814fc91805da12c5329a57ce9008fd7 (cherry picked from commit 2f39b5f91eb4bccd786d194e70db1dccad784755) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/vmlinux.lds.S | 6 +++--- arch/arm64/mm/init.c | 2 ++ arch/arm64/mm/mmu.c | 25 ++++++++++++++++--------- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 87fd0556a1bd..f1d6c49dcc5f 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -114,14 +114,14 @@ SECTIONS *(.got) /* Global offset table */ } + ALIGN_DEBUG_RO_MIN(PAGE_SIZE) _etext = .; /* End of text section */ - RO_DATA(PAGE_SIZE) - EXCEPTION_TABLE(8) + RO_DATA(PAGE_SIZE) /* everything from this point to */ + EXCEPTION_TABLE(8) /* __init_begin will be marked RO NX */ NOTES ALIGN_DEBUG_RO_MIN(PAGE_SIZE) - _etext = .; /* End of text and rodata section */ __init_begin = .; INIT_TEXT_SECTION(8) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 0db12e9e8765..678878077996 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -376,6 +376,7 @@ void __init mem_init(void) " vmalloc : 0x%16lx - 0x%16lx (%6ld GB)\n" " .init : 0x%p" " - 0x%p" " (%6ld KB)\n" " .text : 0x%p" " - 0x%p" " (%6ld KB)\n" + " .rodata : 0x%p" " - 0x%p" " (%6ld KB)\n" " .data : 0x%p" " - 0x%p" " (%6ld KB)\n" #ifdef CONFIG_SPARSEMEM_VMEMMAP " vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n" @@ -391,6 +392,7 @@ void __init mem_init(void) MLG(VMALLOC_START, VMALLOC_END), MLK_ROUNDUP(__init_begin, __init_end), MLK_ROUNDUP(_text, _etext), + MLK_ROUNDUP(__start_rodata, __init_begin), MLK_ROUNDUP(_sdata, _edata), #ifdef CONFIG_SPARSEMEM_VMEMMAP MLG(VMEMMAP_START, diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9dafcfde7c26..4b38f6b8806e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -386,14 +386,14 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) { unsigned long kernel_start = __pa(_stext); - unsigned long kernel_end = __pa(_etext); + unsigned long kernel_end = __pa(__init_begin); /* * Take care not to create a writable alias for the * read-only text and rodata sections of the kernel image. */ - /* No overlap with the kernel text */ + /* No overlap with the kernel text/rodata */ if (end < kernel_start || start >= kernel_end) { __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, PAGE_KERNEL, @@ -402,7 +402,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end } /* - * This block overlaps the kernel text mapping. + * This block overlaps the kernel text/rodata mapping. * Map the portion(s) which don't overlap. 
*/ if (start < kernel_start) @@ -417,7 +417,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end early_pgtable_alloc); /* - * Map the linear alias of the [_stext, _etext) interval as + * Map the linear alias of the [_stext, __init_begin) interval as * read-only/non-executable. This makes the contents of the * region accessible to subsystems such as hibernate, but * protects it from inadvertent modification or execution. @@ -447,12 +447,18 @@ static void __init map_mem(pgd_t *pgd) void mark_rodata_ro(void) { - if (!IS_ENABLED(CONFIG_DEBUG_RODATA)) - return; + unsigned long section_size; + section_size = (unsigned long)_etext - (unsigned long)_stext; create_mapping_late(__pa(_stext), (unsigned long)_stext, - (unsigned long)__init_begin - (unsigned long)_stext, - PAGE_KERNEL_ROX); + section_size, PAGE_KERNEL_ROX); + /* + * mark .rodata as read only. Use __init_begin rather than __end_rodata + * to cover NOTES and EXCEPTION_TABLE. + */ + section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; + create_mapping_late(__pa(__start_rodata), (unsigned long)__start_rodata, + section_size, PAGE_KERNEL_RO); } void fixup_init(void) @@ -491,9 +497,10 @@ static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, */ static void __init map_kernel(pgd_t *pgd) { - static struct vm_struct vmlinux_text, vmlinux_init, vmlinux_data; + static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data; map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC, &vmlinux_text); + map_kernel_chunk(pgd, __start_rodata, __init_begin, PAGE_KERNEL, &vmlinux_rodata); map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, &vmlinux_init); map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data); From 50bad62b51e802b5bd5e731915234905fbe4d4f6 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Wed, 21 Sep 2016 15:25:04 -0700 Subject: [PATCH 0503/3220] BACKPORT: arm64: Correctly bounds check virt_addr_valid virt_addr_valid is supposed to return true if and only if virt_to_page returns a valid page structure. The current macro does math on whatever address is given and passes that to pfn_valid to verify. vmalloc and module addresses can happen to generate a pfn that 'happens' to be valid. Fix this by only performing the pfn_valid check on addresses that have the potential to be valid. 
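A minimal sketch of the failure mode being fixed (illustrative; check_virt_addr_valid() is hypothetical):

static void __init check_virt_addr_valid(void)
{
	void *vm = vmalloc(PAGE_SIZE);

	if (!vm)
		return;
	/*
	 * __pa(vm) applies linear-map arithmetic to a vmalloc address,
	 * so the old macro could land on a pfn that happens to be
	 * valid. The added range check now rejects vm before
	 * pfn_valid() is ever consulted.
	 */
	WARN_ON(virt_addr_valid(vm));
	vfree(vm);
}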
Acked-by: Mark Rutland Signed-off-by: Laura Abbott Signed-off-by: Will Deacon Bug: 31374226 Change-Id: I75cbeb3edb059f19af992b7f5d0baa283f95991b (cherry picked from commit ca219452c6b8a6cd1369b6a78b1cf069d0386865) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/memory.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 12f8a00fb3f1..ba1b3409d7ed 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -193,7 +193,11 @@ static inline void *phys_to_virt(phys_addr_t x) #define ARCH_PFN_OFFSET ((unsigned long)PHYS_PFN_OFFSET) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define _virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) + +#define _virt_addr_is_linear(kaddr) (((u64)(kaddr)) >= PAGE_OFFSET) +#define virt_addr_valid(kaddr) (_virt_addr_is_linear(kaddr) && \ + _virt_addr_valid(kaddr)) #endif From 762a327de1b8dc48f813c7d397f7b627cff215c5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 8 Sep 2016 09:57:07 +0200 Subject: [PATCH 0504/3220] UPSTREAM: fs/proc/kcore.c: Make bounce buffer global for read Next patch adds bounce buffer for ktext area, so it's convenient to have single bounce buffer for both vmalloc/module and ktext cases. Suggested-by: Linus Torvalds Signed-off-by: Jiri Olsa Acked-by: Kees Cook Signed-off-by: Linus Torvalds Bug: 31374226 Change-Id: I8f517354e6d12aed75ed4ae6c0a6adef0a1e61da (cherry picked from commit f5beeb1851ea6f8cfcf2657f26cb24c0582b4945) Signed-off-by: Sami Tolvanen --- fs/proc/kcore.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 92e6726f6e37..2dc84040b0cb 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -430,6 +430,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) static ssize_t read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { + char *buf = file->private_data; ssize_t acc = 0; size_t size, tsz; size_t elf_buflen; @@ -500,18 +501,10 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (clear_user(buffer, tsz)) return -EFAULT; } else if (is_vmalloc_or_module_addr((void *)start)) { - char * elf_buf; - - elf_buf = kzalloc(tsz, GFP_KERNEL); - if (!elf_buf) - return -ENOMEM; - vread(elf_buf, (char *)start, tsz); + vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ - if (copy_to_user(buffer, elf_buf, tsz)) { - kfree(elf_buf); + if (copy_to_user(buffer, buf, tsz)) return -EFAULT; - } - kfree(elf_buf); } else { if (kern_addr_valid(start)) { unsigned long n; @@ -549,6 +542,11 @@ static int open_kcore(struct inode *inode, struct file *filp) { if (!capable(CAP_SYS_RAWIO)) return -EPERM; + + filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!filp->private_data) + return -ENOMEM; + if (kcore_need_update) kcore_update_ram(); if (i_size_read(inode) != proc_root_kcore->size) { @@ -559,10 +557,16 @@ static int open_kcore(struct inode *inode, struct file *filp) return 0; } +static int release_kcore(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} static const struct file_operations proc_kcore_operations = { .read = read_kcore, .open = open_kcore, + .release = release_kcore, .llseek = default_llseek, }; From c7642969a507168ab9d4a5d4715030af5d2a11a7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa 
Date: Thu, 8 Sep 2016 09:57:08 +0200 Subject: [PATCH 0505/3220] UPSTREAM: fs/proc/kcore.c: Add bounce buffer for ktext data We hit the hardened usercopy feature check for kernel text access when reading the kcore file: usercopy: kernel memory exposure attempt detected from ffffffff8179a01f (<kernel text>) (4065 bytes) kernel BUG at mm/usercopy.c:75! Bypass this check for kcore by adding a bounce buffer for ktext data. Reported-by: Steve Best Fixes: f5509cc18daa ("mm: Hardened usercopy") Suggested-by: Kees Cook Signed-off-by: Jiri Olsa Acked-by: Kees Cook Signed-off-by: Linus Torvalds Bug: 31374226 Change-Id: Ic93e6041b67d804a994518bf4950811f828b406e (cherry picked from commit df04abfd181acc276ba6762c8206891ae10ae00d) Signed-off-by: Sami Tolvanen --- fs/proc/kcore.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 2dc84040b0cb..21f198aa0961 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -509,7 +509,12 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (kern_addr_valid(start)) { unsigned long n; - n = copy_to_user(buffer, (char *)start, tsz); + /* + * Using bounce buffer to bypass the + * hardened user copy kernel text checks. + */ + memcpy(buf, (char *) start, tsz); + n = copy_to_user(buffer, buf, tsz); /* * We cannot distinguish between fault on source * and fault on destination. When this happens From d02eecfab2e3b76185f396a1e3dad004b6962696 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 5 Oct 2016 09:52:07 -0700 Subject: [PATCH 0506/3220] ANDROID: android-base: CONFIG_HARDENED_USERCOPY=y Bug: 31374226 Change-Id: I977e76395017d8d718ea634421b3635023934ef9 Signed-off-by: Sami Tolvanen --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 1ebef5c69fb6..8531a7a79e33 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -24,6 +24,7 @@ CONFIG_DM_VERITY=y CONFIG_DM_VERITY_FEC=y CONFIG_EMBEDDED=y CONFIG_FB=y +CONFIG_HARDENED_USERCOPY=y CONFIG_HIGH_RES_TIMERS=y CONFIG_INET6_AH=y CONFIG_INET6_ESP=y From 620ed669ae0bca79885abb5a3c242ac5bf000f3b Mon Sep 17 00:00:00 2001 From: EunTaik Lee Date: Wed, 24 Feb 2016 04:38:06 +0000 Subject: [PATCH 0507/3220] UPSTREAM: staging/android/ion : fix a race condition in the ion driver There is a use-after-free problem in the ion driver, caused by a race condition in the ion_ioctl() function. A handle has a ref count of 1 and two tasks on different cpus call ION_IOC_FREE simultaneously:

cpu 0                                   cpu 1
-------------------------------------------------------
ion_handle_get_by_id()
(ref == 2)
                                        ion_handle_get_by_id()
                                        (ref == 3)
ion_free()
(ref == 2)
                                        ion_handle_put()
                                        (ref == 1)
                                        ion_free()
                                        (ref == 0 so ion_handle_destroy()
                                        is called and the handle is freed.)
ion_handle_put() is called and it
decreases the slub's next free pointer

The problem is detected as an unaligned access in the spinlock functions, since they use load-exclusive instructions. In some cases it corrupts the slub's free pointer, which causes a misaligned access to the next free pointer (kmalloc returns a pointer like ffffc0745b4580aa), and lots of other hard-to-debug problems. This symptom occurs because the first member of the ion_handle structure is the reference count, and the ion driver decrements that count after the handle has been freed. To fix this problem, the client->lock mutex is extended to protect all the code paths that use the handle.
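Condensed from the ion_ioctl() hunk below, the fixed ION_IOC_FREE path keeps the lookup, the free and the final put inside one critical section:

/* Sketch of the fixed ION_IOC_FREE path; see the full diff below. */
mutex_lock(&client->lock);
handle = ion_handle_get_by_id_nolock(client, data.handle.handle);
if (IS_ERR(handle)) {
	mutex_unlock(&client->lock);
	return PTR_ERR(handle);
}
ion_free_nolock(client, handle);
ion_handle_put_nolock(handle);
mutex_unlock(&client->lock);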
Signed-off-by: Eun Taik Lee Reviewed-by: Laura Abbott Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 9590232bb4f4cc824f3425a6e1349afbe6d6d2b7) bug: 31568617 Change-Id: I4ea2be0cad3305c4e196126a02e2ab7108ef0976 --- drivers/staging/android/ion/ion.c | 59 +++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index e237e9f3312d..1958d58fdca3 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -385,13 +385,22 @@ static void ion_handle_get(struct ion_handle *handle) kref_get(&handle->ref); } -static int ion_handle_put(struct ion_handle *handle) +static int ion_handle_put_nolock(struct ion_handle *handle) +{ + int ret; + + ret = kref_put(&handle->ref, ion_handle_destroy); + + return ret; +} + +int ion_handle_put(struct ion_handle *handle) { struct ion_client *client = handle->client; int ret; mutex_lock(&client->lock); - ret = kref_put(&handle->ref, ion_handle_destroy); + ret = ion_handle_put_nolock(handle); mutex_unlock(&client->lock); return ret; @@ -415,18 +424,28 @@ static struct ion_handle *ion_handle_lookup(struct ion_client *client, return ERR_PTR(-EINVAL); } -static struct ion_handle *ion_handle_get_by_id(struct ion_client *client, +static struct ion_handle *ion_handle_get_by_id_nolock(struct ion_client *client, + int id) +{ + struct ion_handle *handle; + + handle = idr_find(&client->idr, id); + if (handle) + ion_handle_get(handle); + + return handle ? handle : ERR_PTR(-EINVAL); +} + +struct ion_handle *ion_handle_get_by_id(struct ion_client *client, int id) { struct ion_handle *handle; mutex_lock(&client->lock); - handle = idr_find(&client->idr, id); - if (handle) - ion_handle_get(handle); + handle = ion_handle_get_by_id_nolock(client, id); mutex_unlock(&client->lock); - return handle ? 
handle : ERR_PTR(-EINVAL); + return handle; } static bool ion_handle_validate(struct ion_client *client, @@ -530,22 +549,28 @@ struct ion_handle *ion_alloc(struct ion_client *client, size_t len, } EXPORT_SYMBOL(ion_alloc); -void ion_free(struct ion_client *client, struct ion_handle *handle) +static void ion_free_nolock(struct ion_client *client, struct ion_handle *handle) { bool valid_handle; BUG_ON(client != handle->client); - mutex_lock(&client->lock); valid_handle = ion_handle_validate(client, handle); if (!valid_handle) { WARN(1, "%s: invalid handle passed to free.\n", __func__); - mutex_unlock(&client->lock); return; } + ion_handle_put_nolock(handle); +} + +void ion_free(struct ion_client *client, struct ion_handle *handle) +{ + BUG_ON(client != handle->client); + + mutex_lock(&client->lock); + ion_free_nolock(client, handle); mutex_unlock(&client->lock); - ion_handle_put(handle); } EXPORT_SYMBOL(ion_free); @@ -1281,11 +1306,15 @@ static long ion_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct ion_handle *handle; - handle = ion_handle_get_by_id(client, data.handle.handle); - if (IS_ERR(handle)) + mutex_lock(&client->lock); + handle = ion_handle_get_by_id_nolock(client, data.handle.handle); + if (IS_ERR(handle)) { + mutex_unlock(&client->lock); return PTR_ERR(handle); - ion_free(client, handle); - ion_handle_put(handle); + } + ion_free_nolock(client, handle); + ion_handle_put_nolock(handle); + mutex_unlock(&client->lock); break; } case ION_IOC_SHARE: From b81f4c5f3167142b5b0b91b474658a850e8c5450 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arve=20Hj=C3=B8nnev=C3=A5g?= Date: Tue, 2 Aug 2016 15:40:39 -0700 Subject: [PATCH 0508/3220] ANDROID: binder: Add strong ref checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prevent using a binder_ref with only weak references where a strong reference is required. 
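Condensed from the diff below, call sites now state whether they need a strong reference; for example, a transaction target must be strong:

ref = binder_get_ref(proc, tr->target.handle, true /* strong required */);
if (ref == NULL) {
	binder_user_error("%d:%d got transaction to invalid handle\n",
			  proc->pid, thread->pid);
	/* the transaction fails: weak-only descriptors no longer pass */
}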
BUG: 30445380 Change-Id: I66c15b066808f28bd27bfe50fd0e03ff45a09fca Signed-off-by: Arve Hjønnevåg --- drivers/android/binder.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index f0ce9959d14d..ce84ff14ce9b 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1003,7 +1003,7 @@ static int binder_dec_node(struct binder_node *node, int strong, int internal) static struct binder_ref *binder_get_ref(struct binder_proc *proc, - uint32_t desc) + uint32_t desc, bool need_strong_ref) { struct rb_node *n = proc->refs_by_desc.rb_node; struct binder_ref *ref; @@ -1011,12 +1011,16 @@ static struct binder_ref *binder_get_ref(struct binder_proc *proc, while (n) { ref = rb_entry(n, struct binder_ref, rb_node_desc); - if (desc < ref->desc) + if (desc < ref->desc) { n = n->rb_left; - else if (desc > ref->desc) + } else if (desc > ref->desc) { n = n->rb_right; - else + } else if (need_strong_ref && !ref->strong) { + binder_user_error("tried to use weak ref as strong ref\n"); + return NULL; + } else { return ref; + } } return NULL; } @@ -1286,7 +1290,8 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { - struct binder_ref *ref = binder_get_ref(proc, fp->handle); + struct binder_ref *ref = binder_get_ref(proc, fp->handle, + fp->type == BINDER_TYPE_HANDLE); if (ref == NULL) { pr_err("transaction release %d bad handle %d\n", @@ -1381,7 +1386,7 @@ static void binder_transaction(struct binder_proc *proc, if (tr->target.handle) { struct binder_ref *ref; - ref = binder_get_ref(proc, tr->target.handle); + ref = binder_get_ref(proc, tr->target.handle, true); if (ref == NULL) { binder_user_error("%d:%d got transaction to invalid handle\n", proc->pid, thread->pid); @@ -1590,7 +1595,8 @@ static void binder_transaction(struct binder_proc *proc, } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { - struct binder_ref *ref = binder_get_ref(proc, fp->handle); + struct binder_ref *ref = binder_get_ref(proc, fp->handle, + fp->type == BINDER_TYPE_HANDLE); if (ref == NULL) { binder_user_error("%d:%d got transaction with invalid handle, %d\n", @@ -1801,7 +1807,9 @@ static int binder_thread_write(struct binder_proc *proc, ref->desc); } } else - ref = binder_get_ref(proc, target); + ref = binder_get_ref(proc, target, + cmd == BC_ACQUIRE || + cmd == BC_RELEASE); if (ref == NULL) { binder_user_error("%d:%d refcount change on invalid ref %d\n", proc->pid, thread->pid, target); @@ -1997,7 +2005,7 @@ static int binder_thread_write(struct binder_proc *proc, if (get_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); - ref = binder_get_ref(proc, target); + ref = binder_get_ref(proc, target, false); if (ref == NULL) { binder_user_error("%d:%d %s invalid ref %d\n", proc->pid, thread->pid, From 78c26bebd182874a7383a5d07446673b61d1ea0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arve=20Hj=C3=B8nnev=C3=A5g?= Date: Fri, 12 Aug 2016 16:04:28 -0700 Subject: [PATCH 0509/3220] ANDROID: binder: Clear binder and cookie when setting handle in flat binder struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prevents leaking pointers between processes BUG: 30768347 Change-Id: Id898076926f658a1b8b27a3ccb848756b36de4ca Signed-off-by: Arve Hjønnevåg --- drivers/android/binder.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/android/binder.c 
b/drivers/android/binder.c index ce84ff14ce9b..b8ba5b7516b9 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1583,7 +1583,9 @@ static void binder_transaction(struct binder_proc *proc, fp->type = BINDER_TYPE_HANDLE; else fp->type = BINDER_TYPE_WEAK_HANDLE; + fp->binder = 0; fp->handle = ref->desc; + fp->cookie = 0; binder_inc_ref(ref, fp->type == BINDER_TYPE_HANDLE, &thread->todo); @@ -1631,7 +1633,9 @@ static void binder_transaction(struct binder_proc *proc, return_error = BR_FAILED_REPLY; goto err_binder_get_ref_for_node_failed; } + fp->binder = 0; fp->handle = new_ref->desc; + fp->cookie = 0; binder_inc_ref(new_ref, fp->type == BINDER_TYPE_HANDLE, NULL); trace_binder_transaction_ref_to_ref(t, ref, new_ref); @@ -1685,6 +1689,7 @@ static void binder_transaction(struct binder_proc *proc, binder_debug(BINDER_DEBUG_TRANSACTION, " fd %d -> %d\n", fp->handle, target_fd); /* TODO: fput? */ + fp->binder = 0; fp->handle = target_fd; } break; From 077d045666866063087a1e71e5e82ede3e139d96 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 May 2016 11:48:25 -0400 Subject: [PATCH 0510/3220] UPSTREAM: percpu: fix synchronization between chunk->map_extend_work and chunk destruction (cherry picked from commit 4f996e234dad488e5d9ba0858bc1bae12eff82c3) Atomic allocations can trigger async map extensions which is serviced by chunk->map_extend_work. pcpu_balance_work which is responsible for destroying idle chunks wasn't synchronizing properly against chunk->map_extend_work and may end up freeing the chunk while the work item is still in flight. This patch fixes the bug by rolling async map extension operations into pcpu_balance_work. Signed-off-by: Tejun Heo Reported-and-tested-by: Alexei Starovoitov Reported-by: Vlastimil Babka Reported-by: Sasha Levin Cc: stable@vger.kernel.org # v3.18+ Fixes: 9c824b6a172c ("percpu: make sure chunk->map array has available space") Change-Id: I8f4aaf7fe0bc0e9f353d41e0a7840c40d6a32117 Bug: 31596597 --- mm/percpu.c | 57 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 8a943b97a053..58b014900f0f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -110,7 +110,7 @@ struct pcpu_chunk { int map_used; /* # of map entries used before the sentry */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ - struct work_struct map_extend_work;/* async ->map[] extension */ + struct list_head map_extend_list;/* on pcpu_map_extend_chunks */ void *data; /* chunk data */ int first_free; /* no free below this */ @@ -164,6 +164,9 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ +/* chunks which need their map areas extended, protected by pcpu_lock */ +static LIST_HEAD(pcpu_map_extend_chunks); + /* * The number of empty populated pages, protected by pcpu_lock. The * reserved chunk doesn't contribute to the count. 
@@ -397,13 +400,19 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) { int margin, new_alloc; + lockdep_assert_held(&pcpu_lock); + if (is_atomic) { margin = 3; if (chunk->map_alloc < - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && - pcpu_async_enabled) - schedule_work(&chunk->map_extend_work); + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) { + if (list_empty(&chunk->map_extend_list)) { + list_add_tail(&chunk->map_extend_list, + &pcpu_map_extend_chunks); + pcpu_schedule_balance_work(); + } + } } else { margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; } @@ -469,20 +478,6 @@ out_unlock: return 0; } -static void pcpu_map_extend_workfn(struct work_struct *work) -{ - struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk, - map_extend_work); - int new_alloc; - - spin_lock_irq(&pcpu_lock); - new_alloc = pcpu_need_to_extend(chunk, false); - spin_unlock_irq(&pcpu_lock); - - if (new_alloc) - pcpu_extend_area_map(chunk, new_alloc); -} - /** * pcpu_fit_in_area - try to fit the requested allocation in a candidate area * @chunk: chunk the candidate area belongs to @@ -742,7 +737,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map_used = 1; INIT_LIST_HEAD(&chunk->list); - INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn); + INIT_LIST_HEAD(&chunk->map_extend_list); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; @@ -1131,6 +1126,7 @@ static void pcpu_balance_workfn(struct work_struct *work) if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; + list_del_init(&chunk->map_extend_list); list_move(&chunk->list, &to_free); } @@ -1148,6 +1144,25 @@ static void pcpu_balance_workfn(struct work_struct *work) pcpu_destroy_chunk(chunk); } + /* service chunks which requested async area map extension */ + do { + int new_alloc = 0; + + spin_lock_irq(&pcpu_lock); + + chunk = list_first_entry_or_null(&pcpu_map_extend_chunks, + struct pcpu_chunk, map_extend_list); + if (chunk) { + list_del_init(&chunk->map_extend_list); + new_alloc = pcpu_need_to_extend(chunk, false); + } + + spin_unlock_irq(&pcpu_lock); + + if (new_alloc) + pcpu_extend_area_map(chunk, new_alloc); + } while (chunk); + /* * Ensure there are certain number of free populated pages for * atomic allocs. 
Fill up from the most packed so that atomic @@ -1646,7 +1661,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, */ schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&schunk->list); - INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn); + INIT_LIST_HEAD(&schunk->map_extend_list); schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); @@ -1675,7 +1690,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (dyn_size) { dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&dchunk->list); - INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn); + INIT_LIST_HEAD(&dchunk->map_extend_list); dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); From 3b41e21ffaa38ff8ed8fa8b71f4410f45f42011f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 May 2016 11:48:25 -0400 Subject: [PATCH 0511/3220] UPSTREAM: percpu: fix synchronization between synchronous map extension and chunk destruction (cherry picked from commit 6710e594f71ccaad8101bc64321152af7cd9ea28) For non-atomic allocations, pcpu_alloc() can try to extend the area map synchronously after dropping pcpu_lock; however, the extension wasn't synchronized against chunk destruction and the chunk might get freed while extension is in progress. This patch fixes the bug by putting most of non-atomic allocations under pcpu_alloc_mutex to synchronize against pcpu_balance_work which is responsible for async chunk management including destruction. Signed-off-by: Tejun Heo Reported-and-tested-by: Alexei Starovoitov Reported-by: Vlastimil Babka Reported-by: Sasha Levin Cc: stable@vger.kernel.org # v3.18+ Fixes: 1a4d76076cda ("percpu: implement asynchronous chunk population") Change-Id: I8800962e658e78eac866fff4a4e00294c58a3dec Bug: 31596597 --- mm/percpu.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 58b014900f0f..1f376bce413c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -160,7 +160,7 @@ static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ -static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ @@ -446,6 +446,8 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); unsigned long flags; + lockdep_assert_held(&pcpu_alloc_mutex); + new = pcpu_mem_zalloc(new_size); if (!new) return -ENOMEM; @@ -892,6 +894,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, return NULL; } + if (!is_atomic) + mutex_lock(&pcpu_alloc_mutex); + spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ @@ -964,12 +969,9 @@ restart: if (is_atomic) goto fail; - mutex_lock(&pcpu_alloc_mutex); - if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { chunk = pcpu_create_chunk(); if (!chunk) { - mutex_unlock(&pcpu_alloc_mutex); err = "failed to allocate new chunk"; goto fail; } @@ -980,7 +982,6 @@ restart: spin_lock_irqsave(&pcpu_lock, flags); } - mutex_unlock(&pcpu_alloc_mutex); goto restart; area_found: @@ -990,8 +991,6 @@ area_found: if (!is_atomic) { int page_start, page_end, rs, re; - mutex_lock(&pcpu_alloc_mutex); - page_start 
= PFN_DOWN(off); page_end = PFN_UP(off + size); @@ -1002,7 +1001,6 @@ area_found: spin_lock_irqsave(&pcpu_lock, flags); if (ret) { - mutex_unlock(&pcpu_alloc_mutex); pcpu_free_area(chunk, off, &occ_pages); err = "failed to populate"; goto fail_unlock; @@ -1042,6 +1040,8 @@ fail: /* see the flag handling in pcpu_blance_workfn() */ pcpu_atomic_alloc_failed = true; pcpu_schedule_balance_work(); + } else { + mutex_unlock(&pcpu_alloc_mutex); } return NULL; } From d9e3c1d5f7bd8df275f8bce5ed78393c7a5fb22a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 8 Mar 2016 21:09:29 +0700 Subject: [PATCH 0512/3220] UPSTREAM: arm64: account for sparsemem section alignment when choosing vmemmap offset Commit dfd55ad85e4a ("arm64: vmemmap: use virtual projection of linear region") fixed an issue where the struct page array would overflow into the adjacent virtual memory region if system RAM was placed so high up in physical memory that its addresses were not representable in the build time configured virtual address size. However, the fix failed to take into account that the vmemmap region needs to be relatively aligned with respect to the sparsemem section size, so that a sequence of page structs corresponding with a sparsemem section in the linear region appears naturally aligned in the vmemmap region. So round up vmemmap to sparsemem section size. Since this essentially moves the projection of the linear region up in memory, also revert the reduction of the size of the vmemmap region. Cc: Fixes: dfd55ad85e4a ("arm64: vmemmap: use virtual projection of linear region") Tested-by: Mark Langsdorf Tested-by: David Daney Tested-by: Robert Richter Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit 36e5cd6b897e17d03008f81e075625d8e43e52d0) Signed-off-by: Jeff Vander Stoep Change-Id: I77bad8c6a7c1a7c3dda92a37ceef5ddfb196ec70 --- arch/arm64/include/asm/pgtable.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0c82a1d9f31e..40017aa2fcbd 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -40,13 +40,14 @@ * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space, * fixed mappings and modules */ -#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT - 1)) * sizeof(struct page), PUD_SIZE) +#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE) #define VMALLOC_START (MODULES_END) #define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) #define VMEMMAP_START (VMALLOC_END + SZ_64K) -#define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) +#define vmemmap ((struct page *)VMEMMAP_START - \ + SECTION_ALIGN_DOWN(memstart_addr >> PAGE_SHIFT)) #define FIRST_USER_ADDRESS 0UL From 663cc539523fbf35732ba0af6a15bcb6d6370c91 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 18 Apr 2016 17:09:47 +0200 Subject: [PATCH 0513/3220] UPSTREAM: arm64: relocatable: deal with physically misaligned kernel images When booting a relocatable kernel image, there is no practical reason to refuse an image whose load address is not exactly TEXT_OFFSET bytes above a 2 MB aligned base address, as long as the physical and virtual misalignment with respect to the swapper block size are equal, and are both aligned to THREAD_SIZE. 
Since the virtual misalignment is under our control when we first enter the kernel proper, we can simply choose its value to be equal to the physical misalignment. So treat the misalignment of the physical load address as the initial KASLR offset, and fix up the remaining code to deal with that. Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Bug: 32122850 (cherry picked from commit 08cdac619c81b3fa8cd73aeed2330ffe0a0b73ca) Signed-off-by: Jeff Vander Stoep Change-Id: I658cb3467ba9a4f5b1f5a1cbb972fdc5a3562bf0 --- arch/arm64/kernel/head.S | 9 ++++++--- arch/arm64/kernel/kaslr.c | 6 +++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 9bfa58fea8ce..461d6cc258dd 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -210,8 +211,8 @@ section_table: ENTRY(stext) bl preserve_boot_args bl el2_setup // Drop to EL1, w20=cpu_boot_mode - mov x23, xzr // KASLR offset, defaults to 0 adrp x24, __PHYS_OFFSET + and x23, x24, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0 bl set_cpu_boot_mode_flag bl __create_page_tables // x25=TTBR0, x26=TTBR1 /* @@ -488,11 +489,13 @@ __mmap_switched: bl kasan_early_init #endif #ifdef CONFIG_RANDOMIZE_BASE - cbnz x23, 0f // already running randomized? + tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized? + b.ne 0f mov x0, x21 // pass FDT address in x0 + mov x1, x23 // pass modulo offset in x1 bl kaslr_early_init // parse FDT for KASLR options cbz x0, 0f // KASLR disabled? just proceed - mov x23, x0 // record KASLR offset + orr x23, x23, x0 // record KASLR offset ret x28 // we must enable KASLR, return // to __enable_mmu() 0: diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 582983920054..b05469173ba5 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -74,7 +74,7 @@ extern void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, * containing function pointers) to be reinitialized, and zero-initialized * .bss variables will be reset to 0. */ -u64 __init kaslr_early_init(u64 dt_phys) +u64 __init kaslr_early_init(u64 dt_phys, u64 modulo_offset) { void *fdt; u64 seed, offset, mask, module_range; @@ -132,8 +132,8 @@ u64 __init kaslr_early_init(u64 dt_phys) * boundary (for 4KB/16KB/64KB granule kernels, respectively). If this * happens, increase the KASLR offset by the size of the kernel image. */ - if ((((u64)_text + offset) >> SWAPPER_TABLE_SHIFT) != - (((u64)_end + offset) >> SWAPPER_TABLE_SHIFT)) + if ((((u64)_text + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT) != + (((u64)_end + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT)) offset = (offset + (u64)(_end - _text)) & mask; if (IS_ENABLED(CONFIG_KASAN)) From e78f134a78a0ae95b83ac0cac47ab0bb584ebaa7 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 6 Oct 2016 15:53:38 -0700 Subject: [PATCH 0514/3220] CHROMIUM: remove Android's cgroup generic permissions checks The implementation is utterly broken, resulting in all processes being allowed to move tasks between sets (as long as they have access to the "tasks" attribute), and upstream is heading towards checking only capability anyway, so let's get rid of this code.
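For reference, the default check that remains once the hook is gone is equivalent to the following sketch (simplified from cgroup_procs_write_permission() in the diff below; the helper name is ours):

#include <errno.h>

/*
 * The writer must be root, or share its euid with the target task's
 * uid or suid; there is no longer a per-subsystem escape hatch.
 */
static int may_move_task(unsigned int writer_euid,
			 unsigned int task_uid, unsigned int task_suid)
{
	if (writer_euid == 0)	/* GLOBAL_ROOT_UID */
		return 0;
	if (writer_euid == task_uid || writer_euid == task_suid)
		return 0;
	return -EACCES;
}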
BUG=b:31790445,chromium:647994 TEST=Boot android container, examine logcat Change-Id: I2f780a5992c34e52a8f2d0b3557fc9d490da2779 Signed-off-by: Dmitry Torokhov Reviewed-on: https://chromium-review.googlesource.com/394967 Reviewed-by: Ricky Zhou Reviewed-by: John Stultz --- Documentation/cgroups/cgroups.txt | 9 ----- include/linux/cgroup-defs.h | 1 - include/linux/cgroup.h | 14 -------- kernel/cgroup.c | 59 ++----------------------------- kernel/sched/core.c | 1 - mm/memcontrol.c | 10 ------ 6 files changed, 2 insertions(+), 92 deletions(-) diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 2d984e20783b..c6256ae9885b 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -578,15 +578,6 @@ is completely unused; @cgrp->parent is still valid. (Note - can also be called for a newly-created cgroup if an error occurs after this subsystem's create() method has been called for the new cgroup). -int allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -(cgroup_mutex held by caller) - -Called prior to moving a task into a cgroup; if the subsystem -returns an error, this will abort the attach operation. Used -to extend the permission checks - if all subsystems in a cgroup -return 0, the attach will be allowed to proceed, even if the -default permission check (root or same user) fails. - int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) (cgroup_mutex held by caller) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4a4eea01956c..06b77f9dd3f2 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -422,7 +422,6 @@ struct cgroup_subsys { void (*css_reset)(struct cgroup_subsys_state *css); void (*css_e_css_changed)(struct cgroup_subsys_state *css); - int (*allow_attach)(struct cgroup_taskset *tset); int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 70358b9f5a7a..cb91b44f5f78 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -528,16 +528,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } -/* - * Default Android check for whether the current process is allowed to move a - * task across cgroups, either because CAP_SYS_NICE is set or because the uid - * of the calling process is the same as the moved task or because we are - * running as root. - * Returns 0 if this is allowed, or -EACCES otherwise. 
- */ -int subsys_cgroup_allow_attach(struct cgroup_taskset *tset); - - #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -562,10 +552,6 @@ static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } -static inline int subsys_cgroup_allow_attach(void *tset) -{ - return -EINVAL; -} #endif /* !CONFIG_CGROUPS */ #endif /* _LINUX_CGROUP_H */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bd541053efc2..371ee5a827e0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2663,45 +2663,6 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, return ret; } -int subsys_cgroup_allow_attach(struct cgroup_taskset *tset) -{ - const struct cred *cred = current_cred(), *tcred; - struct task_struct *task; - struct cgroup_subsys_state *css; - - if (capable(CAP_SYS_NICE)) - return 0; - - cgroup_taskset_for_each(task, css, tset) { - tcred = __task_cred(task); - - if (current != task && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) - return -EACCES; - } - - return 0; -} - -static int cgroup_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -{ - struct cgroup_subsys_state *css; - int i; - int ret; - - for_each_css(css, i, cgrp) { - if (css->ss->allow_attach) { - ret = css->ss->allow_attach(tset); - if (ret) - return ret; - } else { - return -EACCES; - } - } - - return 0; -} - static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *dst_cgrp, struct kernfs_open_file *of) @@ -2716,24 +2677,8 @@ static int cgroup_procs_write_permission(struct task_struct *task, */ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) { - /* - * if the default permission check fails, give each - * cgroup a chance to extend the permission check - */ - struct cgroup_taskset tset = { - .src_csets = LIST_HEAD_INIT(tset.src_csets), - .dst_csets = LIST_HEAD_INIT(tset.dst_csets), - .csets = &tset.src_csets, - }; - struct css_set *cset; - cset = task_css_set(task); - list_add(&cset->mg_node, &tset.src_csets); - ret = cgroup_allow_attach(dst_cgrp, &tset); - list_del(&tset.src_csets); - if (ret) - ret = -EACCES; - } + !uid_eq(cred->euid, tcred->suid)) + ret = -EACCES; if (!ret && cgroup_on_dfl(dst_cgrp)) { struct super_block *sb = of->file->f_path.dentry->d_sb; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c90bfb620981..471094bfbcc6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8935,7 +8935,6 @@ struct cgroup_subsys cpu_cgrp_subsys = { .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, - .allow_attach = subsys_cgroup_allow_attach, .legacy_cftypes = cpu_files, .early_init = 1, }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5c714f33f494..ee6acd279953 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4879,11 +4879,6 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) return ret; } -static int mem_cgroup_allow_attach(struct cgroup_taskset *tset) -{ - return subsys_cgroup_allow_attach(tset); -} - static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { if (mc.to) @@ -5045,10 +5040,6 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -static int mem_cgroup_allow_attach(struct cgroup_taskset *tset) -{ - return 0; -} static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { } @@ -5232,7 +5223,6 @@ struct cgroup_subsys memory_cgrp_subsys = { .can_attach = 
mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, - .allow_attach = mem_cgroup_allow_attach, .bind = mem_cgroup_bind, .dfl_cftypes = memory_files, .legacy_cftypes = mem_cgroup_legacy_files, From 140cab831036741cac88255f282cd68b72140356 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 6 Oct 2016 16:14:16 -0700 Subject: [PATCH 0515/3220] CHROMIUM: cgroups: relax permissions on moving tasks between cgroups Android expects system_server to be able to move tasks between different cgroups/cpusets, but does not want to be running as root. Let's relax permission check so that processes can move other tasks if they have CAP_SYS_NICE in the affected task's user namespace. BUG=b:31790445,chromium:647994 TEST=Boot android container, examine logcat Change-Id: Ia919c66ab6ed6a6daf7c4cf67feb38b13b1ad09b Signed-off-by: Dmitry Torokhov Reviewed-on: https://chromium-review.googlesource.com/394927 Reviewed-by: Ricky Zhou --- kernel/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 371ee5a827e0..45c5e134d05b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2677,7 +2677,8 @@ static int cgroup_procs_write_permission(struct task_struct *task, */ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) + !uid_eq(cred->euid, tcred->suid) && + !ns_capable(tcred->user_ns, CAP_SYS_NICE)) ret = -EACCES; if (!ret && cgroup_on_dfl(dst_cgrp)) { From 7af6188c36382c1a11551cf8bea62bcb3dcaa585 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 11 Oct 2016 13:51:27 -0700 Subject: [PATCH 0516/3220] BACKPORT: lib: harden strncpy_from_user The strncpy_from_user() accessor is effectively a copy_from_user() specialised to copy strings, terminating early at a NUL byte if possible. In other respects it is identical, and can be used to copy an arbitrarily large buffer from userspace into the kernel. Conceptually, it exposes a similar attack surface. As with copy_from_user(), we check the destination range when the kernel is built with KASAN, but unlike copy_from_user() we do not check the destination buffer when using HARDENED_USERCOPY. As strncpy_from_user() calls get_user() in a loop, we must call check_object_size() explicitly. This patch adds this instrumentation to strncpy_from_user(), per the same rationale as with the regular copy_from_user(). In the absence of hardened usercopy this will have no impact as the instrumentation expands to an empty static inline function. 
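For context, a typical call site looks like the fragment below (inside some ioctl or syscall handler; 'name' and 'user_ptr' are illustrative). With HARDENED_USERCOPY, the added check_object_size() validates the destination against its containing slab object before the get_user() loop runs:

	char name[64];
	long len;

	len = strncpy_from_user(name, user_ptr, sizeof(name));
	if (len < 0)
		return len;		/* -EFAULT: faulted on the user buffer */
	if (len == sizeof(name))
		return -EINVAL;		/* source string did not fit */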
Link: http://lkml.kernel.org/r/1472221903-31181-1-git-send-email-mark.rutland@arm.com Signed-off-by: Mark Rutland Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Bug: 31374226 Change-Id: I898e4e9f19307e37a9be497cb1a0d7f1e3911661 (cherry picked from commit bf90e56e467ed5766722972d483e6711889ed1b0) Signed-off-by: Sami Tolvanen --- lib/strncpy_from_user.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 5a003a2ebd96..05efc1fa97f0 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -109,6 +110,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) unsigned long max = max_addr - src_addr; long retval; + check_object_size(dst, count, false); user_access_begin(); retval = do_strncpy_from_user(dst, src, count, max); user_access_end(); From 7bb5218b77d0aef456634625fff7909f88b4f705 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 18 Oct 2016 12:35:03 -0700 Subject: [PATCH 0517/3220] cgroup: Remove leftover instances of allow_attach MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix: kernel/sched/tune.c:718:2: error: unknown field ‘allow_attach’ specified in initializer kernel/cpuset.c:2087:2: error: unknown field 'allow_attach' specified in initializer Change-Id: Ie524350ffc6158f3182d90095cca502e58b6f197 Fixes: e78f134a78a0 ("CHROMIUM: remove Android's cgroup generic permissions checks") Signed-off-by: Guenter Roeck --- kernel/cpuset.c | 18 ------------------ kernel/sched/tune.c | 7 ------- 2 files changed, 25 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3b4f27981778..3d0f77112eb3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2061,30 +2061,12 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } -static int cpuset_allow_attach(struct cgroup_taskset *tset) -{ - const struct cred *cred = current_cred(), *tcred; - struct task_struct *task; - struct cgroup_subsys_state *css; - - cgroup_taskset_for_each(task, css, tset) { - tcred = __task_cred(task); - - if ((current != task) && !capable(CAP_SYS_ADMIN) && - cred->euid.val != tcred->uid.val && cred->euid.val != tcred->suid.val) - return -EACCES; - } - - return 0; -} - struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, - .allow_attach = cpuset_allow_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .bind = cpuset_bind, diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 505d7b35b0e1..68a24a044b0a 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -368,12 +368,6 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu) raw_spin_unlock_irqrestore(&bg->lock, irq_flags); } -int schedtune_allow_attach(struct cgroup_taskset *tset) -{ - /* We always allows tasks to be moved between existing CGroups */ - return 0; -} - int schedtune_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; @@ -715,7 +709,6 @@ schedtune_css_free(struct cgroup_subsys_state *css) struct cgroup_subsys schedtune_cgrp_subsys = { .css_alloc = schedtune_css_alloc, .css_free = schedtune_css_free, - .allow_attach = schedtune_allow_attach, .can_attach = schedtune_can_attach, .cancel_attach = schedtune_cancel_attach, .legacy_cftypes = files, From 
ffa907b7664062250d813d97fb9a9bdb5511897a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 17 Oct 2016 16:18:39 +0100 Subject: [PATCH 0518/3220] UPSTREAM: arm64: kaslr: keep modules close to the kernel when DYNAMIC_FTRACE=y The RANDOMIZE_MODULE_REGION_FULL Kconfig option allows KASLR to be configured in such a way that kernel modules and the core kernel are allocated completely independently, which implies that modules are likely to require branches via PLT entries to reach the core kernel. The dynamic ftrace code does not expect that, and assumes that it can patch module code to perform a relative branch to anywhere in the core kernel. This may result in errors such as branch_imm_common: offset out of range ------------[ cut here ]------------ WARNING: CPU: 3 PID: 196 at kernel/trace/ftrace.c:1995 ftrace_bug+0x220/0x2e8 Modules linked in: CPU: 3 PID: 196 Comm: systemd-udevd Not tainted 4.8.0-22-generic #24 Hardware name: AMD Seattle/Seattle, BIOS 10:34:40 Oct 6 2016 task: ffff8d1bef7dde80 task.stack: ffff8d1bef6b0000 PC is at ftrace_bug+0x220/0x2e8 LR is at ftrace_process_locs+0x330/0x430 So make RANDOMIZE_MODULE_REGION_FULL mutually exclusive with DYNAMIC_FTRACE at the Kconfig level. Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit 8fe88a4145cdeee486af60e61f5d5a14f804fa45) Signed-off-by: Jeff Vander Stoep Change-Id: Ifb2474dcbb7a3066fe5724ee53a2048d61e80ccc --- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 488fed2d5e5e..a4d3c3cd7b23 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -822,7 +822,7 @@ config RANDOMIZE_BASE config RANDOMIZE_MODULE_REGION_FULL bool "Randomize the module region independently from the core kernel" - depends on RANDOMIZE_BASE + depends on RANDOMIZE_BASE && !DYNAMIC_FTRACE default y help Randomizes the location of the module region without considering the From 4a8b645cef3caa87b85269078d9e1f02115107b6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 13 Oct 2016 17:42:09 +0100 Subject: [PATCH 0519/3220] UPSTREAM: arm64: kaslr: fix breakage with CONFIG_MODVERSIONS=y As it turns out, the KASLR code breaks CONFIG_MODVERSIONS, since the kcrctab has an absolute address field that is relocated at runtime when the kernel offset is randomized. This has been fixed already for PowerPC in the past, so simply wire up the existing code dealing with this issue. 
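The "existing code" in question is the hook kernel/module.c has carried for relocatable PowerPC kernels; in the v4.4-era source it looks roughly like this:

static unsigned long maybe_relocated(unsigned long crc,
				     const struct module *crc_owner)
{
#ifdef ARCH_RELOCATES_KCRCTAB
	if (crc_owner == NULL)	/* CRC lives in the (relocated) core kernel */
		return crc - (unsigned long)reloc_start;
#endif
	return crc;
}

All the architecture has to provide is ARCH_RELOCATES_KCRCTAB and a reloc_start equal to the runtime displacement, which the hunk below supplies as kimage_vaddr - KIMAGE_VADDR.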
Cc: Fixes: f80fb3a3d508 ("arm64: add support for kernel ASLR") Tested-by: Timur Tabi Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit 8fe88a4145cdeee486af60e61f5d5a14f804fa45) Signed-off-by: Jeff Vander Stoep Change-Id: Ia40bb68eb5ba7df14214243657948d469f1d5717 --- arch/arm64/include/asm/module.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index e12af6754634..06ff7fd9e81f 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -17,6 +17,7 @@ #define __ASM_MODULE_H #include +#include #define MODULE_ARCH_VERMAGIC "aarch64" @@ -32,6 +33,10 @@ u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, Elf64_Sym *sym); #ifdef CONFIG_RANDOMIZE_BASE +#ifdef CONFIG_MODVERSIONS +#define ARCH_RELOCATES_KCRCTAB +#define reloc_start (kimage_vaddr - KIMAGE_VADDR) +#endif extern u64 module_alloc_base; #else #define module_alloc_base ((u64)_etext - MODULES_VSIZE) From a3c8dc25c446485dcbf8a978f755eead9eac3671 Mon Sep 17 00:00:00 2001 From: Lianwei Wang Date: Thu, 9 Jun 2016 23:43:28 -0700 Subject: [PATCH 0520/3220] UPSTREAM: cpu/hotplug: Handle unbalanced hotplug enable/disable (cherry picked from commit 01b41159066531cc8d664362ff0cd89dd137bbfa) When cpu_hotplug_enable() is called unbalanced w/o a preceding cpu_hotplug_disable() the code emits a warning, but happily decrements the disabled counter. This causes the next operations to malfunction. Prevent the decrement and just emit a warning. Signed-off-by: Lianwei Wang Cc: peterz@infradead.org Cc: linux-pm@vger.kernel.org Cc: oleg@redhat.com Link: http://lkml.kernel.org/r/1465541008-12476-1-git-send-email-lianwei.wang@gmail.com Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 9ced7c751648..c8a1751be224 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -185,10 +185,17 @@ void cpu_hotplug_disable(void) } EXPORT_SYMBOL_GPL(cpu_hotplug_disable); +static void __cpu_hotplug_enable(void) +{ + if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n")) + return; + cpu_hotplug_disabled--; +} + void cpu_hotplug_enable(void) { cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_enable); @@ -631,7 +638,7 @@ void enable_nonboot_cpus(void) /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); if (cpumask_empty(frozen_cpus)) goto out; From 64f0245f312a849acc83b7c6cdcc8f7c1a14cca3 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 18 Oct 2016 16:20:23 -0700 Subject: [PATCH 0521/3220] [RFC]cgroup: Change from CAP_SYS_NICE to CAP_SYS_RESOURCE for cgroup migration permissions Try to better match what we're pushing upstream, use CAP_SYS_RESOURCE instead of CAP_SYS_NICE, which shouldn't affect Android as Zygote and system_server already use CAP_SYS_RESOURCE.
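From userspace, the operation being gated is simply a write of a tid into the destination group's tasks file; an illustrative helper (paths and names are examples, not Android code):

#include <stdio.h>

static int move_task(const char *cgroup_dir, int tid)
{
	char path[256];
	FILE *f;
	int ret;

	snprintf(path, sizeof(path), "%s/tasks", cgroup_dir);
	f = fopen(path, "w");
	if (f == NULL)
		return -1;
	ret = (fprintf(f, "%d\n", tid) < 0) ? -1 : 0;
	fclose(f);
	return ret;
}

With this patch the write succeeds for a non-root writer with a different uid as long as it holds CAP_SYS_RESOURCE in the target task's user namespace.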
Signed-off-by: John Stultz --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 45c5e134d05b..00af24ac0167 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2678,7 +2678,7 @@ static int cgroup_procs_write_permission(struct task_struct *task, if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && !uid_eq(cred->euid, tcred->suid) && - !ns_capable(tcred->user_ns, CAP_SYS_NICE)) + !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) ret = -EACCES; if (!ret && cgroup_on_dfl(dst_cgrp)) { From b6be9b7b199b05a13d72dc562110eebc725ed0ec Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Thu, 20 Oct 2016 15:45:01 -0400 Subject: [PATCH 0522/3220] disable aio support in recommended configuration The aio interface adds substantial attack surface for a feature that's not being exposed by Android at all. It's unlikely that anyone is using the kernel feature directly either. This feature is rarely used even on servers. The glibc POSIX aio calls really use thread pools. The lack of widespread usage also means this is relatively poorly audited/tested. The kernel's aio rarely provides performance benefits over using a thread pool and is quite incomplete in terms of system call coverage along with having edge cases where blocking can occur. Part of the performance issue is the fact that it only supports direct io, not buffered io. The existing API is considered fundamentally flawed and it's unlikely it will be expanded, but rather replaced: https://marc.info/?l=linux-aio&m=145255815216051&w=2 Since ext4 encryption means no direct io support, kernel aio isn't even going to work properly on Android devices using file-based encryption. Change-Id: Iccc7cab4437791240817e6275a23e1d3f4a47f2d Signed-off-by: Daniel Micay --- android/configs/android-recommended.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index 3465a848d74d..3fd0b13488a1 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -1,4 +1,5 @@ # KEEP ALPHABETICALLY SORTED +# CONFIG_AIO is not set # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_LEGACY_PTYS is not set From ce0c65986ceeaa05249c990e1744d4caeb734e60 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Wed, 13 Jul 2016 12:06:49 +0200 Subject: [PATCH 0523/3220] android: binder: split flat_binder_object. flat_binder_object is used for both handling binder objects and file descriptors, even though the two are mostly independent. Since we'll have more fixup objects in binder in the future, instead of extending flat_binder_object again, split out file descriptors to their own object while retaining backwards compatibility to existing user-space clients. All binder objects just share a header. 
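The shared header is consumed with the usual container_of() downcast, via the to_flat_binder_object()/to_binder_fd_object() helpers the patch introduces. A sketch of the dispatch pattern (fragment; buffer and offset come from the transaction being parsed):

	struct binder_object_header *hdr;

	hdr = (struct binder_object_header *)(buffer->data + offset);
	switch (hdr->type) {
	case BINDER_TYPE_BINDER:
	case BINDER_TYPE_WEAK_BINDER:
	case BINDER_TYPE_HANDLE:
	case BINDER_TYPE_WEAK_HANDLE: {
		struct flat_binder_object *fp = to_flat_binder_object(hdr);
		/* translate fp->binder / fp->handle for the target */
		break;
	}
	case BINDER_TYPE_FD: {
		struct binder_fd_object *fp = to_binder_fd_object(hdr);
		/* install a duplicate of fp->fd in the target process */
		break;
	}
	default:
		/* unknown object type: fail the transaction */
		break;
	}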
Change-Id: If3c55f27a2aa8f21815383e0e807be47895e4786 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 162 +++++++++++++++++++--------- include/uapi/linux/android/binder.h | 31 +++++- 2 files changed, 141 insertions(+), 52 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index b8ba5b7516b9..50cc56f3d0cc 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -146,6 +146,11 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, binder_stop_on_user_error = 2; \ } while (0) +#define to_flat_binder_object(hdr) \ + container_of(hdr, struct flat_binder_object, hdr) + +#define to_binder_fd_object(hdr) container_of(hdr, struct binder_fd_object, hdr) + enum binder_stat_types { BINDER_STAT_PROC, BINDER_STAT_THREAD, @@ -1241,6 +1246,47 @@ static void binder_send_failed_reply(struct binder_transaction *t, } } +/** + * binder_validate_object() - checks for a valid metadata object in a buffer. + * @buffer: binder_buffer that we're parsing. + * @offset: offset in the buffer at which to validate an object. + * + * Return: If there's a valid metadata object at @offset in @buffer, the + * size of that object. Otherwise, it returns zero. + */ +static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) +{ + /* Check if we can read a header first */ + struct binder_object_header *hdr; + size_t object_size = 0; + + if (offset > buffer->data_size - sizeof(*hdr) || + buffer->data_size < sizeof(*hdr) || + !IS_ALIGNED(offset, sizeof(u32))) + return 0; + + /* Ok, now see if we can read a complete object. */ + hdr = (struct binder_object_header *)(buffer->data + offset); + switch (hdr->type) { + case BINDER_TYPE_BINDER: + case BINDER_TYPE_WEAK_BINDER: + case BINDER_TYPE_HANDLE: + case BINDER_TYPE_WEAK_HANDLE: + object_size = sizeof(struct flat_binder_object); + break; + case BINDER_TYPE_FD: + object_size = sizeof(struct binder_fd_object); + break; + default: + return 0; + } + if (offset <= buffer->data_size - object_size && + buffer->data_size >= object_size) + return object_size; + else + return 0; +} + static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_buffer *buffer, binder_size_t *failed_at) @@ -1263,21 +1309,23 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, else off_end = (void *)offp + buffer->offsets_size; for (; offp < off_end; offp++) { - struct flat_binder_object *fp; + struct binder_object_header *hdr; + size_t object_size = binder_validate_object(buffer, *offp); - if (*offp > buffer->data_size - sizeof(*fp) || - buffer->data_size < sizeof(*fp) || - !IS_ALIGNED(*offp, sizeof(u32))) { - pr_err("transaction release %d bad offset %lld, size %zd\n", + if (object_size == 0) { + pr_err("transaction release %d bad object at offset %lld, size %zd\n", debug_id, (u64)*offp, buffer->data_size); continue; } - fp = (struct flat_binder_object *)(buffer->data + *offp); - switch (fp->type) { + hdr = (struct binder_object_header *)(buffer->data + *offp); + switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { - struct binder_node *node = binder_get_node(proc, fp->binder); + struct flat_binder_object *fp; + struct binder_node *node; + fp = to_flat_binder_object(hdr); + node = binder_get_node(proc, fp->binder); if (node == NULL) { pr_err("transaction release %d bad node %016llx\n", debug_id, (u64)fp->binder); @@ -1286,13 +1334,17 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, binder_debug(BINDER_DEBUG_TRANSACTION, " node 
%d u%016llx\n", node->debug_id, (u64)node->ptr); - binder_dec_node(node, fp->type == BINDER_TYPE_BINDER, 0); + binder_dec_node(node, hdr->type == BINDER_TYPE_BINDER, + 0); } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { - struct binder_ref *ref = binder_get_ref(proc, fp->handle, - fp->type == BINDER_TYPE_HANDLE); + struct flat_binder_object *fp; + struct binder_ref *ref; + fp = to_flat_binder_object(hdr); + ref = binder_get_ref(proc, fp->handle, + hdr->type == BINDER_TYPE_HANDLE); if (ref == NULL) { pr_err("transaction release %d bad handle %d\n", debug_id, fp->handle); @@ -1301,19 +1353,21 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d (node %d)\n", ref->debug_id, ref->desc, ref->node->debug_id); - binder_dec_ref(ref, fp->type == BINDER_TYPE_HANDLE); + binder_dec_ref(ref, hdr->type == BINDER_TYPE_HANDLE); } break; - case BINDER_TYPE_FD: + case BINDER_TYPE_FD: { + struct binder_fd_object *fp = to_binder_fd_object(hdr); + binder_debug(BINDER_DEBUG_TRANSACTION, - " fd %d\n", fp->handle); + " fd %d\n", fp->fd); if (failed_at) - task_close_fd(proc, fp->handle); - break; + task_close_fd(proc, fp->fd); + } break; default: pr_err("transaction release %d bad object type %x\n", - debug_id, fp->type); + debug_id, hdr->type); break; } } @@ -1530,28 +1584,29 @@ static void binder_transaction(struct binder_proc *proc, off_end = (void *)offp + tr->offsets_size; off_min = 0; for (; offp < off_end; offp++) { - struct flat_binder_object *fp; + struct binder_object_header *hdr; + size_t object_size = binder_validate_object(t->buffer, *offp); - if (*offp > t->buffer->data_size - sizeof(*fp) || - *offp < off_min || - t->buffer->data_size < sizeof(*fp) || - !IS_ALIGNED(*offp, sizeof(u32))) { - binder_user_error("%d:%d got transaction with invalid offset, %lld (min %lld, max %lld)\n", + if (object_size == 0 || *offp < off_min) { + binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n", proc->pid, thread->pid, (u64)*offp, (u64)off_min, - (u64)(t->buffer->data_size - - sizeof(*fp))); + (u64)t->buffer->data_size); return_error = BR_FAILED_REPLY; goto err_bad_offset; } - fp = (struct flat_binder_object *)(t->buffer->data + *offp); - off_min = *offp + sizeof(struct flat_binder_object); - switch (fp->type) { + + hdr = (struct binder_object_header *)(t->buffer->data + *offp); + off_min = *offp + object_size; + switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { + struct flat_binder_object *fp; + struct binder_node *node; struct binder_ref *ref; - struct binder_node *node = binder_get_node(proc, fp->binder); + fp = to_flat_binder_object(hdr); + node = binder_get_node(proc, fp->binder); if (node == NULL) { node = binder_new_node(proc, fp->binder, fp->cookie); if (node == NULL) { @@ -1579,14 +1634,14 @@ static void binder_transaction(struct binder_proc *proc, return_error = BR_FAILED_REPLY; goto err_binder_get_ref_for_node_failed; } - if (fp->type == BINDER_TYPE_BINDER) - fp->type = BINDER_TYPE_HANDLE; + if (hdr->type == BINDER_TYPE_BINDER) + hdr->type = BINDER_TYPE_HANDLE; else - fp->type = BINDER_TYPE_WEAK_HANDLE; + hdr->type = BINDER_TYPE_WEAK_HANDLE; fp->binder = 0; fp->handle = ref->desc; fp->cookie = 0; - binder_inc_ref(ref, fp->type == BINDER_TYPE_HANDLE, + binder_inc_ref(ref, hdr->type == BINDER_TYPE_HANDLE, &thread->todo); trace_binder_transaction_node_to_ref(t, node, ref); @@ -1597,9 +1652,12 @@ static void binder_transaction(struct 
binder_proc *proc, } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { - struct binder_ref *ref = binder_get_ref(proc, fp->handle, - fp->type == BINDER_TYPE_HANDLE); + struct flat_binder_object *fp; + struct binder_ref *ref; + fp = to_flat_binder_object(hdr); + ref = binder_get_ref(proc, fp->handle, + hdr->type == BINDER_TYPE_HANDLE); if (ref == NULL) { binder_user_error("%d:%d got transaction with invalid handle, %d\n", proc->pid, @@ -1613,13 +1671,15 @@ static void binder_transaction(struct binder_proc *proc, goto err_binder_get_ref_failed; } if (ref->node->proc == target_proc) { - if (fp->type == BINDER_TYPE_HANDLE) - fp->type = BINDER_TYPE_BINDER; + if (hdr->type == BINDER_TYPE_HANDLE) + hdr->type = BINDER_TYPE_BINDER; else - fp->type = BINDER_TYPE_WEAK_BINDER; + hdr->type = BINDER_TYPE_WEAK_BINDER; fp->binder = ref->node->ptr; fp->cookie = ref->node->cookie; - binder_inc_node(ref->node, fp->type == BINDER_TYPE_BINDER, 0, NULL); + binder_inc_node(ref->node, + hdr->type == BINDER_TYPE_BINDER, + 0, NULL); trace_binder_transaction_ref_to_node(t, ref); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> node %d u%016llx\n", @@ -1636,7 +1696,9 @@ static void binder_transaction(struct binder_proc *proc, fp->binder = 0; fp->handle = new_ref->desc; fp->cookie = 0; - binder_inc_ref(new_ref, fp->type == BINDER_TYPE_HANDLE, NULL); + binder_inc_ref(new_ref, + hdr->type == BINDER_TYPE_HANDLE, + NULL); trace_binder_transaction_ref_to_ref(t, ref, new_ref); binder_debug(BINDER_DEBUG_TRANSACTION, @@ -1649,25 +1711,26 @@ static void binder_transaction(struct binder_proc *proc, case BINDER_TYPE_FD: { int target_fd; struct file *file; + struct binder_fd_object *fp = to_binder_fd_object(hdr); if (reply) { if (!(in_reply_to->flags & TF_ACCEPT_FDS)) { binder_user_error("%d:%d got reply with fd, %d, but target does not allow fds\n", - proc->pid, thread->pid, fp->handle); + proc->pid, thread->pid, fp->fd); return_error = BR_FAILED_REPLY; goto err_fd_not_allowed; } } else if (!target_node->accept_fds) { binder_user_error("%d:%d got transaction with fd, %d, but target does not allow fds\n", - proc->pid, thread->pid, fp->handle); + proc->pid, thread->pid, fp->fd); return_error = BR_FAILED_REPLY; goto err_fd_not_allowed; } - file = fget(fp->handle); + file = fget(fp->fd); if (file == NULL) { binder_user_error("%d:%d got transaction with invalid fd, %d\n", - proc->pid, thread->pid, fp->handle); + proc->pid, thread->pid, fp->fd); return_error = BR_FAILED_REPLY; goto err_fget_failed; } @@ -1685,17 +1748,18 @@ static void binder_transaction(struct binder_proc *proc, goto err_get_unused_fd_failed; } task_fd_install(target_proc, target_fd, file); - trace_binder_transaction_fd(t, fp->handle, target_fd); + trace_binder_transaction_fd(t, fp->fd, target_fd); binder_debug(BINDER_DEBUG_TRANSACTION, - " fd %d -> %d\n", fp->handle, target_fd); + " fd %d -> %d\n", fp->fd, + target_fd); /* TODO: fput? 
*/ - fp->binder = 0; - fp->handle = target_fd; + fp->pad_binder = 0; + fp->fd = target_fd; } break; default: binder_user_error("%d:%d got transaction with invalid object type, %x\n", - proc->pid, thread->pid, fp->type); + proc->pid, thread->pid, hdr->type); return_error = BR_FAILED_REPLY; goto err_bad_object_type; } diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 41420e341e75..f67c2b1c0713 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -48,6 +48,14 @@ typedef __u64 binder_size_t; typedef __u64 binder_uintptr_t; #endif +/** + * struct binder_object_header - header shared by all binder metadata objects. + * @type: type of the object + */ +struct binder_object_header { + __u32 type; +}; + /* * This is the flattened representation of a Binder object for transfer * between processes. The 'offsets' supplied as part of a binder transaction @@ -56,9 +64,8 @@ typedef __u64 binder_uintptr_t; * between processes. */ struct flat_binder_object { - /* 8 bytes for large_flat_header. */ - __u32 type; - __u32 flags; + struct binder_object_header hdr; + __u32 flags; /* 8 bytes of data. */ union { @@ -70,6 +77,24 @@ struct flat_binder_object { binder_uintptr_t cookie; }; +/** + * struct binder_fd_object - describes a filedescriptor to be fixed up. + * @hdr: common header structure + * @pad_flags: padding to remain compatible with old userspace code + * @pad_binder: padding to remain compatible with old userspace code + * @fd: file descriptor + * @cookie: opaque data, used by user-space + */ +struct binder_fd_object { + struct binder_object_header hdr; + __u32 pad_flags; + union { + binder_uintptr_t pad_binder; + __u32 fd; + }; + + binder_uintptr_t cookie; +}; /* * On 64-bit platforms where user code may run in 32-bits the driver must * translate the buffer (and local binder) addresses appropriately. From 803df5635e4ac377586e3664cf798c3152670bc0 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 30 Sep 2016 15:51:48 +0200 Subject: [PATCH 0524/3220] android: binder: support multiple context managers. Move the context manager state into a separate struct context, and allow for each process to have its own context associated with it. 
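Note that with this patch alone every open still gets the shared global_context; the refactoring is a prerequisite for the multi-device patch later in this series. The intended end state, sketched from userspace (the second device name is hypothetical at this point in the series):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/android/binder.h>

	int fd_a = open("/dev/binder", O_RDWR | O_CLOEXEC);	/* context A */
	int fd_b = open("/dev/hwbinder", O_RDWR | O_CLOEXEC);	/* context B, hypothetical */

	/* Registers a manager only in fd_a's context; fd_b's context
	 * still has binder_context_mgr_node == NULL. */
	ioctl(fd_a, BINDER_SET_CONTEXT_MGR, 0);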
Change-Id: Ifa934370241a2d447dd519eac3fd0682c6d00ab4 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 60 ++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 50cc56f3d0cc..93780d1cc199 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -56,8 +56,6 @@ static HLIST_HEAD(binder_dead_nodes); static struct dentry *binder_debugfs_dir_entry_root; static struct dentry *binder_debugfs_dir_entry_proc; -static struct binder_node *binder_context_mgr_node; -static kuid_t binder_context_mgr_uid = INVALID_UID; static int binder_last_id; static struct workqueue_struct *binder_deferred_workqueue; @@ -216,6 +214,15 @@ static struct binder_transaction_log_entry *binder_transaction_log_add( return e; } +struct binder_context { + struct binder_node *binder_context_mgr_node; + kuid_t binder_context_mgr_uid; +}; + +static struct binder_context global_context = { + .binder_context_mgr_uid = INVALID_UID, +}; + struct binder_work { struct list_head entry; enum { @@ -331,6 +338,7 @@ struct binder_proc { int ready_threads; long default_priority; struct dentry *debugfs_entry; + struct binder_context *context; }; enum { @@ -935,8 +943,10 @@ static int binder_inc_node(struct binder_node *node, int strong, int internal, if (internal) { if (target_list == NULL && node->internal_strong_refs == 0 && - !(node == binder_context_mgr_node && - node->has_strong_ref)) { + !(node->proc && + node == node->proc->context-> + binder_context_mgr_node && + node->has_strong_ref)) { pr_err("invalid inc strong node for %d\n", node->debug_id); return -EINVAL; @@ -1037,6 +1047,7 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, struct rb_node **p = &proc->refs_by_node.rb_node; struct rb_node *parent = NULL; struct binder_ref *ref, *new_ref; + struct binder_context *context = proc->context; while (*p) { parent = *p; @@ -1059,7 +1070,7 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, rb_link_node(&new_ref->rb_node_node, parent, p); rb_insert_color(&new_ref->rb_node_node, &proc->refs_by_node); - new_ref->desc = (node == binder_context_mgr_node) ? 0 : 1; + new_ref->desc = (node == context->binder_context_mgr_node) ? 0 : 1; for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) { ref = rb_entry(n, struct binder_ref, rb_node_desc); if (ref->desc > new_ref->desc) @@ -1389,6 +1400,7 @@ static void binder_transaction(struct binder_proc *proc, struct binder_transaction *in_reply_to = NULL; struct binder_transaction_log_entry *e; uint32_t return_error; + struct binder_context *context = proc->context; e = binder_transaction_log_add(&binder_transaction_log); e->call_type = reply ? 
2 : !!(tr->flags & TF_ONE_WAY); @@ -1449,7 +1461,7 @@ static void binder_transaction(struct binder_proc *proc, } target_node = ref->node; } else { - target_node = binder_context_mgr_node; + target_node = context->binder_context_mgr_node; if (target_node == NULL) { return_error = BR_DEAD_REPLY; goto err_no_context_mgr_node; @@ -1840,6 +1852,7 @@ static int binder_thread_write(struct binder_proc *proc, binder_size_t *consumed) { uint32_t cmd; + struct binder_context *context = proc->context; void __user *buffer = (void __user *)(uintptr_t)binder_buffer; void __user *ptr = buffer + *consumed; void __user *end = buffer + size; @@ -1866,10 +1879,10 @@ static int binder_thread_write(struct binder_proc *proc, if (get_user(target, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); - if (target == 0 && binder_context_mgr_node && + if (target == 0 && context->binder_context_mgr_node && (cmd == BC_INCREFS || cmd == BC_ACQUIRE)) { ref = binder_get_ref_for_node(proc, - binder_context_mgr_node); + context->binder_context_mgr_node); if (ref->desc != target) { binder_user_error("%d:%d tried to acquire reference to desc 0, got %d instead\n", proc->pid, thread->pid, @@ -2775,9 +2788,11 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) { int ret = 0; struct binder_proc *proc = filp->private_data; + struct binder_context *context = proc->context; + kuid_t curr_euid = current_euid(); - if (binder_context_mgr_node != NULL) { + if (context->binder_context_mgr_node) { pr_err("BINDER_SET_CONTEXT_MGR already set\n"); ret = -EBUSY; goto out; @@ -2785,27 +2800,27 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) ret = security_binder_set_context_mgr(proc->tsk); if (ret < 0) goto out; - if (uid_valid(binder_context_mgr_uid)) { - if (!uid_eq(binder_context_mgr_uid, curr_euid)) { + if (uid_valid(context->binder_context_mgr_uid)) { + if (!uid_eq(context->binder_context_mgr_uid, curr_euid)) { pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n", from_kuid(&init_user_ns, curr_euid), from_kuid(&init_user_ns, - binder_context_mgr_uid)); + context->binder_context_mgr_uid)); ret = -EPERM; goto out; } } else { - binder_context_mgr_uid = curr_euid; + context->binder_context_mgr_uid = curr_euid; } - binder_context_mgr_node = binder_new_node(proc, 0, 0); - if (binder_context_mgr_node == NULL) { + context->binder_context_mgr_node = binder_new_node(proc, 0, 0); + if (!context->binder_context_mgr_node) { ret = -ENOMEM; goto out; } - binder_context_mgr_node->local_weak_refs++; - binder_context_mgr_node->local_strong_refs++; - binder_context_mgr_node->has_strong_ref = 1; - binder_context_mgr_node->has_weak_ref = 1; + context->binder_context_mgr_node->local_weak_refs++; + context->binder_context_mgr_node->local_strong_refs++; + context->binder_context_mgr_node->has_strong_ref = 1; + context->binder_context_mgr_node->has_weak_ref = 1; out: return ret; } @@ -3035,6 +3050,7 @@ static int binder_open(struct inode *nodp, struct file *filp) return -ENOMEM; get_task_struct(current); proc->tsk = current; + proc->context = &global_context; INIT_LIST_HEAD(&proc->todo); init_waitqueue_head(&proc->wait); proc->default_priority = task_nice(current); @@ -3147,6 +3163,7 @@ static int binder_node_release(struct binder_node *node, int refs) static void binder_deferred_release(struct binder_proc *proc) { struct binder_transaction *t; + struct binder_context *context = proc->context; struct rb_node *n; int threads, nodes, incoming_refs, outgoing_refs, buffers, active_transactions, page_count; @@ -3156,11 +3173,12 @@ static 
void binder_deferred_release(struct binder_proc *proc) hlist_del(&proc->proc_node); - if (binder_context_mgr_node && binder_context_mgr_node->proc == proc) { + if (context->binder_context_mgr_node && + context->binder_context_mgr_node->proc == proc) { binder_debug(BINDER_DEBUG_DEAD_BINDER, "%s: %d context_mgr_node gone\n", __func__, proc->pid); - binder_context_mgr_node = NULL; + context->binder_context_mgr_node = NULL; } threads = 0; From 8b980bee79719478fddc7ff595386b0c32661a33 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Mon, 17 Oct 2016 15:17:31 +0200 Subject: [PATCH 0525/3220] android: binder: deal with contexts in debugfs. Properly print the context in debugfs entries. Change-Id: If10c2129536d9f39bae542afd7318ca79af60e3a Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 93780d1cc199..635be4d9b8f3 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -190,6 +190,7 @@ struct binder_transaction_log_entry { int to_node; int data_size; int offsets_size; + const char *context_name; }; struct binder_transaction_log { int next; @@ -217,10 +218,12 @@ static struct binder_transaction_log_entry *binder_transaction_log_add( struct binder_context { struct binder_node *binder_context_mgr_node; kuid_t binder_context_mgr_uid; + const char *name; }; static struct binder_context global_context = { .binder_context_mgr_uid = INVALID_UID, + .name = "binder", }; struct binder_work { @@ -1409,6 +1412,7 @@ static void binder_transaction(struct binder_proc *proc, e->target_handle = tr->target.handle; e->data_size = tr->data_size; e->offsets_size = tr->offsets_size; + e->context_name = proc->context->name; if (reply) { in_reply_to = thread->transaction_stack; @@ -3069,8 +3073,17 @@ static int binder_open(struct inode *nodp, struct file *filp) char strbuf[11]; snprintf(strbuf, sizeof(strbuf), "%u", proc->pid); + /* + * proc debug entries are shared between contexts, so + * this will fail if the process tries to open the driver + * again with a different context. The printing code will + * anyway print all contexts that a given PID has, so this + * is not a problem.
+ */ proc->debugfs_entry = debugfs_create_file(strbuf, S_IRUGO, - binder_debugfs_dir_entry_proc, proc, &binder_proc_fops); + binder_debugfs_dir_entry_proc, + (void *)(unsigned long)proc->pid, + &binder_proc_fops); } return 0; @@ -3465,6 +3478,7 @@ static void print_binder_proc(struct seq_file *m, size_t header_pos; seq_printf(m, "proc %d\n", proc->pid); + seq_printf(m, "context %s\n", proc->context->name); header_pos = m->count; for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) @@ -3589,6 +3603,7 @@ static void print_binder_proc_stats(struct seq_file *m, int count, strong, weak; seq_printf(m, "proc %d\n", proc->pid); + seq_printf(m, "context %s\n", proc->context->name); count = 0; for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) count++; @@ -3696,23 +3711,18 @@ static int binder_transactions_show(struct seq_file *m, void *unused) static int binder_proc_show(struct seq_file *m, void *unused) { struct binder_proc *itr; - struct binder_proc *proc = m->private; + int pid = (unsigned long)m->private; int do_lock = !binder_debug_no_lock; - bool valid_proc = false; if (do_lock) binder_lock(__func__); hlist_for_each_entry(itr, &binder_procs, proc_node) { - if (itr == proc) { - valid_proc = true; - break; + if (itr->pid == pid) { + seq_puts(m, "binder proc state:\n"); + print_binder_proc(m, itr, 1); } } - if (valid_proc) { - seq_puts(m, "binder proc state:\n"); - print_binder_proc(m, proc, 1); - } if (do_lock) binder_unlock(__func__); return 0; @@ -3722,11 +3732,11 @@ static void print_binder_transaction_log_entry(struct seq_file *m, struct binder_transaction_log_entry *e) { seq_printf(m, - "%d: %s from %d:%d to %d:%d node %d handle %d size %d:%d\n", + "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d\n", e->debug_id, (e->call_type == 2) ? "reply" : ((e->call_type == 1) ? "async" : "call "), e->from_proc, - e->from_thread, e->to_proc, e->to_thread, e->to_node, - e->target_handle, e->data_size, e->offsets_size); + e->from_thread, e->to_proc, e->to_thread, e->context_name, + e->to_node, e->target_handle, e->data_size, e->offsets_size); } static int binder_transaction_log_show(struct seq_file *m, void *unused) From 04e3812e712fce296da074fbfba6519a77678dfb Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 30 Sep 2016 16:08:09 +0200 Subject: [PATCH 0526/3220] android: binder: support multiple /dev instances. Add a new module parameter 'devices', that can be used to specify the names of the binder device nodes we want to populate in /dev. Each device node has its own context manager, and is therefore logically separated from all the other device nodes. The config option CONFIG_ANDROID_BINDER_DEVICES can be used to set the default value of the parameter. This approach was favored over using IPC namespaces, mostly because we require a single process to be a part of multiple binder contexts, which seemed harder to achieve with namespaces. Change-Id: I3df72b2a19b5ad5a0360e6322482db7b00a12b24 Signed-off-by: Martijn Coenen --- drivers/android/Kconfig | 12 ++++++ drivers/android/binder.c | 85 ++++++++++++++++++++++++++++++++++------ 2 files changed, 86 insertions(+), 11 deletions(-) diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index bdfc6c6f4f5a..a82fc022d34b 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -19,6 +19,18 @@ config ANDROID_BINDER_IPC Android process, using Binder to identify, invoke and pass arguments between said processes. 
+config ANDROID_BINDER_DEVICES + string "Android Binder devices" + depends on ANDROID_BINDER_IPC + default "binder" + ---help--- + Default value for the binder.devices parameter. + + The binder.devices parameter is a comma-separated list of strings + that specifies the names of the binder device nodes that will be + created. Each binder device has its own context manager, and is + therefore logically separated from the other devices. + config ANDROID_BINDER_IPC_32BIT bool depends on !64BIT && ANDROID_BINDER_IPC diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 635be4d9b8f3..29700728bbd1 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -50,6 +50,7 @@ static DEFINE_MUTEX(binder_main_lock); static DEFINE_MUTEX(binder_deferred_lock); static DEFINE_MUTEX(binder_mmap_lock); +static HLIST_HEAD(binder_devices); static HLIST_HEAD(binder_procs); static HLIST_HEAD(binder_deferred_list); static HLIST_HEAD(binder_dead_nodes); @@ -114,6 +115,9 @@ module_param_named(debug_mask, binder_debug_mask, uint, S_IWUSR | S_IRUGO); static bool binder_debug_no_lock; module_param_named(proc_no_lock, binder_debug_no_lock, bool, S_IWUSR | S_IRUGO); +static char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; +module_param_named(devices, binder_devices_param, charp, S_IRUGO); + static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait); static int binder_stop_on_user_error; @@ -221,9 +225,10 @@ struct binder_context { const char *name; }; -static struct binder_context global_context = { - .binder_context_mgr_uid = INVALID_UID, - .name = "binder", +struct binder_device { + struct hlist_node hlist; + struct miscdevice miscdev; + struct binder_context context; }; struct binder_work { @@ -3045,6 +3050,7 @@ err_bad_arg: static int binder_open(struct inode *nodp, struct file *filp) { struct binder_proc *proc; + struct binder_device *binder_dev; binder_debug(BINDER_DEBUG_OPEN_CLOSE, "binder_open: %d:%d\n", current->group_leader->pid, current->pid); @@ -3054,10 +3060,12 @@ static int binder_open(struct inode *nodp, struct file *filp) return -ENOMEM; get_task_struct(current); proc->tsk = current; - proc->context = &global_context; INIT_LIST_HEAD(&proc->todo); init_waitqueue_head(&proc->wait); proc->default_priority = task_nice(current); + binder_dev = container_of(filp->private_data, struct binder_device, + miscdev); + proc->context = &binder_dev->context; binder_lock(__func__); @@ -3764,20 +3772,44 @@ static const struct file_operations binder_fops = { .release = binder_release, }; -static struct miscdevice binder_miscdev = { - .minor = MISC_DYNAMIC_MINOR, - .name = "binder", - .fops = &binder_fops -}; - BINDER_DEBUG_ENTRY(state); BINDER_DEBUG_ENTRY(stats); BINDER_DEBUG_ENTRY(transactions); BINDER_DEBUG_ENTRY(transaction_log); +static int __init init_binder_device(const char *name) +{ + int ret; + struct binder_device *binder_device; + + binder_device = kzalloc(sizeof(*binder_device), GFP_KERNEL); + if (!binder_device) + return -ENOMEM; + + binder_device->miscdev.fops = &binder_fops; + binder_device->miscdev.minor = MISC_DYNAMIC_MINOR; + binder_device->miscdev.name = name; + + binder_device->context.binder_context_mgr_uid = INVALID_UID; + binder_device->context.name = name; + + ret = misc_register(&binder_device->miscdev); + if (ret < 0) { + kfree(binder_device); + return ret; + } + + hlist_add_head(&binder_device->hlist, &binder_devices); + + return ret; +} + static int __init binder_init(void) { int ret; + char *device_name, *device_names; + struct binder_device *device; + 
struct hlist_node *tmp; binder_deferred_workqueue = create_singlethread_workqueue("binder"); if (!binder_deferred_workqueue) return -ENOMEM; @@ -3787,7 +3819,7 @@ static int __init binder_init(void) if (binder_debugfs_dir_entry_root) binder_debugfs_dir_entry_proc = debugfs_create_dir("proc", binder_debugfs_dir_entry_root); - ret = misc_register(&binder_miscdev); + if (binder_debugfs_dir_entry_root) { debugfs_create_file("state", S_IRUGO, @@ -3815,6 +3847,37 @@ static int __init binder_init(void) &binder_transaction_log_failed, &binder_transaction_log_fops); } + + /* + * Copy the module_parameter string, because we don't want to + * tokenize it in-place. + */ + device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL); + if (!device_names) { + ret = -ENOMEM; + goto err_alloc_device_names_failed; + } + strcpy(device_names, binder_devices_param); + + while ((device_name = strsep(&device_names, ","))) { + ret = init_binder_device(device_name); + if (ret) + goto err_init_binder_device_failed; + } + + return ret; + +err_init_binder_device_failed: + hlist_for_each_entry_safe(device, tmp, &binder_devices, hlist) { + misc_deregister(&device->miscdev); + hlist_del(&device->hlist); + kfree(device); + } +err_alloc_device_names_failed: + debugfs_remove_recursive(binder_debugfs_dir_entry_root); + + destroy_workqueue(binder_deferred_workqueue); + return ret; } From bfd49fea44974857219e7387a11534c12d2440fc Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Thu, 29 Sep 2016 15:38:14 +0200 Subject: [PATCH 0527/3220] android: binder: refactor binder_transaction() Move handling of fixups for binder objects, handles and file descriptors into separate functions. Change-Id: If6849f1caee3834aa87d0ab08950bb1e21ec6e38 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 309 ++++++++++++++++++++++----------------- 1 file changed, 172 insertions(+), 137 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 29700728bbd1..ef42b10c41ad 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1392,10 +1392,172 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, } } +static int binder_translate_binder(struct flat_binder_object *fp, + struct binder_transaction *t, + struct binder_thread *thread) +{ + struct binder_node *node; + struct binder_ref *ref; + struct binder_proc *proc = thread->proc; + struct binder_proc *target_proc = t->to_proc; + + node = binder_get_node(proc, fp->binder); + if (!node) { + node = binder_new_node(proc, fp->binder, fp->cookie); + if (!node) + return -ENOMEM; + + node->min_priority = fp->flags & FLAT_BINDER_FLAG_PRIORITY_MASK; + node->accept_fds = !!(fp->flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); + } + if (fp->cookie != node->cookie) { + binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n", + proc->pid, thread->pid, (u64)fp->binder, + node->debug_id, (u64)fp->cookie, + (u64)node->cookie); + return -EINVAL; + } + if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) + return -EPERM; + + ref = binder_get_ref_for_node(target_proc, node); + if (!ref) + return -EINVAL; + + if (fp->hdr.type == BINDER_TYPE_BINDER) + fp->hdr.type = BINDER_TYPE_HANDLE; + else + fp->hdr.type = BINDER_TYPE_WEAK_HANDLE; + fp->binder = 0; + fp->handle = ref->desc; + fp->cookie = 0; + binder_inc_ref(ref, fp->hdr.type == BINDER_TYPE_HANDLE, &thread->todo); + + trace_binder_transaction_node_to_ref(t, node, ref); + binder_debug(BINDER_DEBUG_TRANSACTION, + " node %d u%016llx -> ref %d desc %d\n", + node->debug_id,
(u64)node->ptr, + ref->debug_id, ref->desc); + + return 0; +} + +static int binder_translate_handle(struct flat_binder_object *fp, + struct binder_transaction *t, + struct binder_thread *thread) +{ + struct binder_ref *ref; + struct binder_proc *proc = thread->proc; + struct binder_proc *target_proc = t->to_proc; + + ref = binder_get_ref(proc, fp->handle, + fp->hdr.type == BINDER_TYPE_HANDLE); + if (!ref) { + binder_user_error("%d:%d got transaction with invalid handle, %d\n", + proc->pid, thread->pid, fp->handle); + return -EINVAL; + } + if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) + return -EPERM; + + if (ref->node->proc == target_proc) { + if (fp->hdr.type == BINDER_TYPE_HANDLE) + fp->hdr.type = BINDER_TYPE_BINDER; + else + fp->hdr.type = BINDER_TYPE_WEAK_BINDER; + fp->binder = ref->node->ptr; + fp->cookie = ref->node->cookie; + binder_inc_node(ref->node, fp->hdr.type == BINDER_TYPE_BINDER, + 0, NULL); + trace_binder_transaction_ref_to_node(t, ref); + binder_debug(BINDER_DEBUG_TRANSACTION, + " ref %d desc %d -> node %d u%016llx\n", + ref->debug_id, ref->desc, ref->node->debug_id, + (u64)ref->node->ptr); + } else { + struct binder_ref *new_ref; + + new_ref = binder_get_ref_for_node(target_proc, ref->node); + if (!new_ref) + return -EINVAL; + + fp->binder = 0; + fp->handle = new_ref->desc; + fp->cookie = 0; + binder_inc_ref(new_ref, fp->hdr.type == BINDER_TYPE_HANDLE, + NULL); + trace_binder_transaction_ref_to_ref(t, ref, new_ref); + binder_debug(BINDER_DEBUG_TRANSACTION, + " ref %d desc %d -> ref %d desc %d (node %d)\n", + ref->debug_id, ref->desc, new_ref->debug_id, + new_ref->desc, ref->node->debug_id); + } + return 0; +} + +static int binder_translate_fd(int fd, + struct binder_transaction *t, + struct binder_thread *thread, + struct binder_transaction *in_reply_to) +{ + struct binder_proc *proc = thread->proc; + struct binder_proc *target_proc = t->to_proc; + int target_fd; + struct file *file; + int ret; + bool target_allows_fd; + + if (in_reply_to) + target_allows_fd = !!(in_reply_to->flags & TF_ACCEPT_FDS); + else + target_allows_fd = t->buffer->target_node->accept_fds; + if (!target_allows_fd) { + binder_user_error("%d:%d got %s with fd, %d, but target does not allow fds\n", + proc->pid, thread->pid, + in_reply_to ? 
"reply" : "transaction", + fd); + ret = -EPERM; + goto err_fd_not_accepted; + } + + file = fget(fd); + if (!file) { + binder_user_error("%d:%d got transaction with invalid fd, %d\n", + proc->pid, thread->pid, fd); + ret = -EBADF; + goto err_fget; + } + ret = security_binder_transfer_file(proc->tsk, target_proc->tsk, file); + if (ret < 0) { + ret = -EPERM; + goto err_security; + } + + target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC); + if (target_fd < 0) { + ret = -ENOMEM; + goto err_get_unused_fd; + } + task_fd_install(target_proc, target_fd, file); + trace_binder_transaction_fd(t, fd, target_fd); + binder_debug(BINDER_DEBUG_TRANSACTION, " fd %d -> %d\n", + fd, target_fd); + + return target_fd; + +err_get_unused_fd: +err_security: + fput(file); +err_fget: +err_fd_not_accepted: + return ret; +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply) { + int ret; struct binder_transaction *t; struct binder_work *tcomplete; binder_size_t *offp, *off_end; @@ -1623,157 +1785,35 @@ static void binder_transaction(struct binder_proc *proc, case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { struct flat_binder_object *fp; - struct binder_node *node; - struct binder_ref *ref; fp = to_flat_binder_object(hdr); - node = binder_get_node(proc, fp->binder); - if (node == NULL) { - node = binder_new_node(proc, fp->binder, fp->cookie); - if (node == NULL) { - return_error = BR_FAILED_REPLY; - goto err_binder_new_node_failed; - } - node->min_priority = fp->flags & FLAT_BINDER_FLAG_PRIORITY_MASK; - node->accept_fds = !!(fp->flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); - } - if (fp->cookie != node->cookie) { - binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n", - proc->pid, thread->pid, - (u64)fp->binder, node->debug_id, - (u64)fp->cookie, (u64)node->cookie); + ret = binder_translate_binder(fp, t, thread); + if (ret < 0) { return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_for_node_failed; + goto err_translate_failed; } - if (security_binder_transfer_binder(proc->tsk, - target_proc->tsk)) { - return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_for_node_failed; - } - ref = binder_get_ref_for_node(target_proc, node); - if (ref == NULL) { - return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_for_node_failed; - } - if (hdr->type == BINDER_TYPE_BINDER) - hdr->type = BINDER_TYPE_HANDLE; - else - hdr->type = BINDER_TYPE_WEAK_HANDLE; - fp->binder = 0; - fp->handle = ref->desc; - fp->cookie = 0; - binder_inc_ref(ref, hdr->type == BINDER_TYPE_HANDLE, - &thread->todo); - - trace_binder_transaction_node_to_ref(t, node, ref); - binder_debug(BINDER_DEBUG_TRANSACTION, - " node %d u%016llx -> ref %d desc %d\n", - node->debug_id, (u64)node->ptr, - ref->debug_id, ref->desc); } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { struct flat_binder_object *fp; - struct binder_ref *ref; fp = to_flat_binder_object(hdr); - ref = binder_get_ref(proc, fp->handle, - hdr->type == BINDER_TYPE_HANDLE); - if (ref == NULL) { - binder_user_error("%d:%d got transaction with invalid handle, %d\n", - proc->pid, - thread->pid, fp->handle); + ret = binder_translate_handle(fp, t, thread); + if (ret < 0) { return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_failed; - } - if (security_binder_transfer_binder(proc->tsk, - target_proc->tsk)) { - return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_failed; - } - if (ref->node->proc == target_proc) { - if (hdr->type == 
BINDER_TYPE_HANDLE) - hdr->type = BINDER_TYPE_BINDER; - else - hdr->type = BINDER_TYPE_WEAK_BINDER; - fp->binder = ref->node->ptr; - fp->cookie = ref->node->cookie; - binder_inc_node(ref->node, - hdr->type == BINDER_TYPE_BINDER, - 0, NULL); - trace_binder_transaction_ref_to_node(t, ref); - binder_debug(BINDER_DEBUG_TRANSACTION, - " ref %d desc %d -> node %d u%016llx\n", - ref->debug_id, ref->desc, ref->node->debug_id, - (u64)ref->node->ptr); - } else { - struct binder_ref *new_ref; - - new_ref = binder_get_ref_for_node(target_proc, ref->node); - if (new_ref == NULL) { - return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_for_node_failed; - } - fp->binder = 0; - fp->handle = new_ref->desc; - fp->cookie = 0; - binder_inc_ref(new_ref, - hdr->type == BINDER_TYPE_HANDLE, - NULL); - trace_binder_transaction_ref_to_ref(t, ref, - new_ref); - binder_debug(BINDER_DEBUG_TRANSACTION, - " ref %d desc %d -> ref %d desc %d (node %d)\n", - ref->debug_id, ref->desc, new_ref->debug_id, - new_ref->desc, ref->node->debug_id); + goto err_translate_failed; } } break; case BINDER_TYPE_FD: { - int target_fd; - struct file *file; struct binder_fd_object *fp = to_binder_fd_object(hdr); + int target_fd = binder_translate_fd(fp->fd, t, thread, + in_reply_to); - if (reply) { - if (!(in_reply_to->flags & TF_ACCEPT_FDS)) { - binder_user_error("%d:%d got reply with fd, %d, but target does not allow fds\n", - proc->pid, thread->pid, fp->fd); - return_error = BR_FAILED_REPLY; - goto err_fd_not_allowed; - } - } else if (!target_node->accept_fds) { - binder_user_error("%d:%d got transaction with fd, %d, but target does not allow fds\n", - proc->pid, thread->pid, fp->fd); - return_error = BR_FAILED_REPLY; - goto err_fd_not_allowed; - } - - file = fget(fp->fd); - if (file == NULL) { - binder_user_error("%d:%d got transaction with invalid fd, %d\n", - proc->pid, thread->pid, fp->fd); - return_error = BR_FAILED_REPLY; - goto err_fget_failed; - } - if (security_binder_transfer_file(proc->tsk, - target_proc->tsk, - file) < 0) { - fput(file); - return_error = BR_FAILED_REPLY; - goto err_get_unused_fd_failed; - } - target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC); if (target_fd < 0) { - fput(file); return_error = BR_FAILED_REPLY; - goto err_get_unused_fd_failed; + goto err_translate_failed; } - task_fd_install(target_proc, target_fd, file); - trace_binder_transaction_fd(t, fp->fd, target_fd); - binder_debug(BINDER_DEBUG_TRANSACTION, - " fd %d -> %d\n", fp->fd, - target_fd); - /* TODO: fput? */ fp->pad_binder = 0; fp->fd = target_fd; } break; @@ -1810,12 +1850,7 @@ static void binder_transaction(struct binder_proc *proc, wake_up_interruptible(target_wait); return; -err_get_unused_fd_failed: -err_fget_failed: -err_fd_not_allowed: -err_binder_get_ref_for_node_failed: -err_binder_get_ref_failed: -err_binder_new_node_failed: +err_translate_failed: err_bad_object_type: err_bad_offset: err_copy_data_failed: From 843a25788dba57df35820bcc91796fb30b56335c Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 30 Sep 2016 14:05:40 +0200 Subject: [PATCH 0528/3220] android: binder: add extra size to allocator. The binder_buffer allocator currently only allocates space for the data and offsets buffers of a Parcel. This change allows for requesting an additional chunk of data in the buffer, which can for example be used to hold additional meta-data about the transaction (eg a security context). 
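For illustration, the resulting layout of a binder_buffer's data area can be pictured as follows (a sketch derived from the sizing code in this patch; the region labels are descriptive, not kernel identifiers):

	| data (data_size) | offsets (offsets_size) | extra (extra_buffers_size) |

Each region is rounded up to sizeof(void *) alignment, and each partial sum is checked for wrap-around before the allocation is attempted.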
Change-Id: I58ab9c383a2e1a3057aae6adaa596ce867f1b157 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 41 +++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index ef42b10c41ad..83a6c1bbbad5 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -303,6 +303,7 @@ struct binder_buffer { struct binder_node *target_node; size_t data_size; size_t offsets_size; + size_t extra_buffers_size; uint8_t data[0]; }; @@ -670,7 +671,9 @@ err_no_vma: static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, size_t data_size, - size_t offsets_size, int is_async) + size_t offsets_size, + size_t extra_buffers_size, + int is_async) { struct rb_node *n = proc->free_buffers.rb_node; struct binder_buffer *buffer; @@ -678,7 +681,7 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, struct rb_node *best_fit = NULL; void *has_page_addr; void *end_page_addr; - size_t size; + size_t size, data_offsets_size; if (proc->vma == NULL) { pr_err("%d: binder_alloc_buf, no vma\n", @@ -686,15 +689,20 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, return NULL; } - size = ALIGN(data_size, sizeof(void *)) + + data_offsets_size = ALIGN(data_size, sizeof(void *)) + ALIGN(offsets_size, sizeof(void *)); - if (size < data_size || size < offsets_size) { + if (data_offsets_size < data_size || data_offsets_size < offsets_size) { binder_user_error("%d: got transaction with invalid size %zd-%zd\n", proc->pid, data_size, offsets_size); return NULL; } - + size = data_offsets_size + ALIGN(extra_buffers_size, sizeof(void *)); + if (size < data_offsets_size || size < extra_buffers_size) { + binder_user_error("%d: got transaction with invalid extra_buffers_size %zd\n", + proc->pid, extra_buffers_size); + return NULL; + } if (is_async && proc->free_async_space < size + sizeof(struct binder_buffer)) { binder_debug(BINDER_DEBUG_BUFFER_ALLOC, @@ -763,6 +771,7 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, proc->pid, size, buffer); buffer->data_size = data_size; buffer->offsets_size = offsets_size; + buffer->extra_buffers_size = extra_buffers_size; buffer->async_transaction = is_async; if (is_async) { proc->free_async_space -= size + sizeof(struct binder_buffer); @@ -837,7 +846,8 @@ static void binder_free_buf(struct binder_proc *proc, buffer_size = binder_buffer_size(proc, buffer); size = ALIGN(buffer->data_size, sizeof(void *)) + - ALIGN(buffer->offsets_size, sizeof(void *)); + ALIGN(buffer->offsets_size, sizeof(void *)) + + ALIGN(buffer->extra_buffers_size, sizeof(void *)); binder_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: binder_free_buf %p size %zd buffer_size %zd\n", @@ -1555,7 +1565,8 @@ err_fd_not_accepted: static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, - struct binder_transaction_data *tr, int reply) + struct binder_transaction_data *tr, int reply, + binder_size_t extra_buffers_size) { int ret; struct binder_transaction *t; @@ -1699,20 +1710,22 @@ static void binder_transaction(struct binder_proc *proc, if (reply) binder_debug(BINDER_DEBUG_TRANSACTION, - "%d:%d BC_REPLY %d -> %d:%d, data %016llx-%016llx size %lld-%lld\n", + "%d:%d BC_REPLY %d -> %d:%d, data %016llx-%016llx size %lld-%lld-%lld\n", proc->pid, thread->pid, t->debug_id, target_proc->pid, target_thread->pid, (u64)tr->data.ptr.buffer, (u64)tr->data.ptr.offsets, - (u64)tr->data_size, (u64)tr->offsets_size); + 
(u64)tr->data_size, (u64)tr->offsets_size, + (u64)extra_buffers_size); else binder_debug(BINDER_DEBUG_TRANSACTION, - "%d:%d BC_TRANSACTION %d -> %d - node %d, data %016llx-%016llx size %lld-%lld\n", + "%d:%d BC_TRANSACTION %d -> %d - node %d, data %016llx-%016llx size %lld-%lld-%lld\n", proc->pid, thread->pid, t->debug_id, target_proc->pid, target_node->debug_id, (u64)tr->data.ptr.buffer, (u64)tr->data.ptr.offsets, - (u64)tr->data_size, (u64)tr->offsets_size); + (u64)tr->data_size, (u64)tr->offsets_size, + (u64)extra_buffers_size); if (!reply && !(tr->flags & TF_ONE_WAY)) t->from = thread; @@ -1728,7 +1741,8 @@ static void binder_transaction(struct binder_proc *proc, trace_binder_transaction(reply, t, target_node); t->buffer = binder_alloc_buf(target_proc, tr->data_size, - tr->offsets_size, !reply && (t->flags & TF_ONE_WAY)); + tr->offsets_size, extra_buffers_size, + !reply && (t->flags & TF_ONE_WAY)); if (t->buffer == NULL) { return_error = BR_FAILED_REPLY; goto err_binder_alloc_buf_failed; @@ -2078,7 +2092,8 @@ static int binder_thread_write(struct binder_proc *proc, if (copy_from_user(&tr, ptr, sizeof(tr))) return -EFAULT; ptr += sizeof(tr); - binder_transaction(proc, thread, &tr, cmd == BC_REPLY); + binder_transaction(proc, thread, &tr, + cmd == BC_REPLY, 0); break; } From dd9bc4f9f144a89d52a197fcf0ea033a287b7095 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 30 Sep 2016 14:10:07 +0200 Subject: [PATCH 0529/3220] android: binder: support for scatter-gather. Previously all data passed over binder needed to be serialized, with the exception of Binder objects and file descriptors. This patch adds support for scatter-gathering raw memory buffers into a binder transaction, avoiding the need to first serialize them into a Parcel. To remain backwards compatible with existing binder clients, it introduces two new command ioctls for this purpose - BC_TRANSACTION_SG and BC_REPLY_SG. These commands may only be used with the new binder_transaction_data_sg structure, which adds a field for the total size of the buffers we are scatter-gathering. Because memory buffers may contain pointers to other buffers, we allow callers to specify a parent buffer and an offset into it, to indicate this is a location pointing to the buffer that we are fixing up. The kernel will then take care of fixing up the pointer to that buffer as well.
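For illustration, a userspace client could drive the new command roughly as follows. This is a hypothetical sketch: the struct, field and command names come from the uapi additions in this patch, while the helper function and its parameters are invented for the example.

	#include <stdint.h>
	#include <string.h>
	#include <linux/android/binder.h>

	/* Describe one raw buffer to be scatter-gathered into a transaction. */
	static void fill_sg_request(struct binder_transaction_data_sg *tr_sg,
				    struct binder_buffer_object *bo,
				    const void *raw, binder_size_t raw_len)
	{
		memset(bo, 0, sizeof(*bo));
		bo->hdr.type = BINDER_TYPE_PTR;
		bo->buffer = (binder_uintptr_t)(uintptr_t)raw; /* copied, then fixed up, by the kernel */
		bo->length = raw_len;
		/* flags/parent stay 0: no other buffer points into this one */

		/* total scatter-gather space; the driver requires 8-byte alignment */
		tr_sg->buffers_size = (raw_len + 7) & ~((binder_size_t)7);
		/* tr_sg->transaction_data is filled exactly as for BC_TRANSACTION,
		 * with bo placed in the offsets/objects area; the pair is then
		 * written to the driver with the BC_TRANSACTION_SG command. */
	}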
Change-Id: I02417f28cff14688f2e1d6fcb959438fd96566cc Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 242 ++++++++++++++++++++++++++-- include/uapi/linux/android/binder.h | 45 ++++++ 2 files changed, 275 insertions(+), 12 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 83a6c1bbbad5..a0f4d29e5148 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -153,6 +153,9 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, #define to_binder_fd_object(hdr) container_of(hdr, struct binder_fd_object, hdr) +#define to_binder_buffer_object(hdr) \ + container_of(hdr, struct binder_buffer_object, hdr) + enum binder_stat_types { BINDER_STAT_PROC, BINDER_STAT_THREAD, @@ -166,7 +169,7 @@ enum binder_stat_types { struct binder_stats { int br[_IOC_NR(BR_FAILED_REPLY) + 1]; - int bc[_IOC_NR(BC_DEAD_BINDER_DONE) + 1]; + int bc[_IOC_NR(BC_REPLY_SG) + 1]; int obj_created[BINDER_STAT_COUNT]; int obj_deleted[BINDER_STAT_COUNT]; }; @@ -1306,6 +1309,9 @@ static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) case BINDER_TYPE_FD: object_size = sizeof(struct binder_fd_object); break; + case BINDER_TYPE_PTR: + object_size = sizeof(struct binder_buffer_object); + break; default: return 0; } @@ -1316,11 +1322,111 @@ static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) return 0; } +/** + * binder_validate_ptr() - validates binder_buffer_object in a binder_buffer. + * @b: binder_buffer containing the object + * @index: index in offset array at which the binder_buffer_object is + * located + * @start: points to the start of the offset array + * @num_valid: the number of valid offsets in the offset array + * + * Return: If @index is within the valid range of the offset array + * described by @start and @num_valid, and if there's a valid + * binder_buffer_object at the offset found in index @index + * of the offset array, that object is returned. Otherwise, + * %NULL is returned. + * Note that the offset found in index @index itself is not + * verified; this function assumes that @num_valid elements + * from @start were previously verified to have valid offsets. + */ +static struct binder_buffer_object *binder_validate_ptr(struct binder_buffer *b, + binder_size_t index, + binder_size_t *start, + binder_size_t num_valid) +{ + struct binder_buffer_object *buffer_obj; + binder_size_t *offp; + + if (index >= num_valid) + return NULL; + + offp = start + index; + buffer_obj = (struct binder_buffer_object *)(b->data + *offp); + if (buffer_obj->hdr.type != BINDER_TYPE_PTR) + return NULL; + + return buffer_obj; +} + +/** + * binder_validate_fixup() - validates pointer/fd fixups happen in order. + * @b: transaction buffer + * @objects_start start of objects buffer + * @buffer: binder_buffer_object in which to fix up + * @offset: start offset in @buffer to fix up + * @last_obj: last binder_buffer_object that we fixed up in + * @last_min_offset: minimum fixup offset in @last_obj + * + * Return: %true if a fixup in buffer @buffer at offset @offset is + * allowed. + * + * For safety reasons, we only allow fixups inside a buffer to happen + * at increasing offsets; additionally, we only allow fixup on the last + * buffer object that was verified, or one of its parents. 
+ * + * Example of what is allowed: + * + * A + * B (parent = A, offset = 0) + * C (parent = A, offset = 16) + * D (parent = C, offset = 0) + * E (parent = A, offset = 32) // min_offset is 16 (C.parent_offset) + * + * Examples of what is not allowed: + * + * Decreasing offsets within the same parent: + * A + * C (parent = A, offset = 16) + * B (parent = A, offset = 0) // decreasing offset within A + * + * Referring to a parent that wasn't the last object or any of its parents: + * A + * B (parent = A, offset = 0) + * C (parent = A, offset = 0) + * C (parent = A, offset = 16) + * D (parent = B, offset = 0) // B is not A or any of A's parents + */ +static bool binder_validate_fixup(struct binder_buffer *b, + binder_size_t *objects_start, + struct binder_buffer_object *buffer, + binder_size_t fixup_offset, + struct binder_buffer_object *last_obj, + binder_size_t last_min_offset) +{ + if (!last_obj) { + /* Nothing to fix up in */ + return false; + } + + while (last_obj != buffer) { + /* + * Safe to retrieve the parent of last_obj, since it + * was already previously verified by the driver. + */ + if ((last_obj->flags & BINDER_BUFFER_FLAG_HAS_PARENT) == 0) + return false; + last_min_offset = last_obj->parent_offset + sizeof(uintptr_t); + last_obj = (struct binder_buffer_object *) + (b->data + *(objects_start + last_obj->parent)); + } + return (fixup_offset >= last_min_offset); +} + static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_buffer *buffer, binder_size_t *failed_at) { - binder_size_t *offp, *off_end; + binder_size_t *offp, *off_start, *off_end; int debug_id = buffer->debug_id; binder_debug(BINDER_DEBUG_TRANSACTION, @@ -1331,13 +1437,13 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, if (buffer->target_node) binder_dec_node(buffer->target_node, 1, 0); - offp = (binder_size_t *)(buffer->data + - ALIGN(buffer->data_size, sizeof(void *))); + off_start = (binder_size_t *)(buffer->data + + ALIGN(buffer->data_size, sizeof(void *))); if (failed_at) off_end = failed_at; else - off_end = (void *)offp + buffer->offsets_size; - for (; offp < off_end; offp++) { + off_end = (void *)off_start + buffer->offsets_size; + for (offp = off_start; offp < off_end; offp++) { struct binder_object_header *hdr; size_t object_size = binder_validate_object(buffer, *offp); @@ -1393,7 +1499,12 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, if (failed_at) task_close_fd(proc, fp->fd); } break; - + case BINDER_TYPE_PTR: + /* + * Nothing to do here, this will get cleaned up when the + * transaction buffer gets freed + */ + break; default: pr_err("transaction release %d bad object type %x\n", debug_id, hdr->type); @@ -1563,6 +1674,53 @@ err_fd_not_accepted: return ret; } +static int binder_fixup_parent(struct binder_transaction *t, + struct binder_thread *thread, + struct binder_buffer_object *bp, + binder_size_t *off_start, + binder_size_t num_valid, + struct binder_buffer_object *last_fixup_obj, + binder_size_t last_fixup_min_off) +{ + struct binder_buffer_object *parent; + u8 *parent_buffer; + struct binder_buffer *b = t->buffer; + struct binder_proc *proc = thread->proc; + struct binder_proc *target_proc = t->to_proc; + + if (!(bp->flags & BINDER_BUFFER_FLAG_HAS_PARENT)) + return 0; + + parent = binder_validate_ptr(b, bp->parent, off_start, num_valid); + if (!parent) { + binder_user_error("%d:%d got transaction with invalid parent offset or type\n", + proc->pid, thread->pid); + return -EINVAL; + } + + if (!binder_validate_fixup(b, 
off_start, + parent, bp->parent_offset, + last_fixup_obj, + last_fixup_min_off)) { + binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", + proc->pid, thread->pid); + return -EINVAL; + } + + if (parent->length < sizeof(binder_uintptr_t) || + bp->parent_offset > parent->length - sizeof(binder_uintptr_t)) { + /* No space for a pointer here! */ + binder_user_error("%d:%d got transaction with invalid parent offset\n", + proc->pid, thread->pid); + return -EINVAL; + } + parent_buffer = (u8 *)(parent->buffer - + target_proc->user_buffer_offset); + *(binder_uintptr_t *)(parent_buffer + bp->parent_offset) = bp->buffer; + + return 0; +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, @@ -1571,8 +1729,9 @@ static void binder_transaction(struct binder_proc *proc, int ret; struct binder_transaction *t; struct binder_work *tcomplete; - binder_size_t *offp, *off_end; + binder_size_t *offp, *off_end, *off_start; binder_size_t off_min; + u8 *sg_bufp, *sg_buf_end; struct binder_proc *target_proc; struct binder_thread *target_thread = NULL; struct binder_node *target_node = NULL; @@ -1581,6 +1740,8 @@ static void binder_transaction(struct binder_proc *proc, struct binder_transaction *in_reply_to = NULL; struct binder_transaction_log_entry *e; uint32_t return_error; + struct binder_buffer_object *last_fixup_obj = NULL; + binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; e = binder_transaction_log_add(&binder_transaction_log); @@ -1755,8 +1916,9 @@ static void binder_transaction(struct binder_proc *proc, if (target_node) binder_inc_node(target_node, 1, 0, NULL); - offp = (binder_size_t *)(t->buffer->data + - ALIGN(tr->data_size, sizeof(void *))); + off_start = (binder_size_t *)(t->buffer->data + + ALIGN(tr->data_size, sizeof(void *))); + offp = off_start; if (copy_from_user(t->buffer->data, (const void __user *)(uintptr_t) tr->data.ptr.buffer, tr->data_size)) { @@ -1778,7 +1940,16 @@ static void binder_transaction(struct binder_proc *proc, return_error = BR_FAILED_REPLY; goto err_bad_offset; } - off_end = (void *)offp + tr->offsets_size; + if (!IS_ALIGNED(extra_buffers_size, sizeof(u64))) { + binder_user_error("%d:%d got transaction with unaligned buffers size, %lld\n", + proc->pid, thread->pid, + extra_buffers_size); + return_error = BR_FAILED_REPLY; + goto err_bad_offset; + } + off_end = (void *)off_start + tr->offsets_size; + sg_bufp = (u8 *)(PTR_ALIGN(off_end, sizeof(void *))); + sg_buf_end = sg_bufp + extra_buffers_size; off_min = 0; for (; offp < off_end; offp++) { struct binder_object_header *hdr; @@ -1831,7 +2002,41 @@ static void binder_transaction(struct binder_proc *proc, fp->pad_binder = 0; fp->fd = target_fd; } break; + case BINDER_TYPE_PTR: { + struct binder_buffer_object *bp = + to_binder_buffer_object(hdr); + size_t buf_left = sg_buf_end - sg_bufp; + if (bp->length > buf_left) { + binder_user_error("%d:%d got transaction with too large buffer\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_bad_offset; + } + if (copy_from_user(sg_bufp, + (const void __user *)(uintptr_t) + bp->buffer, bp->length)) { + binder_user_error("%d:%d got transaction with invalid offsets ptr\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_copy_data_failed; + } + /* Fixup buffer pointer to target proc address space */ + bp->buffer = (uintptr_t)sg_bufp + + target_proc->user_buffer_offset; + sg_bufp += ALIGN(bp->length, 
sizeof(u64)); + + ret = binder_fixup_parent(t, thread, bp, off_start, + offp - off_start, + last_fixup_obj, + last_fixup_min_off); + if (ret < 0) { + return_error = BR_FAILED_REPLY; + goto err_translate_failed; + } + last_fixup_obj = bp; + last_fixup_min_off = 0; + } break; default: binder_user_error("%d:%d got transaction with invalid object type, %x\n", proc->pid, thread->pid, hdr->type); @@ -2085,6 +2290,17 @@ static int binder_thread_write(struct binder_proc *proc, break; } + case BC_TRANSACTION_SG: + case BC_REPLY_SG: { + struct binder_transaction_data_sg tr; + + if (copy_from_user(&tr, ptr, sizeof(tr))) + return -EFAULT; + ptr += sizeof(tr); + binder_transaction(proc, thread, &tr.transaction_data, + cmd == BC_REPLY_SG, tr.buffers_size); + break; + } case BC_TRANSACTION: case BC_REPLY: { struct binder_transaction_data tr; @@ -3606,7 +3822,9 @@ static const char * const binder_command_strings[] = { "BC_EXIT_LOOPER", "BC_REQUEST_DEATH_NOTIFICATION", "BC_CLEAR_DEATH_NOTIFICATION", - "BC_DEAD_BINDER_DONE" + "BC_DEAD_BINDER_DONE", + "BC_TRANSACTION_SG", + "BC_REPLY_SG", }; static const char * const binder_objstat_strings[] = { diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index f67c2b1c0713..f3ef6e2634ba 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -33,6 +33,7 @@ enum { BINDER_TYPE_HANDLE = B_PACK_CHARS('s', 'h', '*', B_TYPE_LARGE), BINDER_TYPE_WEAK_HANDLE = B_PACK_CHARS('w', 'h', '*', B_TYPE_LARGE), BINDER_TYPE_FD = B_PACK_CHARS('f', 'd', '*', B_TYPE_LARGE), + BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE), }; enum { @@ -95,6 +96,39 @@ struct binder_fd_object { binder_uintptr_t cookie; }; + +/* struct binder_buffer_object - object describing a userspace buffer + * @hdr: common header structure + * @flags: one or more BINDER_BUFFER_* flags + * @buffer: address of the buffer + * @length: length of the buffer + * @parent: index in offset array pointing to parent buffer + * @parent_offset: offset in @parent pointing to this buffer + * + * A binder_buffer object represents an object that the + * binder kernel driver can copy verbatim to the target + * address space. A buffer itself may be pointed to from + * within another buffer, meaning that the pointer inside + * that other buffer needs to be fixed up as well. This + * can be done by setting the BINDER_BUFFER_FLAG_HAS_PARENT + * flag in @flags, by setting @parent buffer to the index + * in the offset array pointing to the parent binder_buffer_object, + * and by setting @parent_offset to the offset in the parent buffer + * at which the pointer to this buffer is located. + */ +struct binder_buffer_object { + struct binder_object_header hdr; + __u32 flags; + binder_uintptr_t buffer; + binder_size_t length; + binder_size_t parent; + binder_size_t parent_offset; +}; + +enum { + BINDER_BUFFER_FLAG_HAS_PARENT = 0x01, +}; + /* * On 64-bit platforms where user code may run in 32-bits the driver must * translate the buffer (and local binder) addresses appropriately. 
@@ -187,6 +221,11 @@ struct binder_transaction_data { } data; }; +struct binder_transaction_data_sg { + struct binder_transaction_data transaction_data; + binder_size_t buffers_size; +}; + struct binder_ptr_cookie { binder_uintptr_t ptr; binder_uintptr_t cookie; @@ -371,6 +410,12 @@ enum binder_driver_command_protocol { /* * void *: cookie */ + + BC_TRANSACTION_SG = _IOW('c', 17, struct binder_transaction_data_sg), + BC_REPLY_SG = _IOW('c', 18, struct binder_transaction_data_sg), + /* + * binder_transaction_data_sg: the sent command. + */ }; #endif /* _UAPI_LINUX_BINDER_H */ From e124de382771254e09bb2bbd2ac753cdae71a36d Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Tue, 18 Oct 2016 13:58:55 +0200 Subject: [PATCH 0530/3220] android: binder: support for file-descriptor arrays. This patch introduces a new binder_fd_array object, that allows us to support one or more file descriptors embedded in a buffer that is scatter-gathered. Change-Id: I647a53cf0d905c7be0dfd9333806982def68dd74 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 137 ++++++++++++++++++++++++++++ include/uapi/linux/android/binder.h | 28 ++++++ 2 files changed, 165 insertions(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index a0f4d29e5148..951393825261 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -156,6 +156,9 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, #define to_binder_buffer_object(hdr) \ container_of(hdr, struct binder_buffer_object, hdr) +#define to_binder_fd_array_object(hdr) \ + container_of(hdr, struct binder_fd_array_object, hdr) + enum binder_stat_types { BINDER_STAT_PROC, BINDER_STAT_THREAD, @@ -1312,6 +1315,9 @@ static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) case BINDER_TYPE_PTR: object_size = sizeof(struct binder_buffer_object); break; + case BINDER_TYPE_FDA: + object_size = sizeof(struct binder_fd_array_object); + break; default: return 0; } @@ -1505,6 +1511,47 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, * transaction buffer gets freed */ break; + case BINDER_TYPE_FDA: { + struct binder_fd_array_object *fda; + struct binder_buffer_object *parent; + uintptr_t parent_buffer; + u32 *fd_array; + size_t fd_index; + binder_size_t fd_buf_size; + + fda = to_binder_fd_array_object(hdr); + parent = binder_validate_ptr(buffer, fda->parent, + off_start, + offp - off_start); + if (!parent) { + pr_err("transaction release %d bad parent offset", + debug_id); + continue; + } + /* + * Since the parent was already fixed up, convert it + * back to kernel address space to access it + */ + parent_buffer = parent->buffer - + proc->user_buffer_offset; + + fd_buf_size = sizeof(u32) * fda->num_fds; + if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { + pr_err("transaction release %d invalid number of fds (%lld)\n", + debug_id, (u64)fda->num_fds); + continue; + } + if (fd_buf_size > parent->length || + fda->parent_offset > parent->length - fd_buf_size) { + /* No space for all file descriptors here. 
*/ + pr_err("transaction release %d not enough space for %lld fds in buffer\n", + debug_id, (u64)fda->num_fds); + continue; + } + fd_array = (u32 *)(parent_buffer + fda->parent_offset); + for (fd_index = 0; fd_index < fda->num_fds; fd_index++) + task_close_fd(proc, fd_array[fd_index]); + } break; default: pr_err("transaction release %d bad object type %x\n", debug_id, hdr->type); @@ -1674,6 +1721,63 @@ err_fd_not_accepted: return ret; } +static int binder_translate_fd_array(struct binder_fd_array_object *fda, + struct binder_buffer_object *parent, + struct binder_transaction *t, + struct binder_thread *thread, + struct binder_transaction *in_reply_to) +{ + binder_size_t fdi, fd_buf_size, num_installed_fds; + int target_fd; + uintptr_t parent_buffer; + u32 *fd_array; + struct binder_proc *proc = thread->proc; + struct binder_proc *target_proc = t->to_proc; + + fd_buf_size = sizeof(u32) * fda->num_fds; + if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { + binder_user_error("%d:%d got transaction with invalid number of fds (%lld)\n", + proc->pid, thread->pid, (u64)fda->num_fds); + return -EINVAL; + } + if (fd_buf_size > parent->length || + fda->parent_offset > parent->length - fd_buf_size) { + /* No space for all file descriptors here. */ + binder_user_error("%d:%d not enough space to store %lld fds in buffer\n", + proc->pid, thread->pid, (u64)fda->num_fds); + return -EINVAL; + } + /* + * Since the parent was already fixed up, convert it + * back to the kernel address space to access it + */ + parent_buffer = parent->buffer - target_proc->user_buffer_offset; + fd_array = (u32 *)(parent_buffer + fda->parent_offset); + if (!IS_ALIGNED((unsigned long)fd_array, sizeof(u32))) { + binder_user_error("%d:%d parent offset not aligned correctly.\n", + proc->pid, thread->pid); + return -EINVAL; + } + for (fdi = 0; fdi < fda->num_fds; fdi++) { + target_fd = binder_translate_fd(fd_array[fdi], t, thread, + in_reply_to); + if (target_fd < 0) + goto err_translate_fd_failed; + fd_array[fdi] = target_fd; + } + return 0; + +err_translate_fd_failed: + /* + * Failed to allocate fd or security error, free fds + * installed so far. 
+ */ + num_installed_fds = fdi; + for (fdi = 0; fdi < num_installed_fds; fdi++) + task_close_fd(target_proc, fd_array[fdi]); + return target_fd; +} + static int binder_fixup_parent(struct binder_transaction *t, struct binder_thread *thread, struct binder_buffer_object *bp, @@ -2002,6 +2106,38 @@ static void binder_transaction(struct binder_proc *proc, fp->pad_binder = 0; fp->fd = target_fd; } break; + case BINDER_TYPE_FDA: { + struct binder_fd_array_object *fda = + to_binder_fd_array_object(hdr); + struct binder_buffer_object *parent = + binder_validate_ptr(t->buffer, fda->parent, + off_start, + offp - off_start); + if (!parent) { + binder_user_error("%d:%d got transaction with invalid parent offset or type\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_bad_parent; + } + if (!binder_validate_fixup(t->buffer, off_start, + parent, fda->parent_offset, + last_fixup_obj, + last_fixup_min_off)) { + binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_bad_parent; + } + ret = binder_translate_fd_array(fda, parent, t, thread, + in_reply_to); + if (ret < 0) { + return_error = BR_FAILED_REPLY; + goto err_translate_failed; + } + last_fixup_obj = parent; + last_fixup_min_off = + fda->parent_offset + sizeof(u32) * fda->num_fds; + } break; case BINDER_TYPE_PTR: { struct binder_buffer_object *bp = to_binder_buffer_object(hdr); @@ -2072,6 +2208,7 @@ static void binder_transaction(struct binder_proc *proc, err_translate_failed: err_bad_object_type: err_bad_offset: +err_bad_parent: err_copy_data_failed: trace_binder_transaction_failed_buffer_release(t->buffer); binder_transaction_buffer_release(target_proc, t->buffer, offp); diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index f3ef6e2634ba..51f891fb1b18 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -33,6 +33,7 @@ enum { BINDER_TYPE_HANDLE = B_PACK_CHARS('s', 'h', '*', B_TYPE_LARGE), BINDER_TYPE_WEAK_HANDLE = B_PACK_CHARS('w', 'h', '*', B_TYPE_LARGE), BINDER_TYPE_FD = B_PACK_CHARS('f', 'd', '*', B_TYPE_LARGE), + BINDER_TYPE_FDA = B_PACK_CHARS('f', 'd', 'a', B_TYPE_LARGE), BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE), }; @@ -129,6 +130,33 @@ enum { BINDER_BUFFER_FLAG_HAS_PARENT = 0x01, }; +/* struct binder_fd_array_object - object describing an array of fds in a buffer + * @hdr: common header structure + * @num_fds: number of file descriptors in the buffer + * @parent: index in offset array to buffer holding the fd array + * @parent_offset: start offset of fd array in the buffer + * + * A binder_fd_array object represents an array of file + * descriptors embedded in a binder_buffer_object. It is + * different from a regular binder_buffer_object because it + * describes a list of file descriptors to fix up, not an opaque + * blob of memory, and hence the kernel needs to treat it differently. + * + * An example of how this would be used is with Android's + * native_handle_t object, which is a struct with a list of integers + * and a list of file descriptors. The native_handle_t struct itself + * will be represented by a struct binder_buffer_object, whereas the + * embedded list of file descriptors is represented by a + * struct binder_fd_array_object with that binder_buffer_object as
+ */ +struct binder_fd_array_object { + struct binder_object_header hdr; + binder_size_t num_fds; + binder_size_t parent; + binder_size_t parent_offset; +}; + /* * On 64-bit platforms where user code may run in 32-bits the driver must * translate the buffer (and local binder) addresses appropriately. From 96009ab737c4e6da44a0fff72d33534583454d03 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 15 Sep 2016 16:05:40 +0530 Subject: [PATCH 0531/3220] ANDROID: usb: gadget: audio_source: fix comparison of distinct pointer types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use div_s64() instead of do_div() to fix the following "comparison of distinct pointer types lacks a cast" warning in do_div() call in audio_send() for ARCH=arm in Linux 4.8-rc6: CC drivers/usb/gadget/function/f_audio_source.o In file included from ./arch/arm/include/asm/div64.h:126:0, from ./include/linux/kernel.h:142, from ./include/linux/list.h:8, from ./include/linux/kobject.h:20, from ./include/linux/device.h:17, from drivers/usb/gadget/function/f_audio_source.c:17: drivers/usb/gadget/function/f_audio_source.c: In function ‘audio_send’: ./include/asm-generic/div64.h:207:28: warning: comparison of distinct pointer types lacks a cast (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \ ^ drivers/usb/gadget/function/f_audio_source.c:381:2: note: in expansion of macro ‘do_div’ do_div(msecs, 1000000); ^ ./include/asm-generic/div64.h:207:28: warning: comparison of distinct pointer types lacks a cast (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \ ^ drivers/usb/gadget/function/f_audio_source.c:383:2: note: in expansion of macro ‘do_div’ do_div(frames, 1000); ^ LD drivers/usb/gadget/function/usb_f_audio_source.o Change-Id: Ie1a920c8948f3fc3f1263add25a402ded132fd66 Signed-off-by: Amit Pundir --- drivers/usb/gadget/function/f_audio_source.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/function/f_audio_source.c b/drivers/usb/gadget/function/f_audio_source.c index bcd817439dbf..2489a5fa2685 100644 --- a/drivers/usb/gadget/function/f_audio_source.c +++ b/drivers/usb/gadget/function/f_audio_source.c @@ -377,10 +377,9 @@ static void audio_send(struct audio_dev *audio) /* compute number of frames to send */ now = ktime_get(); - msecs = ktime_to_ns(now) - ktime_to_ns(audio->start_time); - do_div(msecs, 1000000); - frames = msecs * SAMPLE_RATE; - do_div(frames, 1000); + msecs = div_s64((ktime_to_ns(now) - ktime_to_ns(audio->start_time)), + 1000000); + frames = div_s64((msecs * SAMPLE_RATE), 1000); /* Readjust our frames_sent if we fall too far behind. * If we get too far behind it is better to drop some frames than From 2182c4daf185176ac99bf71a4ae1786ea9620714 Mon Sep 17 00:00:00 2001 From: Jonathan Hamilton Date: Wed, 21 Sep 2016 12:40:51 -0700 Subject: [PATCH 0532/3220] ANDROID: video: adf: Avoid directly referencing user pointers Enabling KASAN on a kernel using ADF flags a number of places where user-supplied pointers passed to ioctls are directly dereferenced without copy_from_user or access_ok.
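The fix follows one pattern throughout (a condensed sketch of what the diff below does in adf_device_post_config() and the other handlers; error paths elided):

	struct adf_post_config data;

	/* snapshot the user-supplied struct once... */
	if (copy_from_user(&data, arg, sizeof(data)))
		return -EFAULT;

	/* ...then operate only on the kernel copy; the user pointers stored
	 * inside it (data.custom_data, data.bufs, data.interfaces, ...) are
	 * still only ever passed to copy_from_user()/copy_to_user()/get_user(),
	 * never dereferenced directly. */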
Bug: 31806036 Signed-off-by: Jonathan Hamilton Change-Id: I6e86237aaa6cec0f6e1c385336aefcc5332080ae --- drivers/video/adf/adf_fops.c | 73 +++++++++++++++--------------------- 1 file changed, 31 insertions(+), 42 deletions(-) diff --git a/drivers/video/adf/adf_fops.c b/drivers/video/adf/adf_fops.c index 8726617f73ab..705411bfaebb 100644 --- a/drivers/video/adf/adf_fops.c +++ b/drivers/video/adf/adf_fops.c @@ -132,7 +132,7 @@ static int adf_eng_get_data(struct adf_overlay_engine *eng, eng->ops->n_supported_formats)); mutex_lock(&dev->client_lock); - ret = adf_obj_copy_custom_data_to_user(&eng->base, arg->custom_data, + ret = adf_obj_copy_custom_data_to_user(&eng->base, data.custom_data, &data.custom_data_size); mutex_unlock(&dev->client_lock); @@ -144,7 +144,7 @@ static int adf_eng_get_data(struct adf_overlay_engine *eng, goto done; } - if (supported_formats && copy_to_user(arg->supported_formats, + if (supported_formats && copy_to_user(data.supported_formats, supported_formats, n_supported_formats * sizeof(supported_formats[0]))) ret = -EFAULT; @@ -220,56 +220,45 @@ static int adf_device_post_config(struct adf_device *dev, int complete_fence_fd; struct adf_buffer *bufs = NULL; struct adf_interface **intfs = NULL; - size_t n_intfs, n_bufs, i; + struct adf_post_config data; + size_t i; void *custom_data = NULL; - size_t custom_data_size; int ret = 0; + if (copy_from_user(&data, arg, sizeof(data))) + return -EFAULT; + complete_fence_fd = get_unused_fd_flags(O_CLOEXEC); if (complete_fence_fd < 0) return complete_fence_fd; - if (get_user(n_intfs, &arg->n_interfaces)) { - ret = -EFAULT; - goto err_get_user; - } - - if (n_intfs > ADF_MAX_INTERFACES) { + if (data.n_interfaces > ADF_MAX_INTERFACES) { ret = -EINVAL; goto err_get_user; } - if (get_user(n_bufs, &arg->n_bufs)) { - ret = -EFAULT; - goto err_get_user; - } - - if (n_bufs > ADF_MAX_BUFFERS) { + if (data.n_bufs > ADF_MAX_BUFFERS) { ret = -EINVAL; goto err_get_user; } - if (get_user(custom_data_size, &arg->custom_data_size)) { - ret = -EFAULT; - goto err_get_user; - } - - if (custom_data_size > ADF_MAX_CUSTOM_DATA_SIZE) { + if (data.custom_data_size > ADF_MAX_CUSTOM_DATA_SIZE) { ret = -EINVAL; goto err_get_user; } - if (n_intfs) { - intfs = kmalloc(sizeof(intfs[0]) * n_intfs, GFP_KERNEL); + if (data.n_interfaces) { + intfs = kmalloc(sizeof(intfs[0]) * data.n_interfaces, + GFP_KERNEL); if (!intfs) { ret = -ENOMEM; goto err_get_user; } } - for (i = 0; i < n_intfs; i++) { + for (i = 0; i < data.n_interfaces; i++) { u32 intf_id; - if (get_user(intf_id, &arg->interfaces[i])) { + if (get_user(intf_id, &data.interfaces[i])) { ret = -EFAULT; goto err_get_user; } @@ -281,31 +270,31 @@ static int adf_device_post_config(struct adf_device *dev, } } - if (n_bufs) { - bufs = kzalloc(sizeof(bufs[0]) * n_bufs, GFP_KERNEL); + if (data.n_bufs) { + bufs = kzalloc(sizeof(bufs[0]) * data.n_bufs, GFP_KERNEL); if (!bufs) { ret = -ENOMEM; goto err_get_user; } } - for (i = 0; i < n_bufs; i++) { - ret = adf_buffer_import(dev, &arg->bufs[i], &bufs[i]); + for (i = 0; i < data.n_bufs; i++) { + ret = adf_buffer_import(dev, &data.bufs[i], &bufs[i]); if (ret < 0) { memset(&bufs[i], 0, sizeof(bufs[i])); goto err_import; } } - if (custom_data_size) { - custom_data = kzalloc(custom_data_size, GFP_KERNEL); + if (data.custom_data_size) { + custom_data = kzalloc(data.custom_data_size, GFP_KERNEL); if (!custom_data) { ret = -ENOMEM; goto err_import; } - if (copy_from_user(custom_data, arg->custom_data, - custom_data_size)) { + if (copy_from_user(custom_data, data.custom_data, + 
data.custom_data_size)) { ret = -EFAULT; goto err_import; } @@ -316,8 +305,8 @@ static int adf_device_post_config(struct adf_device *dev, goto err_import; } - complete_fence = adf_device_post_nocopy(dev, intfs, n_intfs, bufs, - n_bufs, custom_data, custom_data_size); + complete_fence = adf_device_post_nocopy(dev, intfs, data.n_interfaces, + bufs, data.n_bufs, custom_data, data.custom_data_size); if (IS_ERR(complete_fence)) { ret = PTR_ERR(complete_fence); goto err_import; @@ -327,7 +316,7 @@ static int adf_device_post_config(struct adf_device *dev, return 0; err_import: - for (i = 0; i < n_bufs; i++) + for (i = 0; i < data.n_bufs; i++) adf_buffer_cleanup(&bufs[i]); err_get_user: @@ -481,19 +470,19 @@ static int adf_device_get_data(struct adf_device *dev, data.n_allowed_attachments); mutex_lock(&dev->client_lock); - ret = adf_obj_copy_custom_data_to_user(&dev->base, arg->custom_data, + ret = adf_obj_copy_custom_data_to_user(&dev->base, data.custom_data, &data.custom_data_size); mutex_unlock(&dev->client_lock); if (ret < 0) goto done; - ret = adf_copy_attachment_list_to_user(arg->attachments, + ret = adf_copy_attachment_list_to_user(data.attachments, data.n_attachments, attach, n_attach); if (ret < 0) goto done; - ret = adf_copy_attachment_list_to_user(arg->allowed_attachments, + ret = adf_copy_attachment_list_to_user(data.allowed_attachments, data.n_allowed_attachments, allowed_attach, n_allowed_attach); if (ret < 0) @@ -592,7 +581,7 @@ static int adf_intf_get_data(struct adf_interface *intf, data.n_available_modes = intf->n_modes; read_unlock_irqrestore(&intf->hotplug_modelist_lock, flags); - if (copy_to_user(arg->available_modes, modelist, modelist_size)) { + if (copy_to_user(data.available_modes, modelist, modelist_size)) { ret = -EFAULT; goto done; } @@ -601,7 +590,7 @@ static int adf_intf_get_data(struct adf_interface *intf, memcpy(&data.current_mode, &intf->current_mode, sizeof(intf->current_mode)); - ret = adf_obj_copy_custom_data_to_user(&intf->base, arg->custom_data, + ret = adf_obj_copy_custom_data_to_user(&intf->base, data.custom_data, &data.custom_data_size); done: mutex_unlock(&dev->client_lock); From 0a60e50f0b910612c3f9560fc3a791e70d146b0b Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 11 Aug 2015 12:34:45 +0530 Subject: [PATCH 0533/3220] usb: gadget: f_mtp: simplify ptp NULL pointer check Simplify MTP/PTP dev NULL pointer check introduced in Change-Id: Ic44a699d96df2e13467fc081bff88b97dcc5afb2 and restrict it to MTP/PTP function level only. Return ERR_PTR() instead of NULL from mtp_ptp function to skip doing NULL pointer checks all the way up to configfs.c Fixes: Change-Id: Ic44a699d96df2e13467fc081bff88b97dcc5afb2 ("usb: gadget: fix NULL ptr derefer while symlinking PTP func") Change-Id: Iab7c55089c115550c3506f6cca960a07ae52713d Signed-off-by: Amit Pundir --- drivers/usb/gadget/configfs.c | 5 ----- drivers/usb/gadget/function/f_mtp.c | 2 +- drivers/usb/gadget/functions.c | 2 +- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index 8df96cb3bb58..54849fe9cb01 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -425,11 +425,6 @@ static int config_usb_cfg_link( } f = usb_get_function(fi); - if (f == NULL) { - /* Are we trying to symlink PTP without MTP function? 
*/ - ret = -EINVAL; /* Invalid Configuration */ - goto out; - } if (IS_ERR(f)) { ret = PTR_ERR(f); goto out; } diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c index 74195be9b054..b9e3e97c57c4 100644 --- a/drivers/usb/gadget/function/f_mtp.c +++ b/drivers/usb/gadget/function/f_mtp.c @@ -1498,7 +1498,7 @@ struct usb_function *function_alloc_mtp_ptp(struct usb_function_instance *fi, pr_err("\t2: Create MTP function\n"); pr_err("\t3: Create and symlink PTP function" " with a gadget configuration\n"); - return NULL; + return ERR_PTR(-EINVAL); /* Invalid Configuration */ } dev = fi_mtp->dev; diff --git a/drivers/usb/gadget/functions.c b/drivers/usb/gadget/functions.c index 389c1f3d0fee..b13f839e7368 100644 --- a/drivers/usb/gadget/functions.c +++ b/drivers/usb/gadget/functions.c @@ -58,7 +58,7 @@ struct usb_function *usb_get_function(struct usb_function_instance *fi) struct usb_function *f; f = fi->fd->alloc_func(fi); - if ((f == NULL) || IS_ERR(f)) + if (IS_ERR(f)) return f; f->fi = fi; return f; From ab2bff9a8f87be3710edef3f2f17dee78962a6a1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 13 May 2016 09:34:12 -0400 Subject: [PATCH 0534/3220] UPSTREAM: ring-buffer: Prevent overflow of size in ring_buffer_resize() (Cherry picked from commit 59643d1535eb220668692a5359de22545af579f6) If the size passed to ring_buffer_resize() is greater than MAX_LONG - BUF_PAGE_SIZE then the DIV_ROUND_UP() will return zero. Here are the details: # echo 18014398509481980 > /sys/kernel/debug/tracing/buffer_size_kb tracing_entries_write() processes this and converts kb to bytes. 18014398509481980 << 10 = 18446744073709547520 and this is passed to ring_buffer_resize() as unsigned long size. size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); Where DIV_ROUND_UP(a, b) is (a + b - 1)/b BUF_PAGE_SIZE is 4080 and here 18446744073709547520 + 4080 - 1 = 18446744073709551599 where 18446744073709551599 is still smaller than 2^64 2^64 - 18446744073709551599 = 17 But now 18446744073709551599 / 4080 = 4521260802379792 and size = size * 4080 = 18446744073709551360 This is checked to make sure it's still greater than 2 * 4080, which it is. Then we convert to the number of buffer pages needed. nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE) but this time size is 18446744073709551360 and 2^64 - (18446744073709551360 + 4080 - 1) = -3823 Thus it overflows and the resulting number is less than 4080, which makes 3823 / 4080 = 0, and nr_pages is set to this. As we already checked against the minimum that nr_pages may be, this causes the logic to fail as well, and we crash the kernel. There's no reason to have the two DIV_ROUND_UP() calls (that's just a result of historical code changes); clean up the code and fix this bug.
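The wraparound is easy to reproduce outside the kernel. Below is a minimal userspace sketch of the same arithmetic (not kernel code; BUF_PAGE_SIZE and DIV_ROUND_UP are copied from the kernel definitions); it prints nr_pages = 0 on an LP64 machine:

  #include <stdio.h>

  #define BUF_PAGE_SIZE 4080UL
  #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

  int main(void)
  {
          /* 18014398509481980 kb converted to bytes, as tracing_entries_write() does */
          unsigned long size = 18014398509481980UL << 10;
          unsigned long nr_pages;

          /* first round-up survives: size + 4079 still fits in 64 bits */
          size = DIV_ROUND_UP(size, BUF_PAGE_SIZE) * BUF_PAGE_SIZE;

          /* second round-up wraps: size + 4079 exceeds 2^64 by 3823,
           * so the division sees 3823 and yields 0 */
          nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
          printf("nr_pages = %lu\n", nr_pages);
          return 0;
  }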
Cc: stable@vger.kernel.org # 3.5+ Fixes: 83f40318dab00 ("ring-buffer: Make removal of ring buffer pages atomic") Signed-off-by: Steven Rostedt Signed-off-by: Willy Tarreau Change-Id: I1147672317a3ad0fc995b1f32baaa050a7976ac4 Bug: 32659848 --- kernel/trace/ring_buffer.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9c6045a27ba3..7fa26a57a847 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1665,14 +1665,13 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, !cpumask_test_cpu(cpu_id, buffer->cpumask)) return size; - size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); - size *= BUF_PAGE_SIZE; + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); /* we need a minimum of two pages */ - if (size < BUF_PAGE_SIZE * 2) - size = BUF_PAGE_SIZE * 2; + if (nr_pages < 2) + nr_pages = 2; - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + size = nr_pages * BUF_PAGE_SIZE; /* * Don't succeed if resizing is disabled, as a reader might be From 40ceb2c69964f8bde97d4ded4306508db16fd365 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Tue, 15 Nov 2016 19:25:40 -0800 Subject: [PATCH 0535/3220] usb: gadget: Fix compilation problem with tx_qlen field Change-Id: I38c4f4a850b0329fb4a06b2c7e45558e16d66151 Signed-off-by: Dmitry Shmidt --- drivers/usb/gadget/function/u_ether.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index 930f58f0ba32..cc5210a87614 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -746,7 +746,7 @@ static netdev_tx_t eth_start_xmit(struct sk_buff *skb, req->no_interrupt = (((dev->gadget->speed == USB_SPEED_HIGH || dev->gadget->speed == USB_SPEED_SUPER)) && !list_empty(&dev->tx_reqs)) - ? ((atomic_read(&dev->tx_qlen) % dev->qmult) != 0) + ? ((dev->tx_qlen % dev->qmult) != 0) : 0; retval = usb_ep_queue(in, req, GFP_ATOMIC); From 6dc8c51a76e57ae01fa087bdb5450423ce76a373 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 11:58:52 +0530 Subject: [PATCH 0536/3220] cpufreq: sched: Fix kernel crash on accessing sysfs file If the cpufreq driver hasn't set the CPUFREQ_HAVE_GOVERNOR_PER_POLICY flag, then the kernel will crash on accessing sysfs files for the sched governor. With CPUFreq governors, the governor-specific sysfs files can live in two places: A. /sys/devices/system/cpu/cpuX/cpufreq/ B. /sys/devices/system/cpu/cpufreq/ Case A is the governor-per-policy case, where we can control the governor tunables for each policy separately. Case B is for system-wide tunable values. The schedfreq governor only implements case A, not case B. The sysfs files for case B will still be present in /sys/devices/system/cpu/cpufreq/, but accessing them will crash the kernel, as the governor doesn't support that. Moreover, the sched governor is pretty new, will be used only on ARM platforms, and there is no need to support case B at all. Hence, use policy->kobj instead of get_governor_parent_kobj(), so that we always create the sysfs files in path A.
Signed-off-by: Viresh Kumar --- kernel/sched/cpufreq_sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index f6f9b9b3a4a8..d751bc2d0d6e 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -289,7 +289,7 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) pr_debug("%s: throttle threshold = %u [ns]\n", __func__, gd->up_throttle_nsec); - rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + rc = sysfs_create_group(&policy->kobj, get_sysfs_attr()); if (rc) { pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc); goto err; @@ -332,7 +332,7 @@ static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) put_task_struct(gd->task); } - sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + sysfs_remove_group(&policy->kobj, get_sysfs_attr()); policy->governor_data = NULL; From ff458c4fca9797d314a024bad05b59831eff3f58 Mon Sep 17 00:00:00 2001 From: Anson Jacob Date: Fri, 11 Nov 2016 01:10:04 -0500 Subject: [PATCH 0537/3220] ANDROID: usb: gadget: function: cleanup: Add blank line after declaration Fix warning generated by checkpatch.pl: Missing a blank line after declarations Change-Id: Id129bb8cc8fa37c67a647e2e5996bb2817020e65 Signed-off-by: Anson Jacob --- drivers/usb/gadget/function/f_accessory.c | 2 ++ drivers/usb/gadget/function/f_audio_source.c | 1 + drivers/usb/gadget/function/f_mtp.c | 4 ++++ 3 files changed, 7 insertions(+) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index 2ca16a577542..9d3ec0e37475 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -211,6 +211,7 @@ static inline struct acc_dev *func_to_dev(struct usb_function *f) static struct usb_request *acc_request_new(struct usb_ep *ep, int buffer_size) { struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL); + if (!req) return NULL; @@ -1021,6 +1022,7 @@ acc_function_unbind(struct usb_configuration *c, struct usb_function *f) static void acc_start_work(struct work_struct *data) { char *envp[2] = { "ACCESSORY=START", NULL }; + kobject_uevent_env(&acc_device.this_device->kobj, KOBJ_CHANGE, envp); } diff --git a/drivers/usb/gadget/function/f_audio_source.c b/drivers/usb/gadget/function/f_audio_source.c index 2489a5fa2685..db7903d19c43 100644 --- a/drivers/usb/gadget/function/f_audio_source.c +++ b/drivers/usb/gadget/function/f_audio_source.c @@ -310,6 +310,7 @@ static struct device_attribute *audio_source_function_attributes[] = { static struct usb_request *audio_request_new(struct usb_ep *ep, int buffer_size) { struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL); + if (!req) return NULL; diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c index b9e3e97c57c4..d8b69af6e335 100644 --- a/drivers/usb/gadget/function/f_mtp.c +++ b/drivers/usb/gadget/function/f_mtp.c @@ -361,6 +361,7 @@ static inline struct mtp_dev *func_to_mtp(struct usb_function *f) static struct usb_request *mtp_request_new(struct usb_ep *ep, int buffer_size) { struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL); + if (!req) return NULL; @@ -1121,6 +1122,7 @@ static int mtp_ctrlrequest(struct usb_composite_dev *cdev, } else if (ctrl->bRequest == MTP_REQ_GET_DEVICE_STATUS && w_index == 0 && w_value == 0) { struct mtp_device_status *status = cdev->req->buf; + status->wLength = 
__constant_cpu_to_le16(sizeof(*status)); @@ -1143,6 +1145,7 @@ static int mtp_ctrlrequest(struct usb_composite_dev *cdev, /* respond with data transfer or status phase? */ if (value >= 0) { int rc; + cdev->req->zero = value < w_length; cdev->req->length = value; rc = usb_ep_queue(cdev->gadget->ep0, cdev->req, GFP_ATOMIC); @@ -1378,6 +1381,7 @@ static struct mtp_instance *to_mtp_instance(struct config_item *item) static void mtp_attr_release(struct config_item *item) { struct mtp_instance *fi_mtp = to_mtp_instance(item); + usb_put_function_instance(&fi_mtp->func_inst); } From 1a8b993d92021b4f81da4b219a4214bdec06f541 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 26 Feb 2016 19:00:03 +0000 Subject: [PATCH 0538/3220] BACKPORT: staging: goldfish: audio: add devicetree bindings Introduce devicetree bindings to the Goldfish staging audio driver. Signed-off-by: Greg Hackmann Signed-off-by: Jin Qian Signed-off-by: Alan Cox Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 283ded10312a3b75e384313f6f529ec2c636cf2c) Change-Id: Ib75d3a4cac7353084a8da18a96fb298a759bacc0 --- .../devicetree/bindings/goldfish/audio.txt | 17 +++++++++++++++++ drivers/staging/goldfish/goldfish_audio.c | 9 ++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 Documentation/devicetree/bindings/goldfish/audio.txt diff --git a/Documentation/devicetree/bindings/goldfish/audio.txt b/Documentation/devicetree/bindings/goldfish/audio.txt new file mode 100644 index 000000000000..d043fda433ba --- /dev/null +++ b/Documentation/devicetree/bindings/goldfish/audio.txt @@ -0,0 +1,17 @@ +Android Goldfish Audio + +Android goldfish audio device generated by android emulator. + +Required properties: + +- compatible : should contain "google,goldfish-audio" to match emulator +- reg : +- interrupts : + +Example: + + goldfish_audio@9030000 { + compatible = "google,goldfish-audio"; + reg = <0x9030000 0x100>; + interrupts = <0x4>; + }; diff --git a/drivers/staging/goldfish/goldfish_audio.c b/drivers/staging/goldfish/goldfish_audio.c index b0927e49d0a8..8841680ced9f 100644 --- a/drivers/staging/goldfish/goldfish_audio.c +++ b/drivers/staging/goldfish/goldfish_audio.c @@ -344,11 +344,18 @@ static int goldfish_audio_remove(struct platform_device *pdev) return 0; } +static const struct of_device_id goldfish_audio_of_match[] = { + { .compatible = "google,goldfish-audio", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_audio_of_match); + static struct platform_driver goldfish_audio_driver = { .probe = goldfish_audio_probe, .remove = goldfish_audio_remove, .driver = { - .name = "goldfish_audio" + .name = "goldfish_audio", + .of_match_table = goldfish_audio_of_match, } }; From ac4d96f09a06c127d9d7509efbe6403a6b19d208 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 26 Feb 2016 18:45:30 +0000 Subject: [PATCH 0539/3220] BACKPORT: power: goldfish_battery: add devicetree bindings Add device tree bindings to the Goldfish virtual platform battery drivers. 
Signed-off-by: Greg Hackmann Signed-off-by: Jin Qian Signed-off-by: Alan Cox Signed-off-by: Sebastian Reichel (cherry picked from commit 65d687a7b7d6f27e4306fe8cc8a1ca66a1a760f6) Change-Id: If947ea3341ff0cb713c56e14d18d51a3f5912b64 --- .../devicetree/bindings/goldfish/battery.txt | 17 +++++++++++++++++ drivers/power/goldfish_battery.c | 9 ++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 Documentation/devicetree/bindings/goldfish/battery.txt diff --git a/Documentation/devicetree/bindings/goldfish/battery.txt b/Documentation/devicetree/bindings/goldfish/battery.txt new file mode 100644 index 000000000000..4fb613933214 --- /dev/null +++ b/Documentation/devicetree/bindings/goldfish/battery.txt @@ -0,0 +1,17 @@ +Android Goldfish Battery + +Android goldfish battery device generated by android emulator. + +Required properties: + +- compatible : should contain "google,goldfish-battery" to match emulator +- reg : +- interrupts : + +Example: + + goldfish_battery@9020000 { + compatible = "google,goldfish-battery"; + reg = <0x9020000 0x1000>; + interrupts = <0x3>; + }; diff --git a/drivers/power/goldfish_battery.c b/drivers/power/goldfish_battery.c index a50bb988c69a..7510796e190c 100644 --- a/drivers/power/goldfish_battery.c +++ b/drivers/power/goldfish_battery.c @@ -227,11 +227,18 @@ static int goldfish_battery_remove(struct platform_device *pdev) return 0; } +static const struct of_device_id goldfish_battery_of_match[] = { + { .compatible = "google,goldfish-battery", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_battery_of_match); + static struct platform_driver goldfish_battery_device = { .probe = goldfish_battery_probe, .remove = goldfish_battery_remove, .driver = { - .name = "goldfish-battery" + .name = "goldfish-battery", + .of_match_table = goldfish_battery_of_match, } }; module_platform_driver(goldfish_battery_device); From aec62280ca9d6091b08c979284ae41379549d506 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 26 Feb 2016 12:05:02 -0800 Subject: [PATCH 0540/3220] BACKPORT: Input: goldfish_events - add devicetree bindings Add device tree bindings to the Goldfish virtual platform event driver. Signed-off-by: Greg Hackmann Signed-off-by: Jin Qian Signed-off-by: Alan Signed-off-by: Dmitry Torokhov (cherry picked from commit 8c5dc5a1ada2b79259e55a4bd150135d23529c6a) Change-Id: I677d8e0d92294f53f7cc5a79300b6462b65e8aad --- .../devicetree/bindings/goldfish/events.txt | 17 +++++++++++++++++ drivers/input/keyboard/goldfish_events.c | 7 +++++++ 2 files changed, 24 insertions(+) create mode 100644 Documentation/devicetree/bindings/goldfish/events.txt diff --git a/Documentation/devicetree/bindings/goldfish/events.txt b/Documentation/devicetree/bindings/goldfish/events.txt new file mode 100644 index 000000000000..5babf46317a4 --- /dev/null +++ b/Documentation/devicetree/bindings/goldfish/events.txt @@ -0,0 +1,17 @@ +Android Goldfish Events Keypad + +Android goldfish events keypad device generated by android emulator. 
+ +Required properties: + +- compatible : should contain "google,goldfish-events-keypad" to match emulator +- reg : +- interrupts : + +Example: + + goldfish-events@9040000 { + compatible = "google,goldfish-events-keypad"; + reg = <0x9040000 0x1000>; + interrupts = <0x5>; + }; diff --git a/drivers/input/keyboard/goldfish_events.c b/drivers/input/keyboard/goldfish_events.c index 907e4e278fce..b11d218604a7 100644 --- a/drivers/input/keyboard/goldfish_events.c +++ b/drivers/input/keyboard/goldfish_events.c @@ -178,10 +178,17 @@ static int events_probe(struct platform_device *pdev) return 0; } +static const struct of_device_id goldfish_events_of_match[] = { + { .compatible = "google,goldfish-events-keypad", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_events_of_match); + static struct platform_driver events_driver = { .probe = events_probe, .driver = { .name = "goldfish_events", + .of_match_table = goldfish_events_of_match, }, }; From 3f5a380007fec7d4f6a0f361cf9da78a206e53c1 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 26 Feb 2016 19:01:05 +0000 Subject: [PATCH 0541/3220] BACKPORT: tty: goldfish: support platform_device with id -1 When the platform bus sets the platform_device id to -1 (PLATFORM_DEVID_NONE), use an incrementing counter for the TTY index instead Signed-off-by: Greg Hackmann Signed-off-by: Jin Qian Signed-off-by: Alan Cox Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 465893e18878e119d8d0255439fad8debbd646fd) Change-Id: Ifec5ee9d71c7c076e59bb7af77c0184d1b1383cb --- drivers/tty/goldfish.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/tty/goldfish.c b/drivers/tty/goldfish.c index 0f82c0b146f6..4356a23c588f 100644 --- a/drivers/tty/goldfish.c +++ b/drivers/tty/goldfish.c @@ -68,8 +68,7 @@ static void goldfish_tty_do_write(int line, const char *buf, unsigned count) static irqreturn_t goldfish_tty_interrupt(int irq, void *dev_id) { - struct platform_device *pdev = dev_id; - struct goldfish_tty *qtty = &goldfish_ttys[pdev->id]; + struct goldfish_tty *qtty = dev_id; void __iomem *base = qtty->base; unsigned long irq_flags; unsigned char *buf; @@ -233,6 +232,7 @@ static int goldfish_tty_probe(struct platform_device *pdev) struct device *ttydev; void __iomem *base; u32 irq; + unsigned int line; r = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (r == NULL) @@ -248,10 +248,16 @@ static int goldfish_tty_probe(struct platform_device *pdev) irq = r->start; - if (pdev->id >= goldfish_tty_line_count) - goto err_unmap; - mutex_lock(&goldfish_tty_lock); + + if (pdev->id == PLATFORM_DEVID_NONE) + line = goldfish_tty_current_line_count; + else + line = pdev->id; + + if (line >= goldfish_tty_line_count) + goto err_create_driver_failed; + if (goldfish_tty_current_line_count == 0) { ret = goldfish_tty_create_driver(); if (ret) @@ -259,7 +265,7 @@ static int goldfish_tty_probe(struct platform_device *pdev) } goldfish_tty_current_line_count++; - qtty = &goldfish_ttys[pdev->id]; + qtty = &goldfish_ttys[line]; spin_lock_init(&qtty->lock); tty_port_init(&qtty->port); qtty->port.ops = &goldfish_port_ops; @@ -269,13 +275,13 @@ static int goldfish_tty_probe(struct platform_device *pdev) writel(GOLDFISH_TTY_CMD_INT_DISABLE, base + GOLDFISH_TTY_CMD); ret = request_irq(irq, goldfish_tty_interrupt, IRQF_SHARED, - "goldfish_tty", pdev); + "goldfish_tty", qtty); if (ret) goto err_request_irq_failed; ttydev = tty_port_register_device(&qtty->port, goldfish_tty_driver, - pdev->id, &pdev->dev); + line, &pdev->dev); if 
(IS_ERR(ttydev)) { ret = PTR_ERR(ttydev); goto err_tty_register_device_failed; @@ -286,8 +292,9 @@ static int goldfish_tty_probe(struct platform_device *pdev) qtty->console.device = goldfish_tty_console_device; qtty->console.setup = goldfish_tty_console_setup; qtty->console.flags = CON_PRINTBUFFER; - qtty->console.index = pdev->id; + qtty->console.index = line; register_console(&qtty->console); + platform_set_drvdata(pdev, qtty); mutex_unlock(&goldfish_tty_lock); return 0; @@ -307,13 +314,12 @@ err_unmap: static int goldfish_tty_remove(struct platform_device *pdev) { - struct goldfish_tty *qtty; + struct goldfish_tty *qtty = platform_get_drvdata(pdev); mutex_lock(&goldfish_tty_lock); - qtty = &goldfish_ttys[pdev->id]; unregister_console(&qtty->console); - tty_unregister_device(goldfish_tty_driver, pdev->id); + tty_unregister_device(goldfish_tty_driver, qtty->console.index); iounmap(qtty->base); qtty->base = NULL; free_irq(qtty->irq, pdev); From b58abfa8bd400855bf09c84ae27dd2a4446c60f9 Mon Sep 17 00:00:00 2001 From: Miodrag Dinic Date: Fri, 26 Feb 2016 19:00:44 +0000 Subject: [PATCH 0542/3220] BACKPORT: drivers: tty: goldfish: Add device tree bindings Enable support for registering this device using the device tree. Device tree node example for registering Goldfish TTY device : goldfish_tty@1f004000 { interrupts = <0xc>; reg = <0x1f004000 0x1000>; compatible = "google,goldfish-tty"; }; Signed-off-by: Miodrag Dinic Signed-off-by: Jin Qian Signed-off-by: Alan Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 9b883eea26ccf043b608e398cf6a26231d44f5fb) Change-Id: Idbe1bbac4f371e2feb6730712b08b66be1188ea7 --- .../devicetree/bindings/goldfish/tty.txt | 17 +++++++++++++++++ drivers/tty/goldfish.c | 10 +++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 Documentation/devicetree/bindings/goldfish/tty.txt diff --git a/Documentation/devicetree/bindings/goldfish/tty.txt b/Documentation/devicetree/bindings/goldfish/tty.txt new file mode 100644 index 000000000000..82648278da77 --- /dev/null +++ b/Documentation/devicetree/bindings/goldfish/tty.txt @@ -0,0 +1,17 @@ +Android Goldfish TTY + +Android goldfish tty device generated by android emulator. + +Required properties: + +- compatible : should contain "google,goldfish-tty" to match emulator +- reg : +- interrupts : + +Example: + + goldfish_tty@1f004000 { + compatible = "google,goldfish-tty"; + reg = <0x1f004000 0x1000>; + interrupts = <0xc>; + }; diff --git a/drivers/tty/goldfish.c b/drivers/tty/goldfish.c index 4356a23c588f..1e332855b933 100644 --- a/drivers/tty/goldfish.c +++ b/drivers/tty/goldfish.c @@ -330,11 +330,19 @@ static int goldfish_tty_remove(struct platform_device *pdev) return 0; } +static const struct of_device_id goldfish_tty_of_match[] = { + { .compatible = "google,goldfish-tty", }, + {}, +}; + +MODULE_DEVICE_TABLE(of, goldfish_tty_of_match); + static struct platform_driver goldfish_tty_platform_driver = { .probe = goldfish_tty_probe, .remove = goldfish_tty_remove, .driver = { - .name = "goldfish_tty" + .name = "goldfish_tty", + .of_match_table = goldfish_tty_of_match, } }; From 779a63ef878a4d8cd601e791fdf81df9fd5f953a Mon Sep 17 00:00:00 2001 From: Yu Ning Date: Tue, 1 Mar 2016 23:46:10 +0000 Subject: [PATCH 0543/3220] BACKPORT: goldfish: Enable ACPI-based enumeration for goldfish battery Add the ACPI bindings to the goldfish battery driver. 
Signed-off-by: Yu Ning Signed-off-by: Jin Qian Signed-off-by: Alan Cox Signed-off-by: Sebastian Reichel (cherry picked from commit fdb2f37a54470473c6b7c9d680c4c114dd9bc434) Change-Id: I3b53481b5868b0b26848397420c9ba16a747819f --- drivers/power/goldfish_battery.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/power/goldfish_battery.c b/drivers/power/goldfish_battery.c index 7510796e190c..f5c525e4482a 100644 --- a/drivers/power/goldfish_battery.c +++ b/drivers/power/goldfish_battery.c @@ -24,6 +24,7 @@ #include #include #include +#include struct goldfish_battery_data { void __iomem *reg_base; @@ -233,12 +234,19 @@ static const struct of_device_id goldfish_battery_of_match[] = { }; MODULE_DEVICE_TABLE(of, goldfish_battery_of_match); +static const struct acpi_device_id goldfish_battery_acpi_match[] = { + { "GFSH0001", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, goldfish_battery_acpi_match); + static struct platform_driver goldfish_battery_device = { .probe = goldfish_battery_probe, .remove = goldfish_battery_remove, .driver = { .name = "goldfish-battery", .of_match_table = goldfish_battery_of_match, + .acpi_match_table = ACPI_PTR(goldfish_battery_acpi_match), } }; module_platform_driver(goldfish_battery_device); From 212048596dbaa5e81386c9594782c4928ed6efa1 Mon Sep 17 00:00:00 2001 From: Jason Hu Date: Fri, 26 Feb 2016 12:06:47 -0800 Subject: [PATCH 0544/3220] BACKPORT: Input: goldfish_events - enable ACPI-based enumeration for goldfish events Add ACPI binding to the goldfish events driver. Signed-off-by: Jason Hu Signed-off-by: Jin Qian Signed-off-by: Alan Signed-off-by: Dmitry Torokhov (cherry picked from commit 0581ce09fd2c976125a20791268d7206db156d2f) Change-Id: Ic3e4f1cffb111ea6c69977e63dd598e3fcb55f19 --- drivers/input/keyboard/goldfish_events.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/input/keyboard/goldfish_events.c b/drivers/input/keyboard/goldfish_events.c index b11d218604a7..f6e643b589b6 100644 --- a/drivers/input/keyboard/goldfish_events.c +++ b/drivers/input/keyboard/goldfish_events.c @@ -22,6 +22,7 @@ #include #include #include +#include enum { REG_READ = 0x00, @@ -184,11 +185,20 @@ static const struct of_device_id goldfish_events_of_match[] = { }; MODULE_DEVICE_TABLE(of, goldfish_events_of_match); +#ifdef CONFIG_ACPI +static const struct acpi_device_id goldfish_events_acpi_match[] = { + { "GFSH0002", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, goldfish_events_acpi_match); +#endif + static struct platform_driver events_driver = { .probe = events_probe, .driver = { .name = "goldfish_events", .of_match_table = goldfish_events_of_match, + .acpi_match_table = ACPI_PTR(goldfish_events_acpi_match), }, }; From f3d949fd57efc8d8a868517e0f64b77662388417 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 26 Feb 2016 19:00:18 +0000 Subject: [PATCH 0545/3220] BACKPORT: staging: goldfish: audio: fix compiliation on arm We do actually need slab.h, by luck we get it on other platforms but not always on ARM. Include it properly. 
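For context (an illustrative fragment, not part of the patch): linux/slab.h is where the slab allocator API is declared, so any file calling these helpers has to include it, directly or via another header, and on the configurations that broke nothing else pulled it in:

  #include <linux/slab.h>   /* declares kmalloc(), kzalloc(), kfree() */

  static void *example_buffer(size_t len)
  {
          /* without the include above, kzalloc() is undeclared here on
           * ARM configs where no other header drags slab.h in */
          return kzalloc(len, GFP_KERNEL);
  }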
Signed-off-by: Greg Hackmann Signed-off-by: Jin Qian Signed-off-by: Alan Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 4532150762ceb0d6fd765ebcb3ba6966fbb8faab) Change-Id: I93a0c35da40f26aaa7c253e3c0cefaa883ea3391 --- drivers/staging/goldfish/goldfish_audio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/staging/goldfish/goldfish_audio.c b/drivers/staging/goldfish/goldfish_audio.c index 8841680ced9f..4191054b878e 100644 --- a/drivers/staging/goldfish/goldfish_audio.c +++ b/drivers/staging/goldfish/goldfish_audio.c @@ -26,6 +26,7 @@ #include #include #include +#include #include MODULE_AUTHOR("Google, Inc."); From 95f38dcf1d106e62e97f207344978d6230a9c292 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Mon, 28 Oct 2013 15:33:33 -0700 Subject: [PATCH 0546/3220] ANDROID: video: goldfishfb: add devicetree bindings Change-Id: I5f4ba861b981edf39af537001f8ac72202927031 Signed-off-by: Greg Hackmann --- drivers/video/fbdev/goldfishfb.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/video/fbdev/goldfishfb.c b/drivers/video/fbdev/goldfishfb.c index 7f6c9e6cfc6c..f0e651bbbd61 100644 --- a/drivers/video/fbdev/goldfishfb.c +++ b/drivers/video/fbdev/goldfishfb.c @@ -304,12 +304,19 @@ static int goldfish_fb_remove(struct platform_device *pdev) return 0; } +static const struct of_device_id goldfish_fb_of_match[] = { + { .compatible = "google,goldfish-fb", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_fb_of_match); static struct platform_driver goldfish_fb_driver = { .probe = goldfish_fb_probe, .remove = goldfish_fb_remove, .driver = { - .name = "goldfish_fb" + .name = "goldfish_fb", + .owner = THIS_MODULE, + .of_match_table = goldfish_fb_of_match, } }; From 9cec6e315327318d99542f5ce695f62351cc2886 Mon Sep 17 00:00:00 2001 From: Yu Ning Date: Thu, 12 Feb 2015 11:44:40 +0800 Subject: [PATCH 0547/3220] ANDROID: goldfish: Enable ACPI-based enumeration for goldfish framebuffer Follow the same way in which ACPI was enabled for goldfish battery. See commit d3be10e for details. Note that this patch also depends on commit af33cac. Change-Id: Ic63b6e7e0a4b9896ef9a9d0ed135a7796a4c1fdb Signed-off-by: Yu Ning --- drivers/video/fbdev/goldfishfb.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/video/fbdev/goldfishfb.c b/drivers/video/fbdev/goldfishfb.c index f0e651bbbd61..58b33e4af16e 100644 --- a/drivers/video/fbdev/goldfishfb.c +++ b/drivers/video/fbdev/goldfishfb.c @@ -26,6 +26,7 @@ #include #include #include +#include enum { FB_GET_WIDTH = 0x00, @@ -310,6 +311,12 @@ static const struct of_device_id goldfish_fb_of_match[] = { }; MODULE_DEVICE_TABLE(of, goldfish_fb_of_match); +static const struct acpi_device_id goldfish_fb_acpi_match[] = { + { "GFSH0004", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, goldfish_fb_acpi_match); + static struct platform_driver goldfish_fb_driver = { .probe = goldfish_fb_probe, .remove = goldfish_fb_remove, @@ -317,6 +324,7 @@ static struct platform_driver goldfish_fb_driver = { .name = "goldfish_fb", .owner = THIS_MODULE, .of_match_table = goldfish_fb_of_match, + .acpi_match_table = ACPI_PTR(goldfish_fb_acpi_match), } }; From a666500dd7205043280ed38c2bca81854b4067be Mon Sep 17 00:00:00 2001 From: Yu Ning Date: Tue, 31 Mar 2015 14:41:48 +0800 Subject: [PATCH 0548/3220] ANDROID: goldfish: Enable ACPI-based enumeration for goldfish audio Follow the same way in which ACPI was enabled for goldfish battery. See commit d3be10e for details. 
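An aside on the ACPI_PTR() wrapper these goldfish patches use: it compiles the match-table reference away when ACPI support is configured out. Its definition in include/linux/acpi.h boils down to:

  #ifdef CONFIG_ACPI
  #define ACPI_PTR(_ptr)  (_ptr)
  #else
  #define ACPI_PTR(_ptr)  (NULL)
  #endif

This lets a driver set .acpi_match_table unconditionally while guarding the table itself behind #ifdef CONFIG_ACPI, as the goldfish_events patch above does.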
Change-Id: I6ffe38ebc80fb8af8322152370b9d1fd227eaf50 Signed-off-by: Yu Ning --- drivers/staging/goldfish/goldfish_audio.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/staging/goldfish/goldfish_audio.c b/drivers/staging/goldfish/goldfish_audio.c index 4191054b878e..83bbe459b93a 100644 --- a/drivers/staging/goldfish/goldfish_audio.c +++ b/drivers/staging/goldfish/goldfish_audio.c @@ -28,6 +28,7 @@ #include #include #include +#include MODULE_AUTHOR("Google, Inc."); MODULE_DESCRIPTION("Android QEMU Audio Driver"); @@ -351,12 +352,19 @@ static const struct of_device_id goldfish_audio_of_match[] = { }; MODULE_DEVICE_TABLE(of, goldfish_audio_of_match); +static const struct acpi_device_id goldfish_audio_acpi_match[] = { + { "GFSH0005", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, goldfish_audio_acpi_match); + static struct platform_driver goldfish_audio_driver = { .probe = goldfish_audio_probe, .remove = goldfish_audio_remove, .driver = { .name = "goldfish_audio", .of_match_table = goldfish_audio_of_match, + .acpi_match_table = ACPI_PTR(goldfish_audio_acpi_match), } }; From d5a5ff91f852f26d7261cb31cb126f267567af24 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 19 Jun 2014 16:24:04 +0200 Subject: [PATCH 0549/3220] ANDROID: goldfish_fb: Set pixclock = 0 User space Android code identifies pixclock == 0 as a sign for emulation and will set the frame rate to 60 fps when reading this value, which is the desired outcome. Change-Id: I759bf518bf6683446bc786bf1be3cafa02dd8d42 Signed-off-by: Christoffer Dall Signed-off-by: Peter Maydell --- drivers/video/fbdev/goldfishfb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/goldfishfb.c b/drivers/video/fbdev/goldfishfb.c index 58b33e4af16e..131fee0341c3 100644 --- a/drivers/video/fbdev/goldfishfb.c +++ b/drivers/video/fbdev/goldfishfb.c @@ -235,7 +235,7 @@ static int goldfish_fb_probe(struct platform_device *pdev) fb->fb.var.activate = FB_ACTIVATE_NOW; fb->fb.var.height = readl(fb->reg_base + FB_GET_PHYS_HEIGHT); fb->fb.var.width = readl(fb->reg_base + FB_GET_PHYS_WIDTH); - fb->fb.var.pixclock = 10000; + fb->fb.var.pixclock = 0; fb->fb.var.red.offset = 11; fb->fb.var.red.length = 5; From 8bf12bc1b78dac6cb4fb7e4fbc0920978d17f5ea Mon Sep 17 00:00:00 2001 From: Lingfeng Yang Date: Fri, 18 Dec 2015 12:04:43 -0800 Subject: [PATCH 0550/3220] ANDROID: goldfish_events: no extra EV_SYN; register goldfish If we send SYN_REPORT on every single multitouch event, it breaks multitouch: input becomes janky, taps have to be repeated 2-3 times to register, and notification bars are sometimes activated without being touched. If we suppress these extra SYN_REPORTs, multitouch works fine and the event stream follows the expected protocol. In addition, we need to register Goldfish Events as a multitouch device by issuing input_mt_init_slots, otherwise input_handle_abs_event in drivers/input/input.c will silently drop all ABS_MT_SLOT events, so touches with more than one finger do not work properly.
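To see why the extra SYN_REPORTs are harmful, compare with the canonical protocol-B frame for a two-finger touch from the kernel's multi-touch protocol documentation (coordinates are placeholders):

  ABS_MT_SLOT 0
  ABS_MT_TRACKING_ID 45
  ABS_MT_POSITION_X x[0]
  ABS_MT_POSITION_Y y[0]
  ABS_MT_SLOT 1
  ABS_MT_TRACKING_ID 46
  ABS_MT_POSITION_X x[1]
  ABS_MT_POSITION_Y y[1]
  SYN_REPORT

A single SYN_REPORT terminates the whole frame; emitting one after every event above would chop the frame into pieces and confuse userspace, which is exactly the jankiness described here.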
Signed-off-by: "Lingfeng Yang" Change-Id: Ib2350f7d1732449d246f6f0d9b7b08f02cc7c2dd (cherry picked from commit 6cf40d0a16330e1ef42bdf07d9aba6c16ee11fbc) --- drivers/input/keyboard/goldfish_events.c | 28 +++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/input/keyboard/goldfish_events.c b/drivers/input/keyboard/goldfish_events.c index f6e643b589b6..c877e56a9bd5 100644 --- a/drivers/input/keyboard/goldfish_events.c +++ b/drivers/input/keyboard/goldfish_events.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,8 @@ #include #include +#define GOLDFISH_MAX_FINGERS 5 + enum { REG_READ = 0x00, REG_SET_PAGE = 0x00, @@ -52,7 +55,21 @@ static irqreturn_t events_interrupt(int irq, void *dev_id) value = __raw_readl(edev->addr + REG_READ); input_event(edev->input, type, code, value); - input_sync(edev->input); + // Send an extra (EV_SYN, SYN_REPORT, 0x0) event + // if a key was pressed. Some keyboard device + // drivers may only send the EV_KEY event and + // not EV_SYN. + // Note that sending an extra SYN_REPORT is not + // necessary nor correct protocol with other + // devices such as touchscreens, which will send + // their own SYN_REPORT's when sufficient event + // information has been collected (e.g., for + // touchscreens, when pressure and X/Y coordinates + // have been received). Hence, we will only send + // this extra SYN_REPORT if type == EV_KEY. + if (type == EV_KEY) { + input_sync(edev->input); + } return IRQ_HANDLED; } @@ -154,6 +171,15 @@ static int events_probe(struct platform_device *pdev) input_dev->name = edev->name; input_dev->id.bustype = BUS_HOST; + // Set the Goldfish Device to be multi-touch. + // In the Ranchu kernel, there is multi-touch-specific + // code for handling ABS_MT_SLOT events. + // See drivers/input/input.c:input_handle_abs_event. + // If we do not issue input_mt_init_slots, + // the kernel will filter out needed ABS_MT_SLOT + // events when we touch the screen in more than one place, + // preventing multi-touch with more than one finger from working. + input_mt_init_slots(input_dev, GOLDFISH_MAX_FINGERS, 0); events_import_bits(edev, input_dev->evbit, EV_SYN, EV_MAX); events_import_bits(edev, input_dev->keybit, EV_KEY, KEY_MAX); From 18e334986224af1dc9d1a61dd8957f07919c76d0 Mon Sep 17 00:00:00 2001 From: Joshua Lang Date: Fri, 17 Jun 2016 17:30:55 -0700 Subject: [PATCH 0551/3220] ANDROID: goldfish_audio: Clear audio read buffer status after each read MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The buffer_status field is interrupt updated. After every read request, the buffer_status read field should be reset so that on the next loop iteration we don't read a stale value and read data before the device is ready. 
Signed-off-by: “Joshua Lang” Change-Id: I4943d5aaada1cad9c7e59a94a87c387578dabe86 --- drivers/staging/goldfish/goldfish_audio.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/staging/goldfish/goldfish_audio.c b/drivers/staging/goldfish/goldfish_audio.c index 83bbe459b93a..63b79c09b41b 100644 --- a/drivers/staging/goldfish/goldfish_audio.c +++ b/drivers/staging/goldfish/goldfish_audio.c @@ -117,6 +117,7 @@ static ssize_t goldfish_audio_read(struct file *fp, char __user *buf, size_t count, loff_t *pos) { struct goldfish_audio *data = fp->private_data; + unsigned long irq_flags; int length; int result = 0; @@ -130,6 +131,10 @@ static ssize_t goldfish_audio_read(struct file *fp, char __user *buf, wait_event_interruptible(data->wait, data->buffer_status & AUDIO_INT_READ_BUFFER_FULL); + spin_lock_irqsave(&data->lock, irq_flags); + data->buffer_status &= ~AUDIO_INT_READ_BUFFER_FULL; + spin_unlock_irqrestore(&data->lock, irq_flags); + length = AUDIO_READ(data, AUDIO_READ_BUFFER_AVAILABLE); /* copy data to user space */ From 0c1529aecf8fb719cf49de4d5a452b855e437387 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Fri, 7 Oct 2016 16:20:47 -0700 Subject: [PATCH 0552/3220] ANDROID: goldfish: add ranchu defconfigs Change-Id: I73ef1b132b6203ae921a1e1d4850eaadf58f8926 --- arch/arm/configs/ranchu_defconfig | 315 +++++++++++++++++ arch/arm64/configs/ranchu_defconfig | 311 +++++++++++++++++ arch/x86/configs/i386_ranchu_defconfig | 422 +++++++++++++++++++++++ arch/x86/configs/x86_64_ranchu_defconfig | 417 ++++++++++++++++++++++ 4 files changed, 1465 insertions(+) create mode 100644 arch/arm/configs/ranchu_defconfig create mode 100644 arch/arm64/configs/ranchu_defconfig create mode 100644 arch/x86/configs/i386_ranchu_defconfig create mode 100644 arch/x86/configs/x86_64_ranchu_defconfig diff --git a/arch/arm/configs/ranchu_defconfig b/arch/arm/configs/ranchu_defconfig new file mode 100644 index 000000000000..35a90af941a4 --- /dev/null +++ b/arch/arm/configs/ranchu_defconfig @@ -0,0 +1,315 @@ +# CONFIG_LOCALVERSION_AUTO is not set +CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=14 +CONFIG_CGROUPS=y +CONFIG_CGROUP_DEBUG=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CPUSETS=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_SCHED=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_KALLSYMS_ALL=y +CONFIG_EMBEDDED=y +CONFIG_PROFILING=y +CONFIG_OPROFILE=y +CONFIG_ARCH_MMAP_RND_BITS=16 +# CONFIG_BLK_DEV_BSG is not set +# CONFIG_IOSCHED_DEADLINE is not set +# CONFIG_IOSCHED_CFQ is not set +CONFIG_ARCH_VIRT=y +CONFIG_ARM_KERNMEM_PERMS=y +CONFIG_SMP=y +CONFIG_PREEMPT=y +CONFIG_AEABI=y +CONFIG_HIGHMEM=y +CONFIG_KSM=y +CONFIG_SECCOMP=y +CONFIG_CMDLINE="console=ttyAMA0" +CONFIG_VFP=y +CONFIG_NEON=y +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +# CONFIG_PM_WAKELOCKS_GC is not set +CONFIG_PM_DEBUG=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_XFRM_USER=y +CONFIG_NET_KEY=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_INET_ESP=y +# CONFIG_INET_LRO is not set +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_MULTIPLE_TABLES=y 
+CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y +CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y +CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_QTAGUID=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_QUOTA2=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MATCH_AH=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_RAW=y +CONFIG_BRIDGE=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NET_CLS_U32=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_CLS_ACT=y +# CONFIG_WIRELESS is not set +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_MTD=y +CONFIG_MTD_CMDLINE_PARTS=y +CONFIG_MTD_BLOCK=y +CONFIG_MTD_CFI=y +CONFIG_MTD_CFI_INTELEXT=y +CONFIG_MTD_CFI_AMDSTD=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_VIRTIO_BLK=y +CONFIG_MD=y +CONFIG_BLK_DEV_DM=y +CONFIG_DM_CRYPT=y +CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_NETDEVICES=y +CONFIG_TUN=y +CONFIG_VIRTIO_NET=y +CONFIG_SMSC911X=y +CONFIG_PPP=y +CONFIG_PPP_BSDCOMP=y +CONFIG_PPP_DEFLATE=y +CONFIG_PPP_MPPE=y +CONFIG_PPPOLAC=y +CONFIG_PPPOPNS=y +CONFIG_USB_USBNET=y +# CONFIG_WLAN is not set +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_KEYRESET=y +CONFIG_KEYBOARD_GOLDFISH_EVENTS=y +# CONFIG_INPUT_MOUSE is not set +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_XPAD=y +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=y +CONFIG_TABLET_USB_AIPTEK=y +CONFIG_TABLET_USB_GTCO=y +CONFIG_TABLET_USB_HANWANG=y +CONFIG_TABLET_USB_KBTAB=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_KEYCHORD=y +CONFIG_INPUT_UINPUT=y +CONFIG_INPUT_GPIO=y +# CONFIG_SERIO_SERPORT is not set +CONFIG_SERIO_AMBAKMI=y +# CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is not set +# CONFIG_DEVMEM is not 
set +# CONFIG_DEVKMEM is not set +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=y +# CONFIG_HW_RANDOM is not set +# CONFIG_HWMON is not set +CONFIG_MEDIA_SUPPORT=y +CONFIG_FB=y +CONFIG_FB_GOLDFISH=y +CONFIG_FB_SIMPLE=y +CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_LOGO=y +# CONFIG_LOGO_LINUX_MONO is not set +# CONFIG_LOGO_LINUX_VGA16 is not set +CONFIG_SOUND=y +CONFIG_SND=y +CONFIG_HIDRAW=y +CONFIG_UHID=y +CONFIG_HID_A4TECH=y +CONFIG_HID_ACRUX=y +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=y +CONFIG_HID_BELKIN=y +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_CYPRESS=y +CONFIG_HID_DRAGONRISE=y +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=y +CONFIG_HID_ELECOM=y +CONFIG_HID_EZKEY=y +CONFIG_HID_HOLTEK=y +CONFIG_HID_KEYTOUCH=y +CONFIG_HID_KYE=y +CONFIG_HID_UCLOGIC=y +CONFIG_HID_WALTOP=y +CONFIG_HID_GYRATION=y +CONFIG_HID_TWINHAN=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_LCPOWER=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_NTRIG=y +CONFIG_HID_ORTEK=y +CONFIG_HID_PANTHERLORD=y +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PETALYNX=y +CONFIG_HID_PICOLCD=y +CONFIG_HID_PRIMAX=y +CONFIG_HID_ROCCAT=y +CONFIG_HID_SAITEK=y +CONFIG_HID_SAMSUNG=y +CONFIG_HID_SONY=y +CONFIG_HID_SPEEDLINK=y +CONFIG_HID_SUNPLUS=y +CONFIG_HID_GREENASIA=y +CONFIG_GREENASIA_FF=y +CONFIG_HID_SMARTJOYPLUS=y +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=y +CONFIG_HID_TOPSEED=y +CONFIG_HID_THRUSTMASTER=y +CONFIG_HID_WACOM=y +CONFIG_HID_WIIMOTE=y +CONFIG_HID_ZEROPLUS=y +CONFIG_HID_ZYDACRON=y +CONFIG_USB_HIDDEV=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_OTG_WAKELOCK=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_DRV_PL031=y +CONFIG_VIRTIO_MMIO=y +CONFIG_STAGING=y +CONFIG_ASHMEM=y +CONFIG_ANDROID_LOW_MEMORY_KILLER=y +CONFIG_SYNC=y +CONFIG_SW_SYNC=y +CONFIG_SW_SYNC_USER=y +CONFIG_ION=y +CONFIG_GOLDFISH_AUDIO=y +CONFIG_GOLDFISH=y +CONFIG_GOLDFISH_PIPE=y +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_QUOTA=y +CONFIG_FUSE_FS=y +CONFIG_CUSE=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_PSTORE=y +CONFIG_PSTORE_CONSOLE=y +CONFIG_PSTORE_RAM=y +CONFIG_NFS_FS=y +CONFIG_ROOT_NFS=y +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ISO8859_1=y +CONFIG_DEBUG_INFO=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_PANIC_TIMEOUT=5 +# CONFIG_SCHED_DEBUG is not set +CONFIG_SCHEDSTATS=y +CONFIG_TIMER_STATS=y +CONFIG_ENABLE_DEFAULT_TRACERS=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y +CONFIG_VIRTUALIZATION=y diff --git a/arch/arm64/configs/ranchu_defconfig b/arch/arm64/configs/ranchu_defconfig new file mode 100644 index 000000000000..00eb346e0928 --- /dev/null +++ b/arch/arm64/configs/ranchu_defconfig @@ -0,0 +1,311 @@ +# CONFIG_LOCALVERSION_AUTO is not set +# CONFIG_SWAP is not set +CONFIG_POSIX_MQUEUE=y +CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BSD_PROCESS_ACCT_V3=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=14 +CONFIG_CGROUP_DEBUG=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_SCHED_AUTOGROUP=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_KALLSYMS_ALL=y +CONFIG_EMBEDDED=y +# CONFIG_COMPAT_BRK is 
not set +CONFIG_PROFILING=y +CONFIG_ARCH_MMAP_RND_BITS=24 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16 +# CONFIG_BLK_DEV_BSG is not set +# CONFIG_IOSCHED_DEADLINE is not set +CONFIG_ARCH_VEXPRESS=y +CONFIG_NR_CPUS=4 +CONFIG_PREEMPT=y +CONFIG_KSM=y +CONFIG_SECCOMP=y +CONFIG_ARMV8_DEPRECATED=y +CONFIG_SWP_EMULATION=y +CONFIG_CP15_BARRIER_EMULATION=y +CONFIG_SETEND_EMULATION=y +CONFIG_CMDLINE="console=ttyAMA0" +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +CONFIG_COMPAT=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +# CONFIG_PM_WAKELOCKS_GC is not set +CONFIG_PM_DEBUG=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_XFRM_USER=y +CONFIG_NET_KEY=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_INET_ESP=y +# CONFIG_INET_LRO is not set +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y +CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y +CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_QTAGUID=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_QUOTA2=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MATCH_AH=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_RPFILTER=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_TARGET_ECN=y +CONFIG_IP_NF_TARGET_TTL=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_MATCH_AH=y +CONFIG_IP6_NF_MATCH_EUI64=y +CONFIG_IP6_NF_MATCH_FRAG=y +CONFIG_IP6_NF_MATCH_OPTS=y +CONFIG_IP6_NF_MATCH_HL=y +CONFIG_IP6_NF_MATCH_IPV6HEADER=y +CONFIG_IP6_NF_MATCH_MH=y +CONFIG_IP6_NF_MATCH_RT=y +CONFIG_IP6_NF_TARGET_HL=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IP6_NF_MANGLE=y 
+CONFIG_IP6_NF_RAW=y +CONFIG_BRIDGE=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NET_CLS_U32=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_CLS_ACT=y +# CONFIG_WIRELESS is not set +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_VIRTIO_BLK=y +CONFIG_SCSI=y +# CONFIG_SCSI_PROC_FS is not set +CONFIG_BLK_DEV_SD=y +# CONFIG_SCSI_LOWLEVEL is not set +CONFIG_MD=y +CONFIG_BLK_DEV_DM=y +CONFIG_DM_CRYPT=y +CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_NETDEVICES=y +CONFIG_TUN=y +CONFIG_VIRTIO_NET=y +CONFIG_SMC91X=y +CONFIG_PPP=y +CONFIG_PPP_BSDCOMP=y +CONFIG_PPP_DEFLATE=y +CONFIG_PPP_MPPE=y +CONFIG_PPPOLAC=y +CONFIG_PPPOPNS=y +# CONFIG_WLAN is not set +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_KEYRESET=y +CONFIG_KEYBOARD_GOLDFISH_EVENTS=y +# CONFIG_INPUT_MOUSE is not set +CONFIG_INPUT_JOYSTICK=y +CONFIG_INPUT_TABLET=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_KEYCHORD=y +CONFIG_INPUT_UINPUT=y +CONFIG_INPUT_GPIO=y +# CONFIG_SERIO_SERPORT is not set +# CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is not set +# CONFIG_DEVMEM is not set +# CONFIG_DEVKMEM is not set +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=y +# CONFIG_HW_RANDOM is not set +CONFIG_BATTERY_GOLDFISH=y +# CONFIG_HWMON is not set +CONFIG_MEDIA_SUPPORT=y +CONFIG_FB=y +CONFIG_FB_GOLDFISH=y +CONFIG_FB_SIMPLE=y +CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_LOGO=y +# CONFIG_LOGO_LINUX_MONO is not set +# CONFIG_LOGO_LINUX_VGA16 is not set +CONFIG_SOUND=y +CONFIG_SND=y +CONFIG_HIDRAW=y +CONFIG_UHID=y +CONFIG_HID_A4TECH=y +CONFIG_HID_ACRUX=y +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=y +CONFIG_HID_BELKIN=y +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_CYPRESS=y +CONFIG_HID_DRAGONRISE=y +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=y +CONFIG_HID_ELECOM=y +CONFIG_HID_EZKEY=y +CONFIG_HID_KEYTOUCH=y +CONFIG_HID_KYE=y +CONFIG_HID_WALTOP=y +CONFIG_HID_GYRATION=y +CONFIG_HID_TWINHAN=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_LCPOWER=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_ORTEK=y +CONFIG_HID_PANTHERLORD=y +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PETALYNX=y +CONFIG_HID_PICOLCD=y +CONFIG_HID_PRIMAX=y +CONFIG_HID_SAITEK=y +CONFIG_HID_SAMSUNG=y +CONFIG_HID_SPEEDLINK=y +CONFIG_HID_SUNPLUS=y +CONFIG_HID_GREENASIA=y +CONFIG_GREENASIA_FF=y +CONFIG_HID_SMARTJOYPLUS=y +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=y +CONFIG_HID_TOPSEED=y +CONFIG_HID_THRUSTMASTER=y +CONFIG_HID_WACOM=y +CONFIG_HID_WIIMOTE=y +CONFIG_HID_ZEROPLUS=y +CONFIG_HID_ZYDACRON=y +# CONFIG_USB_SUPPORT is not set +CONFIG_RTC_CLASS=y +CONFIG_VIRTIO_MMIO=y +CONFIG_STAGING=y +CONFIG_ASHMEM=y +CONFIG_ANDROID_TIMED_GPIO=y +CONFIG_ANDROID_LOW_MEMORY_KILLER=y +CONFIG_SYNC=y +CONFIG_SW_SYNC=y +CONFIG_SW_SYNC_USER=y +CONFIG_ION=y +CONFIG_GOLDFISH_AUDIO=y +CONFIG_GOLDFISH=y +CONFIG_GOLDFISH_PIPE=y +# CONFIG_IOMMU_SUPPORT is not set +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_EXT2_FS=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_QUOTA=y +CONFIG_FUSE_FS=y +CONFIG_CUSE=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +# CONFIG_MISC_FILESYSTEMS is not set +CONFIG_NFS_FS=y +CONFIG_ROOT_NFS=y +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ISO8859_1=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_FS=y +CONFIG_MAGIC_SYSRQ=y 
+CONFIG_PANIC_TIMEOUT=5 +# CONFIG_SCHED_DEBUG is not set +CONFIG_SCHEDSTATS=y +CONFIG_TIMER_STATS=y +# CONFIG_FTRACE is not set +CONFIG_ATOMIC64_SELFTEST=y +CONFIG_DEBUG_RODATA=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y diff --git a/arch/x86/configs/i386_ranchu_defconfig b/arch/x86/configs/i386_ranchu_defconfig new file mode 100644 index 000000000000..b0e4e0ed4b11 --- /dev/null +++ b/arch/x86/configs/i386_ranchu_defconfig @@ -0,0 +1,422 @@ +# CONFIG_64BIT is not set +# CONFIG_LOCALVERSION_AUTO is not set +CONFIG_POSIX_MQUEUE=y +CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_DEBUG=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_SCHED=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_SYSCTL_SYSCALL=y +CONFIG_KALLSYMS_ALL=y +CONFIG_EMBEDDED=y +# CONFIG_COMPAT_BRK is not set +CONFIG_ARCH_MMAP_RND_BITS=16 +CONFIG_PARTITION_ADVANCED=y +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_SGI_PARTITION=y +CONFIG_SUN_PARTITION=y +CONFIG_KARMA_PARTITION=y +CONFIG_SMP=y +CONFIG_X86_BIGSMP=y +CONFIG_MCORE2=y +CONFIG_X86_GENERIC=y +CONFIG_HPET_TIMER=y +CONFIG_NR_CPUS=512 +CONFIG_PREEMPT=y +# CONFIG_X86_MCE is not set +CONFIG_X86_REBOOTFIXUPS=y +CONFIG_X86_MSR=y +CONFIG_X86_CPUID=y +CONFIG_KSM=y +CONFIG_CMA=y +# CONFIG_MTRR_SANITIZER is not set +CONFIG_EFI=y +CONFIG_EFI_STUB=y +CONFIG_HZ_100=y +CONFIG_PHYSICAL_START=0x100000 +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +# CONFIG_PM_WAKELOCKS_GC is not set +CONFIG_PM_DEBUG=y +CONFIG_CPU_FREQ=y +# CONFIG_CPU_FREQ_STAT is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_PCIEPORTBUS=y +# CONFIG_PCIEASPM is not set +CONFIG_PCCARD=y +CONFIG_YENTA=y +CONFIG_HOTPLUG_PCI=y +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +CONFIG_BINFMT_MISC=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_XFRM_USER=y +CONFIG_NET_KEY=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_INET_ESP=y +# CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_INET_LRO is not set +# CONFIG_INET_DIAG is not set +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_NETLABEL=y +CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y 
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y +CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_QTAGUID=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_QUOTA2=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MATCH_AH=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_RAW=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NET_CLS_U32=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_CLS_ACT=y +CONFIG_CFG80211=y +CONFIG_MAC80211=y +CONFIG_MAC80211_LEDS=y +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_DMA_CMA=y +CONFIG_CMA_SIZE_MBYTES=16 +CONFIG_CONNECTOR=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_SD=y +CONFIG_BLK_DEV_SR=y +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_SG=y +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_SPI_ATTRS=y +CONFIG_SCSI_ISCSI_ATTRS=y +# CONFIG_SCSI_LOWLEVEL is not set +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_ATA_PIIX=y +CONFIG_PATA_AMD=y +CONFIG_PATA_OLDPIIX=y +CONFIG_PATA_SCH=y +CONFIG_PATA_MPIIX=y +CONFIG_ATA_GENERIC=y +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +CONFIG_BLK_DEV_DM=y +CONFIG_DM_DEBUG=y +CONFIG_DM_CRYPT=y +CONFIG_DM_MIRROR=y +CONFIG_DM_ZERO=y +CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_NETDEVICES=y +CONFIG_NETCONSOLE=y +CONFIG_TUN=y +CONFIG_VIRTIO_NET=y +CONFIG_BNX2=y +CONFIG_TIGON3=y +CONFIG_NET_TULIP=y +CONFIG_E100=y +CONFIG_E1000=y +CONFIG_E1000E=y +CONFIG_SKY2=y +CONFIG_NE2K_PCI=y +CONFIG_FORCEDETH=y +CONFIG_8139TOO=y +# CONFIG_8139TOO_PIO is not set +CONFIG_R8169=y +CONFIG_FDDI=y +CONFIG_PPP=y +CONFIG_PPP_BSDCOMP=y +CONFIG_PPP_DEFLATE=y +CONFIG_PPP_MPPE=y +CONFIG_PPPOLAC=y +CONFIG_PPPOPNS=y +CONFIG_USB_USBNET=y +CONFIG_INPUT_POLLDEV=y +# CONFIG_INPUT_MOUSEDEV_PSAUX is not set +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_KEYRESET=y +# CONFIG_KEYBOARD_ATKBD is not set +CONFIG_KEYBOARD_GOLDFISH_EVENTS=y +# CONFIG_INPUT_MOUSE is not set +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_XPAD=y +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=y +CONFIG_TABLET_USB_AIPTEK=y +CONFIG_TABLET_USB_GTCO=y +CONFIG_TABLET_USB_HANWANG=y +CONFIG_TABLET_USB_KBTAB=y +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_KEYCHORD=y +CONFIG_INPUT_UINPUT=y +CONFIG_INPUT_GPIO=y +# CONFIG_SERIO is not set +# CONFIG_VT is not set 
+# CONFIG_LEGACY_PTYS is not set +CONFIG_SERIAL_NONSTANDARD=y +# CONFIG_DEVMEM is not set +# CONFIG_DEVKMEM is not set +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=y +CONFIG_NVRAM=y +CONFIG_I2C_I801=y +CONFIG_BATTERY_GOLDFISH=y +CONFIG_WATCHDOG=y +CONFIG_MEDIA_SUPPORT=y +CONFIG_AGP=y +CONFIG_AGP_AMD64=y +CONFIG_AGP_INTEL=y +CONFIG_DRM=y +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y +CONFIG_FB_EFI=y +CONFIG_FB_GOLDFISH=y +CONFIG_BACKLIGHT_LCD_SUPPORT=y +# CONFIG_LCD_CLASS_DEVICE is not set +CONFIG_SOUND=y +CONFIG_SND=y +CONFIG_HIDRAW=y +CONFIG_UHID=y +CONFIG_HID_A4TECH=y +CONFIG_HID_ACRUX=y +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=y +CONFIG_HID_BELKIN=y +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_CYPRESS=y +CONFIG_HID_DRAGONRISE=y +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=y +CONFIG_HID_ELECOM=y +CONFIG_HID_EZKEY=y +CONFIG_HID_HOLTEK=y +CONFIG_HID_KEYTOUCH=y +CONFIG_HID_KYE=y +CONFIG_HID_UCLOGIC=y +CONFIG_HID_WALTOP=y +CONFIG_HID_GYRATION=y +CONFIG_HID_TWINHAN=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_LCPOWER=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_NTRIG=y +CONFIG_HID_ORTEK=y +CONFIG_HID_PANTHERLORD=y +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PETALYNX=y +CONFIG_HID_PICOLCD=y +CONFIG_HID_PRIMAX=y +CONFIG_HID_ROCCAT=y +CONFIG_HID_SAITEK=y +CONFIG_HID_SAMSUNG=y +CONFIG_HID_SONY=y +CONFIG_HID_SPEEDLINK=y +CONFIG_HID_SUNPLUS=y +CONFIG_HID_GREENASIA=y +CONFIG_GREENASIA_FF=y +CONFIG_HID_SMARTJOYPLUS=y +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=y +CONFIG_HID_TOPSEED=y +CONFIG_HID_THRUSTMASTER=y +CONFIG_HID_WACOM=y +CONFIG_HID_WIIMOTE=y +CONFIG_HID_ZEROPLUS=y +CONFIG_HID_ZYDACRON=y +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y +CONFIG_USB_MON=y +CONFIG_USB_EHCI_HCD=y +# CONFIG_USB_EHCI_TT_NEWSCHED is not set +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_UHCI_HCD=y +CONFIG_USB_PRINTER=y +CONFIG_USB_STORAGE=y +CONFIG_USB_OTG_WAKELOCK=y +CONFIG_EDAC=y +CONFIG_RTC_CLASS=y +# CONFIG_RTC_HCTOSYS is not set +CONFIG_DMADEVICES=y +CONFIG_VIRTIO_PCI=y +CONFIG_STAGING=y +CONFIG_ASHMEM=y +CONFIG_ANDROID_LOW_MEMORY_KILLER=y +CONFIG_SYNC=y +CONFIG_SW_SYNC=y +CONFIG_ION=y +CONFIG_GOLDFISH_AUDIO=y +CONFIG_SND_HDA_INTEL=y +CONFIG_GOLDFISH=y +CONFIG_GOLDFISH_PIPE=y +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +CONFIG_FUSE_FS=y +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_HUGETLBFS=y +CONFIG_PSTORE=y +CONFIG_PSTORE_CONSOLE=y +CONFIG_PSTORE_RAM=y +# CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ASCII=y +CONFIG_NLS_ISO8859_1=y +CONFIG_NLS_UTF8=y +CONFIG_PRINTK_TIME=y +CONFIG_DEBUG_INFO=y +# CONFIG_ENABLE_WARN_DEPRECATED is not set +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=2048 +# CONFIG_UNUSED_SYMBOLS is not set +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_MEMORY_INIT=y +CONFIG_PANIC_TIMEOUT=5 +CONFIG_SCHEDSTATS=y +CONFIG_TIMER_STATS=y +CONFIG_SCHED_TRACER=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_PROVIDE_OHCI1394_DMA_INIT=y +CONFIG_KEYS=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y 
+CONFIG_CRYPTO_AES_586=y +CONFIG_CRYPTO_TWOFISH=y +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS7_MESSAGE_PARSER=y +CONFIG_PKCS7_TEST_KEY=y +# CONFIG_VIRTUALIZATION is not set +CONFIG_CRC_T10DIF=y diff --git a/arch/x86/configs/x86_64_ranchu_defconfig b/arch/x86/configs/x86_64_ranchu_defconfig new file mode 100644 index 000000000000..8dae21ed3ede --- /dev/null +++ b/arch/x86/configs/x86_64_ranchu_defconfig @@ -0,0 +1,417 @@ +# CONFIG_LOCALVERSION_AUTO is not set +CONFIG_POSIX_MQUEUE=y +CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_DEBUG=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_SCHED=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_SYSCTL_SYSCALL=y +CONFIG_KALLSYMS_ALL=y +CONFIG_EMBEDDED=y +# CONFIG_COMPAT_BRK is not set +CONFIG_ARCH_MMAP_RND_BITS=32 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16 +CONFIG_PARTITION_ADVANCED=y +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_SGI_PARTITION=y +CONFIG_SUN_PARTITION=y +CONFIG_KARMA_PARTITION=y +CONFIG_SMP=y +CONFIG_MCORE2=y +CONFIG_MAXSMP=y +CONFIG_PREEMPT=y +# CONFIG_X86_MCE is not set +CONFIG_X86_MSR=y +CONFIG_X86_CPUID=y +CONFIG_KSM=y +CONFIG_CMA=y +# CONFIG_MTRR_SANITIZER is not set +CONFIG_EFI=y +CONFIG_EFI_STUB=y +CONFIG_HZ_100=y +CONFIG_PHYSICAL_START=0x100000 +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +# CONFIG_PM_WAKELOCKS_GC is not set +CONFIG_PM_DEBUG=y +CONFIG_CPU_FREQ=y +# CONFIG_CPU_FREQ_STAT is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_PCI_MMCONFIG=y +CONFIG_PCIEPORTBUS=y +# CONFIG_PCIEASPM is not set +CONFIG_PCCARD=y +CONFIG_YENTA=y +CONFIG_HOTPLUG_PCI=y +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +CONFIG_BINFMT_MISC=y +CONFIG_IA32_EMULATION=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_XFRM_USER=y +CONFIG_NET_KEY=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_INET_ESP=y +# CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_INET_LRO is not set +# CONFIG_INET_DIAG is not set +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_NETLABEL=y +CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y 
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y +CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_QTAGUID=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_QUOTA2=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MATCH_AH=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_RAW=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NET_CLS_U32=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_CLS_ACT=y +CONFIG_CFG80211=y +CONFIG_MAC80211=y +CONFIG_MAC80211_LEDS=y +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_DMA_CMA=y +CONFIG_CONNECTOR=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_SD=y +CONFIG_BLK_DEV_SR=y +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_SG=y +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_SPI_ATTRS=y +CONFIG_SCSI_ISCSI_ATTRS=y +# CONFIG_SCSI_LOWLEVEL is not set +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_ATA_PIIX=y +CONFIG_PATA_AMD=y +CONFIG_PATA_OLDPIIX=y +CONFIG_PATA_SCH=y +CONFIG_PATA_MPIIX=y +CONFIG_ATA_GENERIC=y +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +CONFIG_BLK_DEV_DM=y +CONFIG_DM_DEBUG=y +CONFIG_DM_CRYPT=y +CONFIG_DM_MIRROR=y +CONFIG_DM_ZERO=y +CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_NETDEVICES=y +CONFIG_NETCONSOLE=y +CONFIG_TUN=y +CONFIG_VIRTIO_NET=y +CONFIG_BNX2=y +CONFIG_TIGON3=y +CONFIG_NET_TULIP=y +CONFIG_E100=y +CONFIG_E1000=y +CONFIG_E1000E=y +CONFIG_SKY2=y +CONFIG_NE2K_PCI=y +CONFIG_FORCEDETH=y +CONFIG_8139TOO=y +# CONFIG_8139TOO_PIO is not set +CONFIG_R8169=y +CONFIG_FDDI=y +CONFIG_PPP=y +CONFIG_PPP_BSDCOMP=y +CONFIG_PPP_DEFLATE=y +CONFIG_PPP_MPPE=y +CONFIG_PPPOLAC=y +CONFIG_PPPOPNS=y +CONFIG_USB_USBNET=y +CONFIG_INPUT_POLLDEV=y +# CONFIG_INPUT_MOUSEDEV_PSAUX is not set +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_KEYRESET=y +# CONFIG_KEYBOARD_ATKBD is not set +CONFIG_KEYBOARD_GOLDFISH_EVENTS=y +# CONFIG_INPUT_MOUSE is not set +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_XPAD=y +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=y +CONFIG_TABLET_USB_AIPTEK=y +CONFIG_TABLET_USB_GTCO=y +CONFIG_TABLET_USB_HANWANG=y +CONFIG_TABLET_USB_KBTAB=y +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_KEYCHORD=y +CONFIG_INPUT_UINPUT=y +CONFIG_INPUT_GPIO=y +# CONFIG_SERIO is not set +# CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is 
not set +CONFIG_SERIAL_NONSTANDARD=y +# CONFIG_DEVMEM is not set +# CONFIG_DEVKMEM is not set +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=y +CONFIG_NVRAM=y +CONFIG_I2C_I801=y +CONFIG_BATTERY_GOLDFISH=y +CONFIG_WATCHDOG=y +CONFIG_MEDIA_SUPPORT=y +CONFIG_AGP=y +CONFIG_AGP_AMD64=y +CONFIG_AGP_INTEL=y +CONFIG_DRM=y +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y +CONFIG_FB_EFI=y +CONFIG_FB_GOLDFISH=y +CONFIG_BACKLIGHT_LCD_SUPPORT=y +# CONFIG_LCD_CLASS_DEVICE is not set +CONFIG_SOUND=y +CONFIG_SND=y +CONFIG_HIDRAW=y +CONFIG_UHID=y +CONFIG_HID_A4TECH=y +CONFIG_HID_ACRUX=y +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=y +CONFIG_HID_BELKIN=y +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_CYPRESS=y +CONFIG_HID_DRAGONRISE=y +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=y +CONFIG_HID_ELECOM=y +CONFIG_HID_EZKEY=y +CONFIG_HID_HOLTEK=y +CONFIG_HID_KEYTOUCH=y +CONFIG_HID_KYE=y +CONFIG_HID_UCLOGIC=y +CONFIG_HID_WALTOP=y +CONFIG_HID_GYRATION=y +CONFIG_HID_TWINHAN=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_LCPOWER=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_NTRIG=y +CONFIG_HID_ORTEK=y +CONFIG_HID_PANTHERLORD=y +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PETALYNX=y +CONFIG_HID_PICOLCD=y +CONFIG_HID_PRIMAX=y +CONFIG_HID_ROCCAT=y +CONFIG_HID_SAITEK=y +CONFIG_HID_SAMSUNG=y +CONFIG_HID_SONY=y +CONFIG_HID_SPEEDLINK=y +CONFIG_HID_SUNPLUS=y +CONFIG_HID_GREENASIA=y +CONFIG_GREENASIA_FF=y +CONFIG_HID_SMARTJOYPLUS=y +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=y +CONFIG_HID_TOPSEED=y +CONFIG_HID_THRUSTMASTER=y +CONFIG_HID_WACOM=y +CONFIG_HID_WIIMOTE=y +CONFIG_HID_ZEROPLUS=y +CONFIG_HID_ZYDACRON=y +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y +CONFIG_USB_MON=y +CONFIG_USB_EHCI_HCD=y +# CONFIG_USB_EHCI_TT_NEWSCHED is not set +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_UHCI_HCD=y +CONFIG_USB_PRINTER=y +CONFIG_USB_STORAGE=y +CONFIG_USB_OTG_WAKELOCK=y +CONFIG_EDAC=y +CONFIG_RTC_CLASS=y +# CONFIG_RTC_HCTOSYS is not set +CONFIG_DMADEVICES=y +CONFIG_VIRTIO_PCI=y +CONFIG_STAGING=y +CONFIG_ASHMEM=y +CONFIG_ANDROID_LOW_MEMORY_KILLER=y +CONFIG_SYNC=y +CONFIG_SW_SYNC=y +CONFIG_ION=y +CONFIG_GOLDFISH_AUDIO=y +CONFIG_SND_HDA_INTEL=y +CONFIG_GOLDFISH=y +CONFIG_GOLDFISH_PIPE=y +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +CONFIG_FUSE_FS=y +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_HUGETLBFS=y +CONFIG_PSTORE=y +CONFIG_PSTORE_CONSOLE=y +CONFIG_PSTORE_RAM=y +# CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ASCII=y +CONFIG_NLS_ISO8859_1=y +CONFIG_NLS_UTF8=y +CONFIG_PRINTK_TIME=y +CONFIG_DEBUG_INFO=y +# CONFIG_ENABLE_WARN_DEPRECATED is not set +# CONFIG_ENABLE_MUST_CHECK is not set +# CONFIG_UNUSED_SYMBOLS is not set +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_MEMORY_INIT=y +CONFIG_PANIC_TIMEOUT=5 +CONFIG_SCHEDSTATS=y +CONFIG_TIMER_STATS=y +CONFIG_SCHED_TRACER=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_PROVIDE_OHCI1394_DMA_INIT=y +CONFIG_KEYS=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y +CONFIG_CRYPTO_TWOFISH=y +CONFIG_ASYMMETRIC_KEY_TYPE=y 
+CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
+CONFIG_X509_CERTIFICATE_PARSER=y
+CONFIG_PKCS7_MESSAGE_PARSER=y
+CONFIG_PKCS7_TEST_KEY=y
+# CONFIG_VIRTUALIZATION is not set
+CONFIG_CRC_T10DIF=y

From 2c8ebe3687301b2a94bb551d6d1a6570e2b407d2 Mon Sep 17 00:00:00 2001
From: Lingfeng Yang
Date: Mon, 13 Jun 2016 09:24:07 -0700
Subject: [PATCH 0553/3220] ANDROID: goldfish: Add goldfish sync driver

This is the kernel driver for controlling the Goldfish sync device on
the host. It is used to maintain ordering in critical OpenGL state
changes while using GPU emulation.

The guest open()'s the Goldfish sync device to create a context that
may maintain a sync timeline and fences. There is a 1:1 correspondence
between such sync contexts and OpenGL contexts in the guest that need
synchronization (which, in turn, is anything involving buffer swaps,
SurfaceFlinger, or Hardware Composer).

The ioctl QUEUE_WORK takes a handle to a sync object and tells the
host GPU to wait on the sync object and handle signaling it. It may
output a fence FD that the Android components using fences
(GLConsumer, SurfaceFlinger, anything employing
EGL_ANDROID_native_fence_sync) can wait on.

Design decisions and work log:
- The new approach is to have the guest issue ioctls that trigger a
  host-side wait, after which the host increments the timeline.
- The host's sync object handle and sync thread handle are the
  information needed for that.
- ioctl() calls from the guest can run concurrently with the interrupt
  handling for commands from the host.
- optimization: don't write back on timeline inc
- Change the spinlock design to be much more lightweight; do not call
  sw_sync functions or loop too long anywhere.
- Send read/write commands in batches to minimize guest/host
  transitions.
- robustness: BUG if we would overrun the cmd buffer.
- robustness: return fd -1 if we cannot get an unused fd.
- correctness: remove global mutex
- cleanup pass done, including but not limited to: removal of
  clear_upto, and switching to devm_***

This is part of a sequential, multi-CL change:

external/qemu:
https://android-review.googlesource.com/239442 <- host-side device's host interface
https://android-review.googlesource.com/221593
https://android-review.googlesource.com/248563
https://android-review.googlesource.com/248564
https://android-review.googlesource.com/223032

external/qemu-android:
https://android-review.googlesource.com/238790 <- host-side device implementation

kernel/goldfish:
https://android-review.googlesource.com/232631 <- needed
https://android-review.googlesource.com/238399 <- this CL

Also squash the following bug fixes from the android-goldfish-3.18
branch.
b44d486 goldfish_sync: provide a signal to detect reboot ad1f597 goldfish_sync: fix stalls by avoiding early kfree() de208e8 [goldfish-sync] Fix possible race between kernel and user space Change-Id: I22f8a0e824717a7e751b1b0e1b461455501502b6 --- arch/x86/configs/i386_ranchu_defconfig | 1 + arch/x86/configs/x86_64_ranchu_defconfig | 1 + drivers/staging/goldfish/Kconfig | 6 + drivers/staging/goldfish/Makefile | 5 + drivers/staging/goldfish/goldfish_sync.c | 987 +++++++++++++++++++++++ 5 files changed, 1000 insertions(+) create mode 100644 drivers/staging/goldfish/goldfish_sync.c diff --git a/arch/x86/configs/i386_ranchu_defconfig b/arch/x86/configs/i386_ranchu_defconfig index b0e4e0ed4b11..0206eb8cfb61 100644 --- a/arch/x86/configs/i386_ranchu_defconfig +++ b/arch/x86/configs/i386_ranchu_defconfig @@ -363,6 +363,7 @@ CONFIG_SYNC=y CONFIG_SW_SYNC=y CONFIG_ION=y CONFIG_GOLDFISH_AUDIO=y +CONFIG_GOLDFISH_SYNC=y CONFIG_SND_HDA_INTEL=y CONFIG_GOLDFISH=y CONFIG_GOLDFISH_PIPE=y diff --git a/arch/x86/configs/x86_64_ranchu_defconfig b/arch/x86/configs/x86_64_ranchu_defconfig index 8dae21ed3ede..dd389774bacb 100644 --- a/arch/x86/configs/x86_64_ranchu_defconfig +++ b/arch/x86/configs/x86_64_ranchu_defconfig @@ -360,6 +360,7 @@ CONFIG_SYNC=y CONFIG_SW_SYNC=y CONFIG_ION=y CONFIG_GOLDFISH_AUDIO=y +CONFIG_GOLDFISH_SYNC=y CONFIG_SND_HDA_INTEL=y CONFIG_GOLDFISH=y CONFIG_GOLDFISH_PIPE=y diff --git a/drivers/staging/goldfish/Kconfig b/drivers/staging/goldfish/Kconfig index 4e094602437c..c579141a7bed 100644 --- a/drivers/staging/goldfish/Kconfig +++ b/drivers/staging/goldfish/Kconfig @@ -4,6 +4,12 @@ config GOLDFISH_AUDIO ---help--- Emulated audio channel for the Goldfish Android Virtual Device +config GOLDFISH_SYNC + tristate "Goldfish AVD Sync Driver" + depends on GOLDFISH + ---help--- + Emulated sync fences for the Goldfish Android Virtual Device + config MTD_GOLDFISH_NAND tristate "Goldfish NAND device" depends on GOLDFISH diff --git a/drivers/staging/goldfish/Makefile b/drivers/staging/goldfish/Makefile index dec34ad58162..0cf525588210 100644 --- a/drivers/staging/goldfish/Makefile +++ b/drivers/staging/goldfish/Makefile @@ -4,3 +4,8 @@ obj-$(CONFIG_GOLDFISH_AUDIO) += goldfish_audio.o obj-$(CONFIG_MTD_GOLDFISH_NAND) += goldfish_nand.o + +# and sync + +ccflags-y := -Idrivers/staging/android +obj-$(CONFIG_GOLDFISH_SYNC) += goldfish_sync.o diff --git a/drivers/staging/goldfish/goldfish_sync.c b/drivers/staging/goldfish/goldfish_sync.c new file mode 100644 index 000000000000..ba8def29901e --- /dev/null +++ b/drivers/staging/goldfish/goldfish_sync.c @@ -0,0 +1,987 @@ +/* + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "sw_sync.h" +#include "sync.h" + +#define ERR(...) printk(KERN_ERR __VA_ARGS__); + +#define INFO(...) printk(KERN_INFO __VA_ARGS__); + +#define DPRINT(...) 
pr_debug(__VA_ARGS__);
+
+#define DTRACE() DPRINT("%s: enter", __func__)
+
+/* The Goldfish sync driver is designed to provide an interface
+ * between the underlying host's sync device and the kernel's
+ * sw_sync.
+ * The purpose of the device/driver is to enable lightweight
+ * creation and signaling of timelines and fences
+ * in order to synchronize the guest with host-side graphics events.
+ *
+ * Each time the interrupt trips, the driver
+ * may perform a sw_sync operation.
+ */
+
+/* The operations are: */
+
+/* Ready signal - used to mark when irq should lower */
+#define CMD_SYNC_READY            0
+
+/* Create a new timeline. writes timeline handle */
+#define CMD_CREATE_SYNC_TIMELINE  1
+
+/* Create a fence object. reads timeline handle and time argument.
+ * Writes fence fd to the SYNC_REG_HANDLE register. */
+#define CMD_CREATE_SYNC_FENCE     2
+
+/* Increments timeline. reads timeline handle and time argument */
+#define CMD_SYNC_TIMELINE_INC     3
+
+/* Destroys a timeline. reads timeline handle */
+#define CMD_DESTROY_SYNC_TIMELINE 4
+
+/* Starts a wait on the host with
+ * the given glsync object and sync thread handle. */
+#define CMD_TRIGGER_HOST_WAIT     5
+
+/* The register layout is: */
+
+#define SYNC_REG_BATCH_COMMAND                0x00 /* host->guest batch commands */
+#define SYNC_REG_BATCH_GUESTCOMMAND           0x04 /* guest->host batch commands */
+#define SYNC_REG_BATCH_COMMAND_ADDR           0x08 /* communicate physical address of host->guest batch commands */
+#define SYNC_REG_BATCH_COMMAND_ADDR_HIGH      0x0c /* 64-bit part */
+#define SYNC_REG_BATCH_GUESTCOMMAND_ADDR      0x10 /* communicate physical address of guest->host commands */
+#define SYNC_REG_BATCH_GUESTCOMMAND_ADDR_HIGH 0x14 /* 64-bit part */
+#define SYNC_REG_INIT                         0x18 /* signals that the device has been probed */
+
+/* There is an ioctl associated with the goldfish sync driver.
+ * Make it conflict with ioctls that are not likely to be used
+ * in the emulator.
+ *
+ * '@'	00-0F	linux/radeonfb.h	conflict!
+ * '@'	00-0F	drivers/video/aty/aty128fb.c	conflict!
+ */
+#define GOLDFISH_SYNC_IOC_MAGIC	'@'
+
+#define GOLDFISH_SYNC_IOC_QUEUE_WORK	_IOWR(GOLDFISH_SYNC_IOC_MAGIC, 0, struct goldfish_sync_ioctl_info)
+
+/* The above definitions (command codes, register layout, ioctl definitions)
+ * need to be in sync with the following files:
+ *
+ * Host-side (emulator):
+ * external/qemu/android/emulation/goldfish_sync.h
+ * external/qemu-android/hw/misc/goldfish_sync.c
+ *
+ * Guest-side (system image):
+ * device/generic/goldfish-opengl/system/egl/goldfish_sync.h
+ * device/generic/goldfish/ueventd.ranchu.rc
+ * platform/build/target/board/generic/sepolicy/file_contexts
+ */
+struct goldfish_sync_hostcmd {
+	/* sorted for alignment */
+	uint64_t handle;
+	uint64_t hostcmd_handle;
+	uint32_t cmd;
+	uint32_t time_arg;
+};
+
+struct goldfish_sync_guestcmd {
+	uint64_t host_command; /* uint64_t for alignment */
+	uint64_t glsync_handle;
+	uint64_t thread_handle;
+	uint64_t guest_timeline_handle;
+};
+
+#define GOLDFISH_SYNC_MAX_CMDS 64
+
+struct goldfish_sync_state {
+	char __iomem *reg_base;
+	int irq;
+
+	/* Spinlock protects |to_do| / |to_do_end|. */
+	spinlock_t lock;
+	/* |mutex_lock| protects all concurrent access
+	 * to timelines for both kernel and user space. */
+	struct mutex mutex_lock;
+
+	/* Buffer holding commands issued from host. */
+	struct goldfish_sync_hostcmd to_do[GOLDFISH_SYNC_MAX_CMDS];
+	uint32_t to_do_end;
+
+	/* Addresses for the reading or writing
+	 * of individual commands.
+	 * The host can directly write
+	 * to |batch_hostcmd| (and then this driver immediately
+	 * copies contents to |to_do|). This driver either replies
+	 * through |batch_hostcmd| or simply issues a
+	 * guest->host command through |batch_guestcmd|.
+	 */
+	struct goldfish_sync_hostcmd *batch_hostcmd;
+	struct goldfish_sync_guestcmd *batch_guestcmd;
+
+	/* Used to give this struct itself to a work queue
+	 * function for executing actual sync commands. */
+	struct work_struct work_item;
+};
+
+static struct goldfish_sync_state global_sync_state[1];
+
+struct goldfish_sync_timeline_obj {
+	struct sw_sync_timeline *sw_sync_tl;
+	uint32_t current_time;
+	/* We need to be careful about when we deallocate
+	 * this |goldfish_sync_timeline_obj| struct.
+	 * In order to ensure proper cleanup, we need to
+	 * consider the triggered host-side wait that may
+	 * still be in flight when the guest close()'s a
+	 * goldfish_sync device's sync context fd (and
+	 * destroys the |sw_sync_tl| field above).
+	 * The host-side wait may raise IRQ
+	 * and tell the kernel to increment the timeline _after_
+	 * the |sw_sync_tl| has already been set to null.
+	 *
+	 * From observations on OpenGL apps and CTS tests, this
+	 * happens at some very low probability upon context
+	 * destruction or process close, but it does happen
+	 * and it needs to be handled properly. Otherwise,
+	 * if we clean up the surrounding |goldfish_sync_timeline_obj|
+	 * too early, any |handle| field of any host->guest command
+	 * might not even point to a null |sw_sync_tl| field,
+	 * but to garbage memory or even a reclaimed |sw_sync_tl|.
+	 * If we do not count such "pending waits" and kfree the object
+	 * immediately upon |goldfish_sync_timeline_destroy|,
+	 * we might get mysterious RCU stalls after running a long
+	 * time because the garbage memory that is being read
+	 * happens to be interpretable as a |spinlock_t| struct
+	 * that is currently in the locked state.
+	 *
+	 * To track when to free the |goldfish_sync_timeline_obj|
+	 * itself, we maintain a kref.
+	 * The kref essentially counts the timeline itself plus
+	 * the number of waits in flight. kref_init/kref_put
+	 * are issued on
+	 * |goldfish_sync_timeline_create|/|goldfish_sync_timeline_destroy|
+	 * and kref_get/kref_put are issued on
+	 * |goldfish_sync_fence_create|/|goldfish_sync_timeline_inc|.
+	 *
+	 * The timeline is destroyed after the reference count
+	 * reaches zero, which would happen after
+	 * |goldfish_sync_timeline_destroy| and all pending
+	 * |goldfish_sync_timeline_inc|'s are fulfilled.
+	 *
+	 * NOTE (1): We assume that |fence_create| and
+	 * |timeline_inc| calls are 1:1, otherwise the kref scheme
+	 * will not work. This is a valid assumption as long
+	 * as the host-side virtual device implementation
+	 * does not insert any timeline increments
+	 * that we did not trigger from here.
+	 *
+	 * NOTE (2): The use of kref by itself requires no locks,
+	 * but this does not mean everything works without locks.
+	 * Related timeline operations do require a lock of some sort,
+	 * or at least are not proven to work without it.
+	 * In particular, we assume that all the operations
+	 * done on the |kref| field above are done in contexts where
+	 * |global_sync_state->mutex_lock| is held. Do not
+	 * remove that lock until everything is proven to work
+	 * without it!!! */
+	struct kref kref;
+};
+
+/* We will call |delete_timeline_obj| when the last reference count
+ * of the kref is decremented. This deletes the sw_sync
+ * timeline object along with the wrapper itself.
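+ *
+ * For reference, a condensed sketch of the kref call pattern described
+ * above (taken from the functions below; not additional code in this
+ * patch):
+ *
+ *   kref_init(&obj->kref);       // goldfish_sync_timeline_create()
+ *   kref_get(&obj->kref);        // goldfish_sync_fence_create(): wait now in flight
+ *   kref_put(&obj->kref, delete_timeline_obj);  // goldfish_sync_timeline_inc()
+ *   kref_put(&obj->kref, delete_timeline_obj);  // goldfish_sync_timeline_destroy()
+ *
+ * Only the final kref_put actually frees the object, so the relative
+ * order of destroy vs. pending increments does not matter.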
*/ +static void delete_timeline_obj(struct kref* kref) { + struct goldfish_sync_timeline_obj* obj = + container_of(kref, struct goldfish_sync_timeline_obj, kref); + + sync_timeline_destroy(&obj->sw_sync_tl->obj); + obj->sw_sync_tl = NULL; + kfree(obj); +} + +static uint64_t gensym_ctr; +static void gensym(char *dst) +{ + sprintf(dst, "goldfish_sync:gensym:%llu", gensym_ctr); + gensym_ctr++; +} + +/* |goldfish_sync_timeline_create| assumes that |global_sync_state->mutex_lock| + * is held. */ +static struct goldfish_sync_timeline_obj* +goldfish_sync_timeline_create(void) +{ + + char timeline_name[256]; + struct sw_sync_timeline *res_sync_tl = NULL; + struct goldfish_sync_timeline_obj *res; + + DTRACE(); + + gensym(timeline_name); + + res_sync_tl = sw_sync_timeline_create(timeline_name); + if (!res_sync_tl) { + ERR("Failed to create sw_sync timeline."); + return NULL; + } + + res = kzalloc(sizeof(struct goldfish_sync_timeline_obj), GFP_KERNEL); + res->sw_sync_tl = res_sync_tl; + res->current_time = 0; + kref_init(&res->kref); + + DPRINT("new timeline_obj=0x%p", res); + return res; +} + +/* |goldfish_sync_fence_create| assumes that |global_sync_state->mutex_lock| + * is held. */ +static int +goldfish_sync_fence_create(struct goldfish_sync_timeline_obj *obj, + uint32_t val) +{ + + int fd; + char fence_name[256]; + struct sync_pt *syncpt = NULL; + struct sync_fence *sync_obj = NULL; + struct sw_sync_timeline *tl; + + DTRACE(); + + if (!obj) return -1; + + tl = obj->sw_sync_tl; + + syncpt = sw_sync_pt_create(tl, val); + if (!syncpt) { + ERR("could not create sync point! " + "sync_timeline=0x%p val=%d", + tl, val); + return -1; + } + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) { + ERR("could not get unused fd for sync fence. " + "errno=%d", fd); + goto err_cleanup_pt; + } + + gensym(fence_name); + + sync_obj = sync_fence_create(fence_name, syncpt); + if (!sync_obj) { + ERR("could not create sync fence! " + "sync_timeline=0x%p val=%d sync_pt=0x%p", + tl, val, syncpt); + goto err_cleanup_fd_pt; + } + + DPRINT("installing sync fence into fd %d sync_obj=0x%p", fd, sync_obj); + sync_fence_install(sync_obj, fd); + kref_get(&obj->kref); + + return fd; + +err_cleanup_fd_pt: + put_unused_fd(fd); +err_cleanup_pt: + sync_pt_free(syncpt); + return -1; +} + +/* |goldfish_sync_timeline_inc| assumes that |global_sync_state->mutex_lock| + * is held. */ +static void +goldfish_sync_timeline_inc(struct goldfish_sync_timeline_obj *obj, uint32_t inc) +{ + DTRACE(); + /* Just give up if someone else nuked the timeline. + * Whoever it was won't care that it doesn't get signaled. */ + if (!obj) return; + + DPRINT("timeline_obj=0x%p", obj); + sw_sync_timeline_inc(obj->sw_sync_tl, inc); + DPRINT("incremented timeline. increment max_time"); + obj->current_time += inc; + + /* Here, we will end up deleting the timeline object if it + * turns out that this call was a pending increment after + * |goldfish_sync_timeline_destroy| was called. */ + kref_put(&obj->kref, delete_timeline_obj); + DPRINT("done"); +} + +/* |goldfish_sync_timeline_destroy| assumes + * that |global_sync_state->mutex_lock| is held. 
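+ *
+ * A typical caller therefore looks like this (the pattern used by
+ * goldfish_sync_release() further below):
+ *
+ *   mutex_lock(&global_sync_state->mutex_lock);
+ *   goldfish_sync_timeline_destroy(sync_context->timeline);
+ *   mutex_unlock(&global_sync_state->mutex_lock);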
*/ +static void +goldfish_sync_timeline_destroy(struct goldfish_sync_timeline_obj *obj) +{ + DTRACE(); + /* See description of |goldfish_sync_timeline_obj| for why we + * should not immediately destroy |obj| */ + kref_put(&obj->kref, delete_timeline_obj); +} + +static inline void +goldfish_sync_cmd_queue(struct goldfish_sync_state *sync_state, + uint32_t cmd, + uint64_t handle, + uint32_t time_arg, + uint64_t hostcmd_handle) +{ + struct goldfish_sync_hostcmd *to_add; + + DTRACE(); + + BUG_ON(sync_state->to_do_end == GOLDFISH_SYNC_MAX_CMDS); + + to_add = &sync_state->to_do[sync_state->to_do_end]; + + to_add->cmd = cmd; + to_add->handle = handle; + to_add->time_arg = time_arg; + to_add->hostcmd_handle = hostcmd_handle; + + sync_state->to_do_end += 1; +} + +static inline void +goldfish_sync_hostcmd_reply(struct goldfish_sync_state *sync_state, + uint32_t cmd, + uint64_t handle, + uint32_t time_arg, + uint64_t hostcmd_handle) +{ + unsigned long irq_flags; + struct goldfish_sync_hostcmd *batch_hostcmd = + sync_state->batch_hostcmd; + + DTRACE(); + + spin_lock_irqsave(&sync_state->lock, irq_flags); + + batch_hostcmd->cmd = cmd; + batch_hostcmd->handle = handle; + batch_hostcmd->time_arg = time_arg; + batch_hostcmd->hostcmd_handle = hostcmd_handle; + writel(0, sync_state->reg_base + SYNC_REG_BATCH_COMMAND); + + spin_unlock_irqrestore(&sync_state->lock, irq_flags); +} + +static inline void +goldfish_sync_send_guestcmd(struct goldfish_sync_state *sync_state, + uint32_t cmd, + uint64_t glsync_handle, + uint64_t thread_handle, + uint64_t timeline_handle) +{ + unsigned long irq_flags; + struct goldfish_sync_guestcmd *batch_guestcmd = + sync_state->batch_guestcmd; + + DTRACE(); + + spin_lock_irqsave(&sync_state->lock, irq_flags); + + batch_guestcmd->host_command = (uint64_t)cmd; + batch_guestcmd->glsync_handle = (uint64_t)glsync_handle; + batch_guestcmd->thread_handle = (uint64_t)thread_handle; + batch_guestcmd->guest_timeline_handle = (uint64_t)timeline_handle; + writel(0, sync_state->reg_base + SYNC_REG_BATCH_GUESTCOMMAND); + + spin_unlock_irqrestore(&sync_state->lock, irq_flags); +} + +/* |goldfish_sync_interrupt| handles IRQ raises from the virtual device. + * In the context of OpenGL, this interrupt will fire whenever we need + * to signal a fence fd in the guest, with the command + * |CMD_SYNC_TIMELINE_INC|. + * However, because this function will be called in an interrupt context, + * it is necessary to do the actual work of signaling off of interrupt context. + * The shared work queue is used for this purpose. At the end when + * all pending commands are intercepted by the interrupt handler, + * we call |schedule_work|, which will later run the actual + * desired sync command in |goldfish_sync_work_item_fn|. 
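+ *
+ * In outline, the top half below reduces to the following (a
+ * simplified sketch with the register reads elided):
+ *
+ *   spin_lock(&sync_state->lock);
+ *   while (batch_hostcmd->cmd != 0)   // refreshed via SYNC_REG_BATCH_COMMAND
+ *           goldfish_sync_cmd_queue(sync_state, ...);
+ *   spin_unlock(&sync_state->lock);
+ *   schedule_work(&sync_state->work_item);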
+ */ +static irqreturn_t goldfish_sync_interrupt(int irq, void *dev_id) +{ + + struct goldfish_sync_state *sync_state = dev_id; + + uint32_t nextcmd; + uint32_t command_r; + uint64_t handle_rw; + uint32_t time_r; + uint64_t hostcmd_handle_rw; + + int count = 0; + + DTRACE(); + + sync_state = dev_id; + + spin_lock(&sync_state->lock); + + for (;;) { + + readl(sync_state->reg_base + SYNC_REG_BATCH_COMMAND); + nextcmd = sync_state->batch_hostcmd->cmd; + + if (nextcmd == 0) + break; + + command_r = nextcmd; + handle_rw = sync_state->batch_hostcmd->handle; + time_r = sync_state->batch_hostcmd->time_arg; + hostcmd_handle_rw = sync_state->batch_hostcmd->hostcmd_handle; + + goldfish_sync_cmd_queue( + sync_state, + command_r, + handle_rw, + time_r, + hostcmd_handle_rw); + + count++; + } + + spin_unlock(&sync_state->lock); + + schedule_work(&sync_state->work_item); + + return (count == 0) ? IRQ_NONE : IRQ_HANDLED; +} + +/* |goldfish_sync_work_item_fn| does the actual work of servicing + * host->guest sync commands. This function is triggered whenever + * the IRQ for the goldfish sync device is raised. Once it starts + * running, it grabs the contents of the buffer containing the + * commands it needs to execute (there may be multiple, because + * our IRQ is active high and not edge triggered), and then + * runs all of them one after the other. + */ +static void goldfish_sync_work_item_fn(struct work_struct *input) +{ + + struct goldfish_sync_state *sync_state; + int sync_fence_fd; + + struct goldfish_sync_timeline_obj *timeline; + uint64_t timeline_ptr; + + uint64_t hostcmd_handle; + + uint32_t cmd; + uint64_t handle; + uint32_t time_arg; + + struct goldfish_sync_hostcmd *todo; + uint32_t todo_end; + + unsigned long irq_flags; + + struct goldfish_sync_hostcmd to_run[GOLDFISH_SYNC_MAX_CMDS]; + uint32_t i = 0; + + sync_state = container_of(input, struct goldfish_sync_state, work_item); + + mutex_lock(&sync_state->mutex_lock); + + spin_lock_irqsave(&sync_state->lock, irq_flags); { + + todo_end = sync_state->to_do_end; + + DPRINT("num sync todos: %u", sync_state->to_do_end); + + for (i = 0; i < todo_end; i++) + to_run[i] = sync_state->to_do[i]; + + /* We expect that commands will come in at a slow enough rate + * so that incoming items will not be more than + * GOLDFISH_SYNC_MAX_CMDS. + * + * This is because the way the sync device is used, + * it's only for managing buffer data transfers per frame, + * with a sequential dependency between putting things in + * to_do and taking them out. Once a set of commands is + * queued up in to_do, the user of the device waits for + * them to be processed before queuing additional commands, + * which limits the rate at which commands come in + * to the rate at which we take them out here. + * + * We also don't expect more than MAX_CMDS to be issued + * at once; there is a correspondence between + * which buffers need swapping to the (display / buffer queue) + * to particular commands, and we don't expect there to be + * enough display or buffer queues in operation at once + * to overrun GOLDFISH_SYNC_MAX_CMDS. 
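+ *
+ * As a rough illustrative bound (an assumption about the workload, not
+ * a measurement): even with several buffer queues each posting a few
+ * commands per frame, a batch stays in the tens of commands at most,
+ * well under GOLDFISH_SYNC_MAX_CMDS (64).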
+ */ + sync_state->to_do_end = 0; + + } spin_unlock_irqrestore(&sync_state->lock, irq_flags); + + for (i = 0; i < todo_end; i++) { + DPRINT("todo index: %u", i); + + todo = &to_run[i]; + + cmd = todo->cmd; + + handle = (uint64_t)todo->handle; + time_arg = todo->time_arg; + hostcmd_handle = (uint64_t)todo->hostcmd_handle; + + DTRACE(); + + timeline = (struct goldfish_sync_timeline_obj *)(uintptr_t)handle; + + switch (cmd) { + case CMD_SYNC_READY: + break; + case CMD_CREATE_SYNC_TIMELINE: + DPRINT("exec CMD_CREATE_SYNC_TIMELINE: " + "handle=0x%llx time_arg=%d", + handle, time_arg); + timeline = goldfish_sync_timeline_create(); + timeline_ptr = (uintptr_t)timeline; + goldfish_sync_hostcmd_reply(sync_state, CMD_CREATE_SYNC_TIMELINE, + timeline_ptr, + 0, + hostcmd_handle); + DPRINT("sync timeline created: %p", timeline); + break; + case CMD_CREATE_SYNC_FENCE: + DPRINT("exec CMD_CREATE_SYNC_FENCE: " + "handle=0x%llx time_arg=%d", + handle, time_arg); + sync_fence_fd = goldfish_sync_fence_create(timeline, time_arg); + goldfish_sync_hostcmd_reply(sync_state, CMD_CREATE_SYNC_FENCE, + sync_fence_fd, + 0, + hostcmd_handle); + break; + case CMD_SYNC_TIMELINE_INC: + DPRINT("exec CMD_SYNC_TIMELINE_INC: " + "handle=0x%llx time_arg=%d", + handle, time_arg); + goldfish_sync_timeline_inc(timeline, time_arg); + break; + case CMD_DESTROY_SYNC_TIMELINE: + DPRINT("exec CMD_DESTROY_SYNC_TIMELINE: " + "handle=0x%llx time_arg=%d", + handle, time_arg); + goldfish_sync_timeline_destroy(timeline); + break; + } + DPRINT("Done executing sync command"); + } + mutex_unlock(&sync_state->mutex_lock); +} + +/* Guest-side interface: file operations */ + +/* Goldfish sync context and ioctl info. + * + * When a sync context is created by open()-ing the goldfish sync device, we + * create a sync context (|goldfish_sync_context|). + * + * Currently, the only data required to track is the sync timeline itself + * along with the current time, which are all packed up in the + * |goldfish_sync_timeline_obj| field. We use a |goldfish_sync_context| + * as the filp->private_data. + * + * Next, when a sync context user requests that work be queued and a fence + * fd provided, we use the |goldfish_sync_ioctl_info| struct, which holds + * information about which host handles to touch for this particular + * queue-work operation. We need to know about the host-side sync thread + * and the particular host-side GLsync object. We also possibly write out + * a file descriptor. 
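+ *
+ * A hypothetical user-space caller looks like this (the handle values
+ * are illustrative; they come from the host-side GL implementation):
+ *
+ *   struct goldfish_sync_ioctl_info info = {
+ *           .host_glsync_handle_in = glsync_handle,
+ *           .host_syncthread_handle_in = syncthread_handle,
+ *   };
+ *   if (ioctl(goldfish_sync_fd, GOLDFISH_SYNC_IOC_QUEUE_WORK, &info) == 0)
+ *           ... wait on info.fence_fd_out ...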
+ */ +struct goldfish_sync_context { + struct goldfish_sync_timeline_obj *timeline; +}; + +struct goldfish_sync_ioctl_info { + uint64_t host_glsync_handle_in; + uint64_t host_syncthread_handle_in; + int fence_fd_out; +}; + +static int goldfish_sync_open(struct inode *inode, struct file *file) +{ + + struct goldfish_sync_context *sync_context; + + DTRACE(); + + mutex_lock(&global_sync_state->mutex_lock); + + sync_context = kzalloc(sizeof(struct goldfish_sync_context), GFP_KERNEL); + + if (sync_context == NULL) { + ERR("Creation of goldfish sync context failed!"); + mutex_unlock(&global_sync_state->mutex_lock); + return -ENOMEM; + } + + sync_context->timeline = NULL; + + file->private_data = sync_context; + + DPRINT("successfully create a sync context @0x%p", sync_context); + + mutex_unlock(&global_sync_state->mutex_lock); + + return 0; +} + +static int goldfish_sync_release(struct inode *inode, struct file *file) +{ + + struct goldfish_sync_context *sync_context; + + DTRACE(); + + mutex_lock(&global_sync_state->mutex_lock); + + sync_context = file->private_data; + + if (sync_context->timeline) + goldfish_sync_timeline_destroy(sync_context->timeline); + + sync_context->timeline = NULL; + + kfree(sync_context); + + mutex_unlock(&global_sync_state->mutex_lock); + + return 0; +} + +/* |goldfish_sync_ioctl| is the guest-facing interface of goldfish sync + * and is used in conjunction with eglCreateSyncKHR to queue up the + * actual work of waiting for the EGL sync command to complete, + * possibly returning a fence fd to the guest. + */ +static long goldfish_sync_ioctl(struct file *file, + unsigned int cmd, + unsigned long arg) +{ + struct goldfish_sync_context *sync_context_data; + struct goldfish_sync_timeline_obj *timeline; + int fd_out; + struct goldfish_sync_ioctl_info ioctl_data; + + DTRACE(); + + sync_context_data = file->private_data; + fd_out = -1; + + switch (cmd) { + case GOLDFISH_SYNC_IOC_QUEUE_WORK: + + DPRINT("exec GOLDFISH_SYNC_IOC_QUEUE_WORK"); + + mutex_lock(&global_sync_state->mutex_lock); + + if (copy_from_user(&ioctl_data, + (void __user *)arg, + sizeof(ioctl_data))) { + ERR("Failed to copy memory for ioctl_data from user."); + mutex_unlock(&global_sync_state->mutex_lock); + return -EFAULT; + } + + if (ioctl_data.host_syncthread_handle_in == 0) { + DPRINT("Error: zero host syncthread handle!!!"); + mutex_unlock(&global_sync_state->mutex_lock); + return -EFAULT; + } + + if (!sync_context_data->timeline) { + DPRINT("no timeline yet, create one."); + sync_context_data->timeline = goldfish_sync_timeline_create(); + DPRINT("timeline: 0x%p", &sync_context_data->timeline); + } + + timeline = sync_context_data->timeline; + fd_out = goldfish_sync_fence_create(timeline, + timeline->current_time + 1); + DPRINT("Created fence with fd %d and current time %u (timeline: 0x%p)", + fd_out, + sync_context_data->timeline->current_time + 1, + sync_context_data->timeline); + + ioctl_data.fence_fd_out = fd_out; + + if (copy_to_user((void __user *)arg, + &ioctl_data, + sizeof(ioctl_data))) { + DPRINT("Error, could not copy to user!!!"); + + sys_close(fd_out); + /* We won't be doing an increment, kref_put immediately. */ + kref_put(&timeline->kref, delete_timeline_obj); + mutex_unlock(&global_sync_state->mutex_lock); + return -EFAULT; + } + + /* We are now about to trigger a host-side wait; + * accumulate on |pending_waits|. 
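+	 * The timeline pointer is passed to the host as an opaque
+	 * 64-bit handle; the host echoes it back in the eventual
+	 * CMD_SYNC_TIMELINE_INC, which is how the increment finds
+	 * its timeline again in goldfish_sync_work_item_fn().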
*/ + goldfish_sync_send_guestcmd(global_sync_state, + CMD_TRIGGER_HOST_WAIT, + ioctl_data.host_glsync_handle_in, + ioctl_data.host_syncthread_handle_in, + (uint64_t)(uintptr_t)(sync_context_data->timeline)); + + mutex_unlock(&global_sync_state->mutex_lock); + return 0; + default: + return -ENOTTY; + } +} + +static const struct file_operations goldfish_sync_fops = { + .owner = THIS_MODULE, + .open = goldfish_sync_open, + .release = goldfish_sync_release, + .unlocked_ioctl = goldfish_sync_ioctl, + .compat_ioctl = goldfish_sync_ioctl, +}; + +static struct miscdevice goldfish_sync_device = { + .name = "goldfish_sync", + .fops = &goldfish_sync_fops, +}; + + +static bool setup_verify_batch_cmd_addr(struct goldfish_sync_state *sync_state, + void *batch_addr, + uint32_t addr_offset, + uint32_t addr_offset_high) +{ + uint64_t batch_addr_phys; + uint32_t batch_addr_phys_test_lo; + uint32_t batch_addr_phys_test_hi; + + if (!batch_addr) { + ERR("Could not use batch command address!"); + return false; + } + + batch_addr_phys = virt_to_phys(batch_addr); + writel((uint32_t)(batch_addr_phys), + sync_state->reg_base + addr_offset); + writel((uint32_t)(batch_addr_phys >> 32), + sync_state->reg_base + addr_offset_high); + + batch_addr_phys_test_lo = + readl(sync_state->reg_base + addr_offset); + batch_addr_phys_test_hi = + readl(sync_state->reg_base + addr_offset_high); + + if (virt_to_phys(batch_addr) != + (((uint64_t)batch_addr_phys_test_hi << 32) | + batch_addr_phys_test_lo)) { + ERR("Invalid batch command address!"); + return false; + } + + return true; +} + +int goldfish_sync_probe(struct platform_device *pdev) +{ + struct resource *ioresource; + struct goldfish_sync_state *sync_state = global_sync_state; + int status; + + DTRACE(); + + sync_state->to_do_end = 0; + + spin_lock_init(&sync_state->lock); + mutex_init(&sync_state->mutex_lock); + + platform_set_drvdata(pdev, sync_state); + + ioresource = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (ioresource == NULL) { + ERR("platform_get_resource failed"); + return -ENODEV; + } + + sync_state->reg_base = devm_ioremap(&pdev->dev, ioresource->start, PAGE_SIZE); + if (sync_state->reg_base == NULL) { + ERR("Could not ioremap"); + return -ENOMEM; + } + + sync_state->irq = platform_get_irq(pdev, 0); + if (sync_state->irq < 0) { + ERR("Could not platform_get_irq"); + return -ENODEV; + } + + status = devm_request_irq(&pdev->dev, + sync_state->irq, + goldfish_sync_interrupt, + IRQF_SHARED, + pdev->name, + sync_state); + if (status) { + ERR("request_irq failed"); + return -ENODEV; + } + + INIT_WORK(&sync_state->work_item, + goldfish_sync_work_item_fn); + + misc_register(&goldfish_sync_device); + + /* Obtain addresses for batch send/recv of commands. 
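+	 * The 64-bit physical address of each buffer is split across
+	 * the _ADDR/_ADDR_HIGH register pair, and
+	 * setup_verify_batch_cmd_addr() above reads both halves back
+	 * to confirm the device latched the same address we wrote.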
*/ + { + struct goldfish_sync_hostcmd *batch_addr_hostcmd; + struct goldfish_sync_guestcmd *batch_addr_guestcmd; + + batch_addr_hostcmd = devm_kzalloc(&pdev->dev, sizeof(struct goldfish_sync_hostcmd), + GFP_KERNEL); + batch_addr_guestcmd = devm_kzalloc(&pdev->dev, sizeof(struct goldfish_sync_guestcmd), + GFP_KERNEL); + + if (!setup_verify_batch_cmd_addr(sync_state, + batch_addr_hostcmd, + SYNC_REG_BATCH_COMMAND_ADDR, + SYNC_REG_BATCH_COMMAND_ADDR_HIGH)) { + ERR("goldfish_sync: Could not setup batch command address"); + return -ENODEV; + } + + if (!setup_verify_batch_cmd_addr(sync_state, + batch_addr_guestcmd, + SYNC_REG_BATCH_GUESTCOMMAND_ADDR, + SYNC_REG_BATCH_GUESTCOMMAND_ADDR_HIGH)) { + ERR("goldfish_sync: Could not setup batch guest command address"); + return -ENODEV; + } + + sync_state->batch_hostcmd = batch_addr_hostcmd; + sync_state->batch_guestcmd = batch_addr_guestcmd; + } + + INFO("goldfish_sync: Initialized goldfish sync device"); + + writel(0, sync_state->reg_base + SYNC_REG_INIT); + + return 0; +} + +static int goldfish_sync_remove(struct platform_device *pdev) +{ + struct goldfish_sync_state *sync_state = global_sync_state; + + DTRACE(); + + misc_deregister(&goldfish_sync_device); + memset(sync_state, 0, sizeof(struct goldfish_sync_state)); + return 0; +} + +static const struct of_device_id goldfish_sync_of_match[] = { + { .compatible = "google,goldfish-sync", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_sync_of_match); + +static const struct acpi_device_id goldfish_sync_acpi_match[] = { + { "GFSH0006", 0 }, + { }, +}; + +MODULE_DEVICE_TABLE(acpi, goldfish_sync_acpi_match); + +static struct platform_driver goldfish_sync = { + .probe = goldfish_sync_probe, + .remove = goldfish_sync_remove, + .driver = { + .name = "goldfish_sync", + .of_match_table = goldfish_sync_of_match, + .acpi_match_table = ACPI_PTR(goldfish_sync_acpi_match), + } +}; + +module_platform_driver(goldfish_sync); + +MODULE_AUTHOR("Google, Inc."); +MODULE_DESCRIPTION("Android QEMU Sync Driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0"); + +/* This function is only to run a basic test of sync framework. + * It creates a timeline and fence object whose signal point is at 1. + * The timeline is incremented, and we use the sync framework's + * sync_fence_wait on that fence object. If everything works out, + * we should not hang in the wait and return immediately. + * There is no way to explicitly run this test yet, but it + * can be used by inserting it at the end of goldfish_sync_probe. + */ +void test_kernel_sync(void) +{ + struct goldfish_sync_timeline_obj *test_timeline; + int test_fence_fd; + + DTRACE(); + + DPRINT("test sw_sync"); + + test_timeline = goldfish_sync_timeline_create(); + DPRINT("sw_sync_timeline_create -> 0x%p", test_timeline); + + test_fence_fd = goldfish_sync_fence_create(test_timeline, 1); + DPRINT("sync_fence_create -> %d", test_fence_fd); + + DPRINT("incrementing test timeline"); + goldfish_sync_timeline_inc(test_timeline, 1); + + DPRINT("test waiting (should NOT hang)"); + sync_fence_wait( + sync_fence_fdget(test_fence_fd), -1); + + DPRINT("test waiting (afterward)"); +} From e96f35d6073717452d9bc484dad0c3cfb61f5c46 Mon Sep 17 00:00:00 2001 From: Yurii Zubrytskyi Date: Wed, 4 May 2016 13:05:38 -0700 Subject: [PATCH 0554/3220] ANDROID: goldfish_pipe: bugfixes and performance improvements. 
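In outline, the performance half of this change batches page pinning in
goldfish_pipe_read_write(): rather than pinning one page per iteration
with get_user_pages(), it pins up to MAX_PAGES_TO_GRAB pages with
get_user_pages_fast() and coalesces the physically contiguous prefix
into one transfer. A simplified sketch of the new loop (condensed from
the diff below; variable names as in the patch):

	ret = get_user_pages_fast(first_page, requested_pages, !is_write, pages);
	xaddr_prev = page_to_phys(pages[0]) | (address & ~PAGE_MASK);
	for (page_i = 1; page_i < ret; page_i++) {
		xaddr_i = page_to_phys(pages[page_i]) | (address & ~PAGE_MASK);
		if (xaddr_i != xaddr_prev + PAGE_SIZE)
			break;		/* discontinuous; stop coalescing */
		page_end += PAGE_SIZE;
		xaddr_prev = xaddr_i;
	}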
Combine following patches from android-goldfish-3.18 branch: c0f015a [pipe] Fix the pipe driver for x64 platform + correct pages count 48e6bf5 [pipe] Use get_use_pages_fast() which is possibly faster fb20f13 [goldfish] More pages in goldfish pipe f180e6d goldfish_pipe: Return from read_write on signal and EIO 3dec3b7 [pipe] Fix a minor leak in setup_access_params_addr() Change-Id: I1041fd65d7faaec123e6cedd3dbbc5a2fbb86c4d --- drivers/platform/goldfish/goldfish_pipe.c | 123 ++++++++++++++-------- 1 file changed, 77 insertions(+), 46 deletions(-) diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index 3215a33cf4fe..55929ed58ea3 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -109,6 +109,16 @@ #define PIPE_WAKE_READ (1 << 1) /* pipe can now be read from */ #define PIPE_WAKE_WRITE (1 << 2) /* pipe can now be written to */ +#define MAX_PAGES_TO_GRAB 32 + +#define DEBUG 0 + +#if DEBUG +#define DPRINT(...) { printk(KERN_ERR __VA_ARGS__); } +#else +#define DPRINT(...) +#endif + struct access_params { unsigned long channel; u32 size; @@ -231,8 +241,10 @@ static int setup_access_params_addr(struct platform_device *pdev, if (valid_batchbuffer_addr(dev, aps)) { dev->aps = aps; return 0; - } else + } else { + devm_kfree(&pdev->dev, aps); return -1; + } } /* A value that will not be set by qemu emulator */ @@ -269,6 +281,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, struct goldfish_pipe *pipe = filp->private_data; struct goldfish_pipe_dev *dev = pipe->dev; unsigned long address, address_end; + struct page* pages[MAX_PAGES_TO_GRAB] = {}; int count = 0, ret = -EINVAL; /* If the emulator already closed the pipe, no need to go further */ @@ -292,45 +305,59 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, address_end = address + bufflen; while (address < address_end) { - unsigned long page_end = (address & PAGE_MASK) + PAGE_SIZE; - unsigned long next = page_end < address_end ? page_end - : address_end; - unsigned long avail = next - address; - int status, wakeBit; - - struct page *page; - - /* Either vaddr or paddr depending on the device version */ - unsigned long xaddr; + unsigned long page_end = (address & PAGE_MASK) + PAGE_SIZE; + unsigned long next, avail; + int status, wakeBit, page_i, num_contiguous_pages; + long first_page, last_page, requested_pages; + unsigned long xaddr, xaddr_prev, xaddr_i; /* - * We grab the pages on a page-by-page basis in case user - * space gives us a potentially huge buffer but the read only - * returns a small amount, then there's no need to pin that - * much memory to the process. + * Attempt to grab multiple physically contiguous pages. */ - down_read(¤t->mm->mmap_sem); - ret = get_user_pages(current, current->mm, address, 1, - !is_write, 0, &page, NULL); - up_read(¤t->mm->mmap_sem); - if (ret < 0) - return ret; - - if (dev->version) { - /* Device version 1 or newer (qemu-android) expects the - * physical address. */ - xaddr = page_to_phys(page) | (address & ~PAGE_MASK); - } else { - /* Device version 0 (classic emulator) expects the - * virtual address. 
*/ - xaddr = address; + first_page = address & PAGE_MASK; + last_page = (address_end - 1) & PAGE_MASK; + requested_pages = ((last_page - first_page) >> PAGE_SHIFT) + 1; + if (requested_pages > MAX_PAGES_TO_GRAB) { + requested_pages = MAX_PAGES_TO_GRAB; } + ret = get_user_pages_fast(first_page, requested_pages, + !is_write, pages); + + DPRINT("%s: requested pages: %d %d\n", __FUNCTION__, ret, requested_pages); + if (ret == 0) { + DPRINT("%s: error: (requested pages == 0) (wanted %d)\n", + __FUNCTION__, requested_pages); + return ret; + } + if (ret < 0) { + DPRINT("%s: (requested pages < 0) %d \n", + __FUNCTION__, requested_pages); + return ret; + } + + xaddr = page_to_phys(pages[0]) | (address & ~PAGE_MASK); + xaddr_prev = xaddr; + num_contiguous_pages = ret == 0 ? 0 : 1; + for (page_i = 1; page_i < ret; page_i++) { + xaddr_i = page_to_phys(pages[page_i]) | (address & ~PAGE_MASK); + if (xaddr_i == xaddr_prev + PAGE_SIZE) { + page_end += PAGE_SIZE; + xaddr_prev = xaddr_i; + num_contiguous_pages++; + } else { + DPRINT("%s: discontinuous page boundary: %d pages instead\n", + __FUNCTION__, page_i); + break; + } + } + next = page_end < address_end ? page_end : address_end; + avail = next - address; /* Now, try to transfer the bytes in the current page */ spin_lock_irqsave(&dev->lock, irq_flags); if (access_with_param(dev, - is_write ? CMD_WRITE_BUFFER : CMD_READ_BUFFER, - xaddr, avail, pipe, &status)) { + is_write ? CMD_WRITE_BUFFER : CMD_READ_BUFFER, + xaddr, avail, pipe, &status)) { gf_write_ptr(pipe, dev->base + PIPE_REG_CHANNEL, dev->base + PIPE_REG_CHANNEL_HIGH); writel(avail, dev->base + PIPE_REG_SIZE); @@ -343,9 +370,13 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, } spin_unlock_irqrestore(&dev->lock, irq_flags); - if (status > 0 && !is_write) - set_page_dirty(page); - put_page(page); + for (page_i = 0; page_i < ret; page_i++) { + if (status > 0 && !is_write && + page_i < num_contiguous_pages) { + set_page_dirty(pages[page_i]); + } + put_page(pages[page_i]); + } if (status > 0) { /* Correct transfer */ count += status; @@ -367,7 +398,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, */ if (status != PIPE_ERROR_AGAIN) pr_info_ratelimited("goldfish_pipe: backend returned error %d on %s\n", - status, is_write ? "write" : "read"); + status, is_write ? "write" : "read"); ret = 0; break; } @@ -377,7 +408,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, * non-blocking mode, just return the error code. */ if (status != PIPE_ERROR_AGAIN || - (filp->f_flags & O_NONBLOCK) != 0) { + (filp->f_flags & O_NONBLOCK) != 0) { ret = goldfish_pipe_error_convert(status); break; } @@ -391,7 +422,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, /* Tell the emulator we're going to wait for a wake event */ goldfish_cmd(pipe, - is_write ? CMD_WAKE_ON_WRITE : CMD_WAKE_ON_READ); + is_write ? 
		CMD_WAKE_ON_WRITE : CMD_WAKE_ON_READ);
 
 	/* Unlock the pipe, then wait for the wake signal */
 	mutex_unlock(&pipe->lock);
 
@@ -399,15 +430,11 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
 		while (test_bit(wakeBit, &pipe->flags)) {
 			if (wait_event_interruptible(
 					pipe->wake_queue,
-					!test_bit(wakeBit, &pipe->flags))) {
-				ret = -ERESTARTSYS;
-				break;
-			}
+					!test_bit(wakeBit, &pipe->flags)))
+				return -ERESTARTSYS;
 
-			if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags)) {
-				ret = -EIO;
-				break;
-			}
+			if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags))
+				return -EIO;
 		}
 
 		/* Try to re-acquire the lock */
@@ -543,6 +570,8 @@ static int goldfish_pipe_open(struct inode *inode, struct file *file)
 	pipe->dev = dev;
 	mutex_init(&pipe->lock);
+	DPRINT("%s: call. pipe_dev pipe_dev=0x%lx new_pipe_addr=0x%lx file=0x%lx\n", __FUNCTION__, pipe_dev, pipe, file);
+	// spin lock init, write head of list, i guess
 	init_waitqueue_head(&pipe->wake_queue);
 
 	/*
@@ -565,6 +594,7 @@ static int goldfish_pipe_release(struct inode *inode, struct file *filp)
 {
 	struct goldfish_pipe *pipe = filp->private_data;
 
+	DPRINT("%s: call. pipe=0x%lx file=0x%lx\n", __FUNCTION__, pipe, filp);
 	/* The guest is closing the channel, so tell the emulator right now */
 	goldfish_cmd(pipe, CMD_CLOSE);
 	kfree(pipe);
@@ -589,6 +619,7 @@ static struct miscdevice goldfish_pipe_device = {
 
 static int goldfish_pipe_probe(struct platform_device *pdev)
 {
+	DPRINT("%s: call. platform_device=0x%lx\n", __FUNCTION__, pdev);
 	int err;
 	struct resource *r;
 	struct goldfish_pipe_dev *dev = pipe_dev;

From bc43565e1ac5ba3f204886a2275726bb4c3d44e6 Mon Sep 17 00:00:00 2001
From: Yurii Zubrytskyi
Date: Fri, 29 Jul 2016 10:51:46 -0700
Subject: [PATCH 0555/3220] ANDROID: goldfish_pipe: An implementation of more
 parallel pipe

This is the driver code for a redesigned Android pipe. Currently it
works for x86 and x64 emulators with the following performance results:

ADB push to /dev/null, Ubuntu, 400 MB file, times are for 1/10/100
parallel adb commands
x86 adb push: (4.4s / 11.5s / 2m10s) -> (2.8s / 6s / 51s)
x64 adb push: (7s / 15s / too long, 6m+) -> (2.7s / 6.2s / 52s)

ADB pull and push to /data/ show the same percentage of speedup.

More importantly, I don't see any signs of slowdowns when run in
parallel with the Antutu benchmark, so it is definitely doing a much
better job at multithreading.

The code features dynamic host detection: an old emulator gets the
previous version of the pipe driver code.
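In outline, probe can now dispatch on the device's version register
along these lines (a sketch only; the v2 entry-point name and the
version constant are assumptions modeled on the v1 entry points this
patch introduces):

	dev->version = readl(dev->base + PIPE_REG_VERSION);
	if (dev->version < PIPE_CURRENT_DEVICE_VERSION)
		err = goldfish_pipe_device_init_v1(pdev);	/* legacy driver */
	else
		err = goldfish_pipe_device_init_v2(pdev);	/* parallel driver */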
Combine follow patch from android-goldfish-3.10 b543285 [pipe] Increase the default pipe buffers size, make it configurable Signed-off-by: "Yurii Zubrytskyi" Change-Id: I140d506204cab6e78dd503e5a43abc8886e4ffff --- drivers/platform/goldfish/Makefile | 2 +- drivers/platform/goldfish/goldfish_pipe.c | 168 +--- drivers/platform/goldfish/goldfish_pipe.h | 91 ++ drivers/platform/goldfish/goldfish_pipe_v2.c | 888 +++++++++++++++++++ 4 files changed, 1005 insertions(+), 144 deletions(-) create mode 100644 drivers/platform/goldfish/goldfish_pipe.h create mode 100644 drivers/platform/goldfish/goldfish_pipe_v2.c diff --git a/drivers/platform/goldfish/Makefile b/drivers/platform/goldfish/Makefile index d3487125838c..e53ae2fc717b 100644 --- a/drivers/platform/goldfish/Makefile +++ b/drivers/platform/goldfish/Makefile @@ -2,4 +2,4 @@ # Makefile for Goldfish platform specific drivers # obj-$(CONFIG_GOLDFISH_BUS) += pdev_bus.o -obj-$(CONFIG_GOLDFISH_PIPE) += goldfish_pipe.o +obj-$(CONFIG_GOLDFISH_PIPE) += goldfish_pipe.o goldfish_pipe_v2.o diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index 55929ed58ea3..cf7ce97e7346 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -15,51 +15,11 @@ * */ -/* This source file contains the implementation of a special device driver - * that intends to provide a *very* fast communication channel between the - * guest system and the QEMU emulator. - * - * Usage from the guest is simply the following (error handling simplified): - * - * int fd = open("/dev/qemu_pipe",O_RDWR); - * .... write() or read() through the pipe. - * - * This driver doesn't deal with the exact protocol used during the session. - * It is intended to be as simple as something like: - * - * // do this _just_ after opening the fd to connect to a specific - * // emulator service. - * const char* msg = ""; - * if (write(fd, msg, strlen(msg)+1) < 0) { - * ... could not connect to service - * close(fd); - * } - * - * // after this, simply read() and write() to communicate with the - * // service. Exact protocol details left as an exercise to the reader. - * - * This driver is very fast because it doesn't copy any data through - * intermediate buffers, since the emulator is capable of translating - * guest user addresses into host ones. - * - * Note that we must however ensure that each user page involved in the - * exchange is properly mapped during a transfer. +/* This source file contains the implementation of the legacy version of + * a goldfish pipe device driver. See goldfish_pipe_v2.c for the current + * version. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "goldfish_pipe.h" /* * IMPORTANT: The following constants must match the ones used and defined @@ -119,6 +79,14 @@ #define DPRINT(...) #endif +/* This data type models a given pipe instance */ +struct goldfish_pipe { + struct goldfish_pipe_dev *dev; + struct mutex lock; + unsigned long flags; + wait_queue_head_t wake_queue; +}; + struct access_params { unsigned long channel; u32 size; @@ -129,29 +97,6 @@ struct access_params { u32 flags; }; -/* The global driver data. Holds a reference to the i/o page used to - * communicate with the emulator, and a wake queue for blocked tasks - * waiting to be awoken. 
- */ -struct goldfish_pipe_dev { - spinlock_t lock; - unsigned char __iomem *base; - struct access_params *aps; - int irq; - u32 version; -}; - -static struct goldfish_pipe_dev pipe_dev[1]; - -/* This data type models a given pipe instance */ -struct goldfish_pipe { - struct goldfish_pipe_dev *dev; - struct mutex lock; - unsigned long flags; - wait_queue_head_t wake_queue; -}; - - /* Bit flags for the 'flags' field */ enum { BIT_CLOSED_ON_HOST = 0, /* pipe closed by host */ @@ -323,7 +268,8 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, ret = get_user_pages_fast(first_page, requested_pages, !is_write, pages); - DPRINT("%s: requested pages: %d %d\n", __FUNCTION__, ret, requested_pages); + DPRINT("%s: requested pages: %d %d %p\n", __FUNCTION__, + ret, requested_pages, first_page); if (ret == 0) { DPRINT("%s: error: (requested pages == 0) (wanted %d)\n", __FUNCTION__, requested_pages); @@ -611,97 +557,33 @@ static const struct file_operations goldfish_pipe_fops = { .release = goldfish_pipe_release, }; -static struct miscdevice goldfish_pipe_device = { +static struct miscdevice goldfish_pipe_dev = { .minor = MISC_DYNAMIC_MINOR, .name = "goldfish_pipe", .fops = &goldfish_pipe_fops, }; -static int goldfish_pipe_probe(struct platform_device *pdev) +int goldfish_pipe_device_init_v1(struct platform_device *pdev) { - DPRINT("%s: call. platform_device=0x%lx\n", __FUNCTION__, pdev); - int err; - struct resource *r; struct goldfish_pipe_dev *dev = pipe_dev; - - /* not thread safe, but this should not happen */ - WARN_ON(dev->base != NULL); - - spin_lock_init(&dev->lock); - - r = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (r == NULL || resource_size(r) < PAGE_SIZE) { - dev_err(&pdev->dev, "can't allocate i/o page\n"); - return -EINVAL; - } - dev->base = devm_ioremap(&pdev->dev, r->start, PAGE_SIZE); - if (dev->base == NULL) { - dev_err(&pdev->dev, "ioremap failed\n"); - return -EINVAL; - } - - r = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (r == NULL) { - err = -EINVAL; - goto error; - } - dev->irq = r->start; - - err = devm_request_irq(&pdev->dev, dev->irq, goldfish_pipe_interrupt, + int err = devm_request_irq(&pdev->dev, dev->irq, goldfish_pipe_interrupt, IRQF_SHARED, "goldfish_pipe", dev); if (err) { - dev_err(&pdev->dev, "unable to allocate IRQ\n"); - goto error; + dev_err(&pdev->dev, "unable to allocate IRQ for v1\n"); + return err; } - err = misc_register(&goldfish_pipe_device); + err = misc_register(&goldfish_pipe_dev); if (err) { - dev_err(&pdev->dev, "unable to register device\n"); - goto error; + dev_err(&pdev->dev, "unable to register v1 device\n"); + return err; } + setup_access_params_addr(pdev, dev); - - /* Although the pipe device in the classic Android emulator does not - * recognize the 'version' register, it won't treat this as an error - * either and will simply return 0, which is fine. 
*/ - dev->version = readl(dev->base + PIPE_REG_VERSION); return 0; - -error: - dev->base = NULL; - return err; } -static int goldfish_pipe_remove(struct platform_device *pdev) +void goldfish_pipe_device_deinit_v1(struct platform_device *pdev) { - struct goldfish_pipe_dev *dev = pipe_dev; - misc_deregister(&goldfish_pipe_device); - dev->base = NULL; - return 0; + misc_deregister(&goldfish_pipe_dev); } - -static const struct acpi_device_id goldfish_pipe_acpi_match[] = { - { "GFSH0003", 0 }, - { }, -}; -MODULE_DEVICE_TABLE(acpi, goldfish_pipe_acpi_match); - -static const struct of_device_id goldfish_pipe_of_match[] = { - { .compatible = "generic,android-pipe", }, - {}, -}; -MODULE_DEVICE_TABLE(of, goldfish_pipe_of_match); - -static struct platform_driver goldfish_pipe = { - .probe = goldfish_pipe_probe, - .remove = goldfish_pipe_remove, - .driver = { - .name = "goldfish_pipe", - .of_match_table = goldfish_pipe_of_match, - .acpi_match_table = ACPI_PTR(goldfish_pipe_acpi_match), - } -}; - -module_platform_driver(goldfish_pipe); -MODULE_AUTHOR("David Turner "); -MODULE_LICENSE("GPL"); diff --git a/drivers/platform/goldfish/goldfish_pipe.h b/drivers/platform/goldfish/goldfish_pipe.h new file mode 100644 index 000000000000..9b75a51dba24 --- /dev/null +++ b/drivers/platform/goldfish/goldfish_pipe.h @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef GOLDFISH_PIPE_H +#define GOLDFISH_PIPE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Initialize the legacy version of the pipe device driver */ +int goldfish_pipe_device_init_v1(struct platform_device *pdev); + +/* Deinitialize the legacy version of the pipe device driver */ +void goldfish_pipe_device_deinit_v1(struct platform_device *pdev); + +/* Forward declarations for the device struct */ +struct goldfish_pipe; +struct goldfish_pipe_dev_buffers; + +/* The global driver data. Holds a reference to the i/o page used to + * communicate with the emulator, and a wake queue for blocked tasks + * waiting to be awoken. + */ +struct goldfish_pipe_dev { + /* + * Global device spinlock. Protects the following members: + * - pipes, pipes_capacity + * - [*pipes, *pipes + pipes_capacity) - array data + * - first_signalled_pipe, + * goldfish_pipe::prev_signalled, + * goldfish_pipe::next_signalled, + * goldfish_pipe::signalled_flags - all signalled-related fields, + * in all allocated pipes + * - open_command_params - PIPE_CMD_OPEN-related buffers + * + * It looks like a lot of different fields, but the trick is that the only + * operation that happens often is the signalled pipes array manipulation. + * That's why it's OK for now to keep the rest of the fields under the same + * lock. If we notice too much contention because of PIPE_CMD_OPEN, + * then we should add a separate lock there.
+ */ + spinlock_t lock; + + /* + * Array of the pipes of |pipes_capacity| elements, + * indexed by goldfish_pipe::id + */ + struct goldfish_pipe **pipes; + u32 pipes_capacity; + + /* Pointers to the buffers host uses for interaction with this driver */ + struct goldfish_pipe_dev_buffers *buffers; + + /* Head of a doubly linked list of signalled pipes */ + struct goldfish_pipe *first_signalled_pipe; + + /* Some device-specific data */ + int irq; + int version; + unsigned char __iomem *base; + + /* v1-specific access parameters */ + struct access_params *aps; +}; + +extern struct goldfish_pipe_dev pipe_dev[1]; + +#endif /* GOLDFISH_PIPE_H */ diff --git a/drivers/platform/goldfish/goldfish_pipe_v2.c b/drivers/platform/goldfish/goldfish_pipe_v2.c new file mode 100644 index 000000000000..bdb039bc7dde --- /dev/null +++ b/drivers/platform/goldfish/goldfish_pipe_v2.c @@ -0,0 +1,888 @@ +/* + * Copyright (C) 2012 Intel, Inc. + * Copyright (C) 2013 Intel, Inc. + * Copyright (C) 2014 Linaro Limited + * Copyright (C) 2011-2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +/* This source file contains the implementation of a special device driver + * that intends to provide a *very* fast communication channel between the + * guest system and the QEMU emulator. + * + * Usage from the guest is simply the following (error handling simplified): + * + * int fd = open("/dev/qemu_pipe",O_RDWR); + * .... write() or read() through the pipe. + * + * This driver doesn't deal with the exact protocol used during the session. + * It is intended to be as simple as something like: + * + * // do this _just_ after opening the fd to connect to a specific + * // emulator service. + * const char* msg = ""; + * if (write(fd, msg, strlen(msg)+1) < 0) { + * ... could not connect to service + * close(fd); + * } + * + * // after this, simply read() and write() to communicate with the + * // service. Exact protocol details left as an exercise to the reader. + * + * This driver is very fast because it doesn't copy any data through + * intermediate buffers, since the emulator is capable of translating + * guest user addresses into host ones. + * + * Note that we must however ensure that each user page involved in the + * exchange is properly mapped during a transfer. + */ + +#include "goldfish_pipe.h" + + +/* + * Update this when something changes in the driver's behavior so the host + * can benefit from knowing it + */ +enum { + PIPE_DRIVER_VERSION = 2, + PIPE_CURRENT_DEVICE_VERSION = 2 +}; + +/* + * IMPORTANT: The following constants must match the ones used and defined + * in external/qemu/hw/goldfish_pipe.c in the Android source tree. 
+ */ + +/* List of bitflags returned in status of CMD_POLL command */ +enum PipePollFlags { + PIPE_POLL_IN = 1 << 0, + PIPE_POLL_OUT = 1 << 1, + PIPE_POLL_HUP = 1 << 2 +}; + +/* Possible status values used to signal errors - see goldfish_pipe_error_convert */ +enum PipeErrors { + PIPE_ERROR_INVAL = -1, + PIPE_ERROR_AGAIN = -2, + PIPE_ERROR_NOMEM = -3, + PIPE_ERROR_IO = -4 +}; + +/* Bit-flags used to signal events from the emulator */ +enum PipeWakeFlags { + PIPE_WAKE_CLOSED = 1 << 0, /* emulator closed pipe */ + PIPE_WAKE_READ = 1 << 1, /* pipe can now be read from */ + PIPE_WAKE_WRITE = 1 << 2 /* pipe can now be written to */ +}; + +/* Bit flags for the 'flags' field */ +enum PipeFlagsBits { + BIT_CLOSED_ON_HOST = 0, /* pipe closed by host */ + BIT_WAKE_ON_WRITE = 1, /* want to be woken on writes */ + BIT_WAKE_ON_READ = 2, /* want to be woken on reads */ +}; + +enum PipeRegs { + PIPE_REG_CMD = 0, + + PIPE_REG_SIGNAL_BUFFER_HIGH = 4, + PIPE_REG_SIGNAL_BUFFER = 8, + PIPE_REG_SIGNAL_BUFFER_COUNT = 12, + + PIPE_REG_OPEN_BUFFER_HIGH = 20, + PIPE_REG_OPEN_BUFFER = 24, + + PIPE_REG_VERSION = 36, + + PIPE_REG_GET_SIGNALLED = 48, +}; + +enum PipeCmdCode { + PIPE_CMD_OPEN = 1, /* to be used by the pipe device itself */ + PIPE_CMD_CLOSE, + PIPE_CMD_POLL, + PIPE_CMD_WRITE, + PIPE_CMD_WAKE_ON_WRITE, + PIPE_CMD_READ, + PIPE_CMD_WAKE_ON_READ, + + /* + * TODO(zyy): implement a deferred read/write execution to allow parallel + * processing of pipe operations on the host. + */ + PIPE_CMD_WAKE_ON_DONE_IO, +}; + +enum { + MAX_BUFFERS_PER_COMMAND = 336, + MAX_SIGNALLED_PIPES = 64, + INITIAL_PIPES_CAPACITY = 64 +}; + +struct goldfish_pipe_dev; +struct goldfish_pipe; +struct goldfish_pipe_command; + +/* A per-pipe command structure, shared with the host */ +struct goldfish_pipe_command { + s32 cmd; /* PipeCmdCode, guest -> host */ + s32 id; /* pipe id, guest -> host */ + s32 status; /* command execution status, host -> guest */ + s32 reserved; /* to pad to 64-bit boundary */ + union { + /* Parameters for PIPE_CMD_{READ,WRITE} */ + struct { + u32 buffers_count; /* number of buffers, guest -> host */ + s32 consumed_size; /* number of consumed bytes, host -> guest */ + u64 ptrs[MAX_BUFFERS_PER_COMMAND]; /* buffer pointers, guest -> host */ + u32 sizes[MAX_BUFFERS_PER_COMMAND]; /* buffer sizes, guest -> host */ + } rw_params; + }; +}; + +/* Information about a single signalled pipe */ +struct signalled_pipe_buffer { + u32 id; + u32 flags; +}; + +/* Parameters for the PIPE_CMD_OPEN command */ +struct open_command_param { + u64 command_buffer_ptr; + u32 rw_params_max_count; +}; + +/* Device-level set of buffers shared with the host */ +struct goldfish_pipe_dev_buffers { + struct open_command_param open_command_params; + struct signalled_pipe_buffer signalled_pipe_buffers[MAX_SIGNALLED_PIPES]; +}; + +/* This data type models a given pipe instance */ +struct goldfish_pipe { + u32 id; /* pipe ID - index into goldfish_pipe_dev::pipes array */ + unsigned long flags; /* The wake flags the pipe is waiting for + * Note: not protected with any lock, uses atomic operations + * and barriers to make it thread-safe. + */ + unsigned long signalled_flags; /* wake flags the host has signalled, + * - protected by goldfish_pipe_dev::lock */ + + struct goldfish_pipe_command *command_buffer; /* A pointer to the command buffer */ + + /* doubly linked list of signalled pipes, protected by goldfish_pipe_dev::lock */ + struct goldfish_pipe *prev_signalled; + struct goldfish_pipe *next_signalled; + + /* + * A pipe's own lock.
Protects the following: + * - *command_buffer - makes sure a command can safely write its parameters + * to the host and read the results back. + */ + struct mutex lock; + + wait_queue_head_t wake_queue; /* A wake queue for sleeping until host signals an event */ + struct goldfish_pipe_dev *dev; /* Pointer to the parent goldfish_pipe_dev instance */ +}; + +struct goldfish_pipe_dev pipe_dev[1] = {}; + +static int goldfish_cmd_locked(struct goldfish_pipe *pipe, enum PipeCmdCode cmd) +{ + pipe->command_buffer->cmd = cmd; + pipe->command_buffer->status = PIPE_ERROR_INVAL; /* failure by default */ + writel(pipe->id, pipe->dev->base + PIPE_REG_CMD); + return pipe->command_buffer->status; +} + +static int goldfish_cmd(struct goldfish_pipe *pipe, enum PipeCmdCode cmd) +{ + int status; + if (mutex_lock_interruptible(&pipe->lock)) + return PIPE_ERROR_IO; + status = goldfish_cmd_locked(pipe, cmd); + mutex_unlock(&pipe->lock); + return status; +} + +/* + * This function converts an error code returned by the emulator through + * the PIPE_REG_STATUS i/o register into a valid negative errno value. + */ +static int goldfish_pipe_error_convert(int status) +{ + switch (status) { + case PIPE_ERROR_AGAIN: + return -EAGAIN; + case PIPE_ERROR_NOMEM: + return -ENOMEM; + case PIPE_ERROR_IO: + return -EIO; + default: + return -EINVAL; + } +} + +static int pin_user_pages(unsigned long first_page, unsigned long last_page, + unsigned last_page_size, int is_write, + struct page *pages[MAX_BUFFERS_PER_COMMAND], unsigned *iter_last_page_size) +{ + int ret; + int requested_pages = ((last_page - first_page) >> PAGE_SHIFT) + 1; + if (requested_pages > MAX_BUFFERS_PER_COMMAND) { + requested_pages = MAX_BUFFERS_PER_COMMAND; + *iter_last_page_size = PAGE_SIZE; + } else { + *iter_last_page_size = last_page_size; + } + + ret = get_user_pages_fast( + first_page, requested_pages, !is_write, pages); + if (ret <= 0) + return -EFAULT; + if (ret < requested_pages) + *iter_last_page_size = PAGE_SIZE; + return ret; + +} + +static void release_user_pages(struct page **pages, int pages_count, + int is_write, s32 consumed_size) +{ + int i; + for (i = 0; i < pages_count; i++) { + if (!is_write && consumed_size > 0) { + set_page_dirty(pages[i]); + } + put_page(pages[i]); + } +} + +/* Populate the call parameters, merging adjacent pages together */ +static void populate_rw_params( + struct page **pages, int pages_count, + unsigned long address, unsigned long address_end, + unsigned long first_page, unsigned long last_page, + unsigned iter_last_page_size, int is_write, + struct goldfish_pipe_command *command) +{ + /* + * Process the first page separately - it's the only page that + * needs special handling for its start address. + */ + unsigned long xaddr = page_to_phys(pages[0]); + unsigned long xaddr_prev = xaddr; + int buffer_idx = 0; + int i = 1; + int size_on_page = first_page == last_page + ? (int)(address_end - address) + : (PAGE_SIZE - (address & ~PAGE_MASK)); + command->rw_params.ptrs[0] = (u64)(xaddr | (address & ~PAGE_MASK)); + command->rw_params.sizes[0] = size_on_page; + for (; i < pages_count; ++i) { + xaddr = page_to_phys(pages[i]); + size_on_page = (i == pages_count - 1) ? 
iter_last_page_size : PAGE_SIZE; + if (xaddr == xaddr_prev + PAGE_SIZE) { + command->rw_params.sizes[buffer_idx] += size_on_page; + } else { + ++buffer_idx; + command->rw_params.ptrs[buffer_idx] = (u64)xaddr; + command->rw_params.sizes[buffer_idx] = size_on_page; + } + xaddr_prev = xaddr; + } + command->rw_params.buffers_count = buffer_idx + 1; +} + +static int transfer_max_buffers(struct goldfish_pipe* pipe, + unsigned long address, unsigned long address_end, int is_write, + unsigned long last_page, unsigned int last_page_size, + s32* consumed_size, int* status) +{ + struct page *pages[MAX_BUFFERS_PER_COMMAND]; + unsigned long first_page = address & PAGE_MASK; + unsigned int iter_last_page_size; + int pages_count = pin_user_pages(first_page, last_page, + last_page_size, is_write, + pages, &iter_last_page_size); + if (pages_count < 0) + return pages_count; + + /* Serialize access to the pipe command buffers */ + if (mutex_lock_interruptible(&pipe->lock)) + return -ERESTARTSYS; + + populate_rw_params(pages, pages_count, address, address_end, + first_page, last_page, iter_last_page_size, is_write, + pipe->command_buffer); + + /* Transfer the data */ + *status = goldfish_cmd_locked(pipe, + is_write ? PIPE_CMD_WRITE : PIPE_CMD_READ); + + *consumed_size = pipe->command_buffer->rw_params.consumed_size; + + mutex_unlock(&pipe->lock); + + release_user_pages(pages, pages_count, is_write, *consumed_size); + + return 0; +} + +static int wait_for_host_signal(struct goldfish_pipe *pipe, int is_write) +{ + u32 wakeBit = is_write ? BIT_WAKE_ON_WRITE : BIT_WAKE_ON_READ; + set_bit(wakeBit, &pipe->flags); + + /* Tell the emulator we're going to wait for a wake event */ + (void)goldfish_cmd(pipe, + is_write ? PIPE_CMD_WAKE_ON_WRITE : PIPE_CMD_WAKE_ON_READ); + + while (test_bit(wakeBit, &pipe->flags)) { + if (wait_event_interruptible( + pipe->wake_queue, + !test_bit(wakeBit, &pipe->flags))) + return -ERESTARTSYS; + + if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags)) + return -EIO; + } + + return 0; +} + +static ssize_t goldfish_pipe_read_write(struct file *filp, + char __user *buffer, size_t bufflen, int is_write) +{ + struct goldfish_pipe *pipe = filp->private_data; + int count = 0, ret = -EINVAL; + unsigned long address, address_end, last_page; + unsigned int last_page_size; + + /* If the emulator already closed the pipe, no need to go further */ + if (unlikely(test_bit(BIT_CLOSED_ON_HOST, &pipe->flags))) + return -EIO; + /* Null reads or writes succeed */ + if (unlikely(bufflen == 0)) + return 0; + /* Check the buffer range for access */ + if (unlikely(!access_ok(is_write ? VERIFY_WRITE : VERIFY_READ, + buffer, bufflen))) + return -EFAULT; + + address = (unsigned long)buffer; + address_end = address + bufflen; + last_page = (address_end - 1) & PAGE_MASK; + last_page_size = ((address_end - 1) & ~PAGE_MASK) + 1; + + while (address < address_end) { + s32 consumed_size; + int status; + ret = transfer_max_buffers(pipe, address, address_end, is_write, + last_page, last_page_size, &consumed_size, &status); + if (ret < 0) + break; + + if (consumed_size > 0) { + /* No matter what the status is, we've transferred something */ + count += consumed_size; + address += consumed_size; + } + if (status > 0) + continue; + if (status == 0) { + /* EOF */ + ret = 0; + break; + } + if (count > 0) { + /* + * An error occurred, but we already transferred + * something on one of the previous iterations. + * Just return what we already copied and log this + * err.
+ */ + if (status != PIPE_ERROR_AGAIN) + pr_info_ratelimited("goldfish_pipe: backend error %d on %s\n", + status, is_write ? "write" : "read"); + break; + } + + /* + * If the error is not PIPE_ERROR_AGAIN, or if we are in + * non-blocking mode, just return the error code. + */ + if (status != PIPE_ERROR_AGAIN || (filp->f_flags & O_NONBLOCK) != 0) { + ret = goldfish_pipe_error_convert(status); + break; + } + + status = wait_for_host_signal(pipe, is_write); + if (status < 0) + return status; + } + + if (count > 0) + return count; + return ret; +} + +static ssize_t goldfish_pipe_read(struct file *filp, char __user *buffer, + size_t bufflen, loff_t *ppos) +{ + return goldfish_pipe_read_write(filp, buffer, bufflen, /* is_write */ 0); +} + +static ssize_t goldfish_pipe_write(struct file *filp, + const char __user *buffer, size_t bufflen, + loff_t *ppos) +{ + return goldfish_pipe_read_write(filp, + /* cast away the const */(char __user *)buffer, bufflen, + /* is_write */ 1); +} + +static unsigned int goldfish_pipe_poll(struct file *filp, poll_table *wait) +{ + struct goldfish_pipe *pipe = filp->private_data; + unsigned int mask = 0; + int status; + + poll_wait(filp, &pipe->wake_queue, wait); + + status = goldfish_cmd(pipe, PIPE_CMD_POLL); + if (status < 0) { + return -ERESTARTSYS; + } + + if (status & PIPE_POLL_IN) + mask |= POLLIN | POLLRDNORM; + if (status & PIPE_POLL_OUT) + mask |= POLLOUT | POLLWRNORM; + if (status & PIPE_POLL_HUP) + mask |= POLLHUP; + if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags)) + mask |= POLLERR; + + return mask; +} + +static void signalled_pipes_add_locked(struct goldfish_pipe_dev *dev, + u32 id, u32 flags) +{ + struct goldfish_pipe *pipe; + + BUG_ON(id >= dev->pipes_capacity); + + pipe = dev->pipes[id]; + if (!pipe) + return; + pipe->signalled_flags |= flags; + + if (pipe->prev_signalled || pipe->next_signalled + || dev->first_signalled_pipe == pipe) + return; /* already in the list */ + pipe->next_signalled = dev->first_signalled_pipe; + if (dev->first_signalled_pipe) { + dev->first_signalled_pipe->prev_signalled = pipe; + } + dev->first_signalled_pipe = pipe; +} + +static void signalled_pipes_remove_locked(struct goldfish_pipe_dev *dev, + struct goldfish_pipe *pipe) { + if (pipe->prev_signalled) + pipe->prev_signalled->next_signalled = pipe->next_signalled; + if (pipe->next_signalled) + pipe->next_signalled->prev_signalled = pipe->prev_signalled; + if (pipe == dev->first_signalled_pipe) + dev->first_signalled_pipe = pipe->next_signalled; + pipe->prev_signalled = NULL; + pipe->next_signalled = NULL; +} + +static struct goldfish_pipe *signalled_pipes_pop_front(struct goldfish_pipe_dev *dev, + int *wakes) +{ + struct goldfish_pipe *pipe; + unsigned long flags; + spin_lock_irqsave(&dev->lock, flags); + + pipe = dev->first_signalled_pipe; + if (pipe) { + *wakes = pipe->signalled_flags; + pipe->signalled_flags = 0; + /* + * This is an optimized version of signalled_pipes_remove_locked() - + * we want waking up the sleeping pipe operations to be as fast + * as possible + */ + dev->first_signalled_pipe = pipe->next_signalled; + if (dev->first_signalled_pipe) + dev->first_signalled_pipe->prev_signalled = NULL; + pipe->next_signalled = NULL; + } + + spin_unlock_irqrestore(&dev->lock, flags); + return pipe; +} + +static void goldfish_interrupt_task(unsigned long unused) +{ + struct goldfish_pipe_dev *dev = pipe_dev; + /* Iterate over the signalled pipes and wake them one by one */ + struct goldfish_pipe *pipe; + int wakes; + while ((pipe =
signalled_pipes_pop_front(dev, &wakes)) != NULL) { + if (wakes & PIPE_WAKE_CLOSED) { + pipe->flags = 1 << BIT_CLOSED_ON_HOST; + } else { + if (wakes & PIPE_WAKE_READ) + clear_bit(BIT_WAKE_ON_READ, &pipe->flags); + if (wakes & PIPE_WAKE_WRITE) + clear_bit(BIT_WAKE_ON_WRITE, &pipe->flags); + } + /* + * wake_up_interruptible() implies a write barrier, so don't explicitly + * add another one here. + */ + wake_up_interruptible(&pipe->wake_queue); + } +} +DECLARE_TASKLET(goldfish_interrupt_tasklet, goldfish_interrupt_task, 0); + +/* + * The general idea of the interrupt handling: + * + * 1. device raises an interrupt if there's at least one signalled pipe + * 2. IRQ handler reads the signalled pipes and their count from the device + * 3. device writes them into a shared buffer and returns the count + * it only resets the IRQ if it has returned all signalled pipes, + * otherwise it leaves it raised, so IRQ handler will be called + * again for the next chunk + * 4. IRQ handler adds all returned pipes to the device's signalled pipes list + * 5. IRQ handler launches a tasklet to process the signalled pipes from the + * list in a separate context + */ +static irqreturn_t goldfish_pipe_interrupt(int irq, void *dev_id) +{ + u32 count; + u32 i; + unsigned long flags; + struct goldfish_pipe_dev *dev = dev_id; + if (dev != pipe_dev) + return IRQ_NONE; + + /* Request the signalled pipes from the device */ + spin_lock_irqsave(&dev->lock, flags); + + count = readl(dev->base + PIPE_REG_GET_SIGNALLED); + if (count == 0) { + spin_unlock_irqrestore(&dev->lock, flags); + return IRQ_NONE; + } + if (count > MAX_SIGNALLED_PIPES) + count = MAX_SIGNALLED_PIPES; + + for (i = 0; i < count; ++i) + signalled_pipes_add_locked(dev, + dev->buffers->signalled_pipe_buffers[i].id, + dev->buffers->signalled_pipe_buffers[i].flags); + + spin_unlock_irqrestore(&dev->lock, flags); + + tasklet_schedule(&goldfish_interrupt_tasklet); + return IRQ_HANDLED; +} + +static int get_free_pipe_id_locked(struct goldfish_pipe_dev *dev) +{ + int id; + for (id = 0; id < dev->pipes_capacity; ++id) + if (!dev->pipes[id]) + return id; + + { + /* Reallocate the array */ + u32 new_capacity = 2 * dev->pipes_capacity; + struct goldfish_pipe **pipes = + kcalloc(new_capacity, sizeof(*pipes), GFP_KERNEL); + if (!pipes) + return -ENOMEM; + memcpy(pipes, dev->pipes, sizeof(*pipes) * dev->pipes_capacity); + kfree(dev->pipes); + dev->pipes = pipes; + id = dev->pipes_capacity; + dev->pipes_capacity = new_capacity; + } + return id; +} + +/** + * goldfish_pipe_open - open a channel to the AVD + * @inode: inode of device + * @file: file struct of opener + * + * Create a new pipe link between the emulator and the user application. + * Each new request produces a new pipe. + * + * Note: we use the pipe ID as a mux. All goldfish emulations are 32bit + * right now so this is fine. A move to 64bit will need to revisit this addressing + */ +static int goldfish_pipe_open(struct inode *inode, struct file *file) +{ + struct goldfish_pipe_dev *dev = pipe_dev; + unsigned long flags; + int id; + int status; + + /* Allocate new pipe kernel object */ + struct goldfish_pipe *pipe = kzalloc(sizeof(*pipe), GFP_KERNEL); + if (pipe == NULL) + return -ENOMEM; + + pipe->dev = dev; + mutex_init(&pipe->lock); + init_waitqueue_head(&pipe->wake_queue); + + /* + * The command buffer needs to be allocated on its own page to make sure + * it is physically contiguous in the host's address space.
+ */ + pipe->command_buffer = + (struct goldfish_pipe_command*)__get_free_page(GFP_KERNEL); + if (!pipe->command_buffer) { + status = -ENOMEM; + goto err_pipe; + } + + spin_lock_irqsave(&dev->lock, flags); + + id = get_free_pipe_id_locked(dev); + if (id < 0) { + status = id; + goto err_id_locked; + } + + dev->pipes[id] = pipe; + pipe->id = id; + pipe->command_buffer->id = id; + + /* Now tell the emulator we're opening a new pipe. */ + dev->buffers->open_command_params.rw_params_max_count = + MAX_BUFFERS_PER_COMMAND; + dev->buffers->open_command_params.command_buffer_ptr = + (u64)(unsigned long)__pa(pipe->command_buffer); + status = goldfish_cmd_locked(pipe, PIPE_CMD_OPEN); + spin_unlock_irqrestore(&dev->lock, flags); + if (status < 0) + goto err_cmd; + /* All is done, save the pipe into the file's private data field */ + file->private_data = pipe; + return 0; + +err_cmd: + spin_lock_irqsave(&dev->lock, flags); + dev->pipes[id] = NULL; +err_id_locked: + spin_unlock_irqrestore(&dev->lock, flags); + free_page((unsigned long)pipe->command_buffer); +err_pipe: + kfree(pipe); + return status; +} + +static int goldfish_pipe_release(struct inode *inode, struct file *filp) +{ + unsigned long flags; + struct goldfish_pipe *pipe = filp->private_data; + struct goldfish_pipe_dev *dev = pipe->dev; + + /* The guest is closing the channel, so tell the emulator right now */ + (void)goldfish_cmd(pipe, PIPE_CMD_CLOSE); + + spin_lock_irqsave(&dev->lock, flags); + dev->pipes[pipe->id] = NULL; + signalled_pipes_remove_locked(dev, pipe); + spin_unlock_irqrestore(&dev->lock, flags); + + filp->private_data = NULL; + free_page((unsigned long)pipe->command_buffer); + kfree(pipe); + return 0; +} + +static const struct file_operations goldfish_pipe_fops = { + .owner = THIS_MODULE, + .read = goldfish_pipe_read, + .write = goldfish_pipe_write, + .poll = goldfish_pipe_poll, + .open = goldfish_pipe_open, + .release = goldfish_pipe_release, +}; + +static struct miscdevice goldfish_pipe_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "goldfish_pipe", + .fops = &goldfish_pipe_fops, +}; + +static int goldfish_pipe_device_init_v2(struct platform_device *pdev) +{ + char *page; + struct goldfish_pipe_dev *dev = pipe_dev; + int err = devm_request_irq(&pdev->dev, dev->irq, goldfish_pipe_interrupt, + IRQF_SHARED, "goldfish_pipe", dev); + if (err) { + dev_err(&pdev->dev, "unable to allocate IRQ for v2\n"); + return err; + } + + err = misc_register(&goldfish_pipe_dev); + if (err) { + dev_err(&pdev->dev, "unable to register v2 device\n"); + return err; + } + + dev->first_signalled_pipe = NULL; + dev->pipes_capacity = INITIAL_PIPES_CAPACITY; + dev->pipes = kcalloc(dev->pipes_capacity, sizeof(*dev->pipes), GFP_KERNEL); + if (!dev->pipes) + return -ENOMEM; + + /* + * We're going to pass two buffers, open_command_params and + * signalled_pipe_buffers, to the host. This means each of those buffers + * needs to be contained in a single physical page. The easiest choice is + * to just allocate a page and place the buffers in it. 
+ */ + BUG_ON(sizeof(*dev->buffers) > PAGE_SIZE); + page = (char*)__get_free_page(GFP_KERNEL); + if (!page) { + kfree(dev->pipes); + return -ENOMEM; + } + dev->buffers = (struct goldfish_pipe_dev_buffers*)page; + + /* Send the buffer addresses to the host */ + { + u64 paddr = __pa(&dev->buffers->signalled_pipe_buffers); + writel((u32)(unsigned long)(paddr >> 32), dev->base + PIPE_REG_SIGNAL_BUFFER_HIGH); + writel((u32)(unsigned long)paddr, dev->base + PIPE_REG_SIGNAL_BUFFER); + writel((u32)MAX_SIGNALLED_PIPES, dev->base + PIPE_REG_SIGNAL_BUFFER_COUNT); + + paddr = __pa(&dev->buffers->open_command_params); + writel((u32)(unsigned long)(paddr >> 32), dev->base + PIPE_REG_OPEN_BUFFER_HIGH); + writel((u32)(unsigned long)paddr, dev->base + PIPE_REG_OPEN_BUFFER); + } + return 0; +} + +static void goldfish_pipe_device_deinit_v2(struct platform_device *pdev) { + struct goldfish_pipe_dev *dev = pipe_dev; + misc_deregister(&goldfish_pipe_dev); + kfree(dev->pipes); + free_page((unsigned long)dev->buffers); +} + +static int goldfish_pipe_probe(struct platform_device *pdev) +{ + int err; + struct resource *r; + struct goldfish_pipe_dev *dev = pipe_dev; + + BUG_ON(sizeof(struct goldfish_pipe_command) > PAGE_SIZE); + + /* not thread safe, but this should not happen */ + WARN_ON(dev->base != NULL); + + spin_lock_init(&dev->lock); + + r = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (r == NULL || resource_size(r) < PAGE_SIZE) { + dev_err(&pdev->dev, "can't allocate i/o page\n"); + return -EINVAL; + } + dev->base = devm_ioremap(&pdev->dev, r->start, PAGE_SIZE); + if (dev->base == NULL) { + dev_err(&pdev->dev, "ioremap failed\n"); + return -EINVAL; + } + + r = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + if (r == NULL) { + err = -EINVAL; + goto error; + } + dev->irq = r->start; + + /* + * Exchange the versions with the host device + * + * Note: v1 driver used to not report its version, so we write it before + * reading device version back: this allows the host implementation to + * detect the old driver (if there was no version write before read). 
+ */ + writel((u32)PIPE_DRIVER_VERSION, dev->base + PIPE_REG_VERSION); + dev->version = readl(dev->base + PIPE_REG_VERSION); + if (dev->version < PIPE_CURRENT_DEVICE_VERSION) { + /* initialize the old device version */ + err = goldfish_pipe_device_init_v1(pdev); + } else { + /* Host device supports the new interface */ + err = goldfish_pipe_device_init_v2(pdev); + } + if (!err) + return 0; + +error: + dev->base = NULL; + return err; +} + +static int goldfish_pipe_remove(struct platform_device *pdev) +{ + struct goldfish_pipe_dev *dev = pipe_dev; + if (dev->version < PIPE_CURRENT_DEVICE_VERSION) + goldfish_pipe_device_deinit_v1(pdev); + else + goldfish_pipe_device_deinit_v2(pdev); + dev->base = NULL; + return 0; +} + +static const struct acpi_device_id goldfish_pipe_acpi_match[] = { + { "GFSH0003", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, goldfish_pipe_acpi_match); + +static const struct of_device_id goldfish_pipe_of_match[] = { + { .compatible = "google,android-pipe", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_pipe_of_match); + +static struct platform_driver goldfish_pipe_driver = { + .probe = goldfish_pipe_probe, + .remove = goldfish_pipe_remove, + .driver = { + .name = "goldfish_pipe", + .of_match_table = goldfish_pipe_of_match, + .acpi_match_table = ACPI_PTR(goldfish_pipe_acpi_match), + } +}; + +module_platform_driver(goldfish_pipe_driver); +MODULE_AUTHOR("David Turner "); +MODULE_LICENSE("GPL"); From f5555c059ec1ee4a7298baa373aed789d35f0133 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Thu, 23 Jul 2015 10:40:57 -0700 Subject: [PATCH 0556/3220] ANDROID: arch: x86: disable pic for Android toolchain Android toolchains enable PIC, so explicitly disable it with -fno-pic (this is the upstream gcc default) Signed-off-by: Greg Hackmann (cherry picked from commit 892606ece2bebfa5a1ed62e9552cc973707ae9d3) Change-Id: I1e600363e5d18e459479fe4eb23d76855e16868d --- arch/x86/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 4086abca0b32..53949c886341 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -97,6 +97,8 @@ else KBUILD_CFLAGS += $(call cc-option,-mno-80387) KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) + KBUILD_CFLAGS += -fno-pic + # Use -mpreferred-stack-boundary=3 if supported. KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3) From 6297c6ba0d217d5b0998738fbfaff2f04cad77e6 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Thu, 17 Nov 2016 17:01:43 -0800 Subject: [PATCH 0557/3220] arm64: rename ranchu defconfig to ranchu64 Change-Id: Ib7cd1ef722167905957623f65c3cc064e9d5c357 --- arch/arm64/configs/{ranchu_defconfig => ranchu64_defconfig} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename arch/arm64/configs/{ranchu_defconfig => ranchu64_defconfig} (100%) diff --git a/arch/arm64/configs/ranchu_defconfig b/arch/arm64/configs/ranchu64_defconfig similarity index 100% rename from arch/arm64/configs/ranchu_defconfig rename to arch/arm64/configs/ranchu64_defconfig From 6502f5fcefadabb4ebd44583a11610751c3ff194 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 18 Nov 2016 07:26:19 +0100 Subject: [PATCH 0558/3220] ANDROID: goldfish_pipe: fix call_kern.cocci warnings Function get_free_pipe_id_locked() is called on line 671 inside a lock taken on line 669, but uses GFP_KERNEL. Replace it with GFP_ATOMIC.
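The rule behind the warning: code that holds a spinlock runs in atomic context, and a GFP_KERNEL allocation may sleep to reclaim memory, which is illegal there. A condensed sketch of the corrected pattern (illustrative, not the driver's exact code):

        unsigned long flags;
        struct goldfish_pipe **pipes;

        spin_lock_irqsave(&dev->lock, flags);
        /* GFP_ATOMIC never sleeps, so it is safe under the spinlock... */
        pipes = kcalloc(new_capacity, sizeof(*pipes), GFP_ATOMIC);
        if (!pipes) {
                /* ...but it fails more readily, so -ENOMEM must be handled. */
                spin_unlock_irqrestore(&dev->lock, flags);
                return -ENOMEM;
        }
        spin_unlock_irqrestore(&dev->lock, flags);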
Generated by: scripts/coccinelle/locks/call_kern.cocci CC: Yurii Zubrytskyi Signed-off-by: Julia Lawall Signed-off-by: Fengguang Wu Signed-off-by: Guenter Roeck --- drivers/platform/goldfish/goldfish_pipe_v2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/goldfish/goldfish_pipe_v2.c b/drivers/platform/goldfish/goldfish_pipe_v2.c index bdb039bc7dde..ad373ed36555 100644 --- a/drivers/platform/goldfish/goldfish_pipe_v2.c +++ b/drivers/platform/goldfish/goldfish_pipe_v2.c @@ -616,7 +616,8 @@ static int get_free_pipe_id_locked(struct goldfish_pipe_dev *dev) /* Reallocate the array */ u32 new_capacity = 2 * dev->pipes_capacity; struct goldfish_pipe **pipes = - kcalloc(new_capacity, sizeof(*pipes), GFP_KERNEL); + kcalloc(new_capacity, sizeof(*pipes), + GFP_ATOMIC); if (!pipes) return -ENOMEM; memcpy(pipes, dev->pipes, sizeof(*pipes) * dev->pipes_capacity); From 5f93f3cb3ab26abd6791bc1f01e6a9db73e65536 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Fri, 18 Nov 2016 13:16:07 +0800 Subject: [PATCH 0559/3220] ANDROID: video: goldfishfb: fix platform_no_drv_owner.cocci warnings drivers/video/fbdev/goldfishfb.c:318:3-8: No need to set .owner here. The core will do it. Remove .owner field if calls are used which set it automatically Generated by: scripts/coccinelle/api/platform_no_drv_owner.cocci CC: Greg Hackmann Signed-off-by: Fengguang Wu Signed-off-by: Guenter Roeck --- drivers/video/fbdev/goldfishfb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/video/fbdev/goldfishfb.c b/drivers/video/fbdev/goldfishfb.c index 131fee0341c3..1e56b50e4082 100644 --- a/drivers/video/fbdev/goldfishfb.c +++ b/drivers/video/fbdev/goldfishfb.c @@ -322,7 +322,6 @@ static struct platform_driver goldfish_fb_driver = { .remove = goldfish_fb_remove, .driver = { .name = "goldfish_fb", - .owner = THIS_MODULE, .of_match_table = goldfish_fb_of_match, .acpi_match_table = ACPI_PTR(goldfish_fb_acpi_match), } From 37e461fc4453488c394ccdbdcb8e286c122b047a Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 18 Nov 2016 11:09:02 -0800 Subject: [PATCH 0560/3220] ANDROID: goldfish: goldfish_pipe: fix locking errors If the get_user_pages_fast() call in goldfish_pipe_read_write() failed, it would return while still holding pipe->lock. goldfish_pipe_read_write() later releases and tries to re-acquire pipe->lock. If the re-acquire call failed, goldfish_pipe_read_write() would try to unlock pipe->lock on exit anyway. This fixes the smatch messages: drivers/platform/goldfish/goldfish_pipe.c:392 goldfish_pipe_read_write() error: double unlock 'mutex:&pipe->lock' drivers/platform/goldfish/goldfish_pipe.c:397 goldfish_pipe_read_write() warn: inconsistent returns 'mutex:&pipe->lock'.
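Both fixes restore the same invariant: every exit path must leave pipe->lock in a known state, unlocking exactly once per successful lock. A minimal sketch of the convention (do_transfer() is a hypothetical helper, not the driver's code):

        if (mutex_lock_interruptible(&pipe->lock))
                return -ERESTARTSYS; /* lock not taken: exit without unlocking */

        ret = do_transfer(pipe);     /* hypothetical work done under the lock */
        mutex_unlock(&pipe->lock);   /* exactly one unlock on every later path */
        return ret;

Static checkers such as smatch flag functions whose return paths disagree about the lock state, which is how these bugs were caught.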
Change-Id: Ifd06a76b32027ca451a001704ade0c5440ed69c4 Signed-off-by: Greg Hackmann --- drivers/platform/goldfish/goldfish_pipe.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index cf7ce97e7346..fd1452e28352 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -273,11 +273,13 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, if (ret == 0) { DPRINT("%s: error: (requested pages == 0) (wanted %d)\n", __FUNCTION__, requested_pages); + mutex_unlock(&pipe->lock); return ret; } if (ret < 0) { DPRINT("%s: (requested pages < 0) %d \n", __FUNCTION__, requested_pages); + mutex_unlock(&pipe->lock); return ret; } @@ -384,10 +386,8 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, } /* Try to re-acquire the lock */ - if (mutex_lock_interruptible(&pipe->lock)) { - ret = -ERESTARTSYS; - break; - } + if (mutex_lock_interruptible(&pipe->lock)) + return -ERESTARTSYS; } mutex_unlock(&pipe->lock); From dc1767cd0f28e0d1ba607bb4b5e397366ae87336 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Fri, 18 Nov 2016 11:40:40 -0800 Subject: [PATCH 0561/3220] ANDROID: goldfish_pipe: fix allmodconfig build tree: https://android.googlesource.com/kernel/common android-4.4 head: 6297c6ba0d217d5b0998738fbfaff2f04cad77e6 commit: bc43565e1ac5ba3f204886a2275726bb4c3d44e6 [18/20] ANDROID: goldfish_pipe: An implementation of more parallel pipe config: i386-randconfig-s1-201646 (attached as .config) compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901 reproduce: git checkout bc43565e1ac5ba3f204886a2275726bb4c3d44e6 # save the attached .config to linux build tree make ARCH=i386 All errors (new ones prefixed by >>): >> ERROR: "goldfish_pipe_device_deinit_v1" [drivers/platform/goldfish/goldfish_pipe_v2.ko] undefined! >> ERROR: "goldfish_pipe_device_init_v1" [drivers/platform/goldfish/goldfish_pipe_v2.ko] undefined! >> ERROR: "pipe_dev" [drivers/platform/goldfish/goldfish_pipe.ko] undefined! Change-Id: Ibd51441edf82e6bb6824acc05ea795570cc374e8 --- drivers/platform/goldfish/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/goldfish/Makefile b/drivers/platform/goldfish/Makefile index e53ae2fc717b..277a820ee4e1 100644 --- a/drivers/platform/goldfish/Makefile +++ b/drivers/platform/goldfish/Makefile @@ -2,4 +2,5 @@ # Makefile for Goldfish platform specific drivers # obj-$(CONFIG_GOLDFISH_BUS) += pdev_bus.o -obj-$(CONFIG_GOLDFISH_PIPE) += goldfish_pipe.o goldfish_pipe_v2.o +obj-$(CONFIG_GOLDFISH_PIPE) += goldfish_pipe_all.o +goldfish_pipe_all-objs := goldfish_pipe.o goldfish_pipe_v2.o From af80d7c3e91450fbcad01c497b394bf8ab01d37c Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Nov 2016 14:35:22 -0800 Subject: [PATCH 0562/3220] UPSTREAM: timekeeping: Add a fast and NMI safe boot clock This boot clock can be used as a tracing clock and will account for suspend time. To keep it NMI safe since we're accessing from tracing, we're not using a separate timekeeper with updates to monotonic clock and boot offset protected with seqlocks. This has the following minor side effects: (1) It's possible that a timestamp is taken after the boot offset is updated but before the timekeeper is updated.
If this happens, the new boot offset is added to the old timekeeping, making the clock appear to update slightly earlier: CPU 0 CPU 1 timekeeping_inject_sleeptime64() __timekeeping_inject_sleeptime(tk, delta); timestamp(); timekeeping_update(tk, TK_CLEAR_NTP...); (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be partially updated. Since the tk->offs_boot update is a rare event, this should be a rare occurrence which postprocessing should be able to handle. Bug: b/33184060 Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Reviewed-by: Thomas Gleixner Signed-off-by: Joel Fernandes Signed-off-by: John Stultz --- include/linux/timekeeping.h | 1 + kernel/time/timekeeping.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index ec89d846324c..b7246d2ed7c9 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -233,6 +233,7 @@ static inline u64 ktime_get_raw_ns(void) extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); +extern u64 ktime_get_boot_fast_ns(void); /* * Timespec interfaces utilizing the ktime based ones diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d563c1960302..7c92bad46761 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -402,6 +402,35 @@ u64 ktime_get_raw_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); +/** + * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. + * + * To keep it NMI safe since we're accessing from tracing, we're not using a + * separate timekeeper with updates to monotonic clock and boot offset + * protected with seqlocks. This has the following minor side effects: + * + * (1) It's possible that a timestamp is taken after the boot offset is updated + * but before the timekeeper is updated. If this happens, the new boot offset + * is added to the old timekeeping, making the clock appear to update slightly + * earlier: + * CPU 0 CPU 1 + * timekeeping_inject_sleeptime64() + * __timekeeping_inject_sleeptime(tk, delta); + * timestamp(); + * timekeeping_update(tk, TK_CLEAR_NTP...); + * + * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be + * partially updated. Since the tk->offs_boot update is a rare event, this + * should be a rare occurrence which postprocessing should be able to handle. + */ +u64 notrace ktime_get_boot_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); +} +EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + /* Suspend-time cycles value for halted fast timekeeper. */ static cycle_t cycles_at_suspend; From 5fe7d45f445841c6261e8695428a532db0e0377c Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Nov 2016 14:35:23 -0800 Subject: [PATCH 0563/3220] UPSTREAM: trace: Add an option for boot clock as trace clock Unlike the monotonic clock, the boot clock as a trace clock will account for time spent in suspend, which is useful for tracing suspend/resume. This uses the infrastructure introduced earlier for the fast boot clock.
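Once the clock is wired into the trace_clocks table (the diff below), selecting it for a trace session is a single write to the trace_clock file. A small sketch, assuming tracefs is mounted at the usual debugfs path:

        #include <fcntl.h>
        #include <unistd.h>

        /* Switch subsequent trace events to the NMI-safe boot clock. */
        static int use_boot_trace_clock(void)
        {
                int fd = open("/sys/kernel/debug/tracing/trace_clock", O_WRONLY);

                if (fd < 0)
                        return -1;
                if (write(fd, "boot", 4) < 0) {
                        close(fd);
                        return -1;
                }
                return close(fd);
        }

This is equivalent to running echo boot > trace_clock from the shell.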
Bug: b/33184060 Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Reviewed-by: Thomas Gleixner Signed-off-by: Joel Fernandes Signed-off-by: John Stultz Acked-by: Steven Rostedt --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5f375d4c05fb..312557c04c10 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -890,6 +890,7 @@ static struct { { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, + { ktime_get_boot_fast_ns, "boot", 1 }, ARCH_TRACE_CLOCKS }; From 7c04e2fcb076dd28760f2eca2eaf48dd37e52a40 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Nov 2016 14:35:24 -0800 Subject: [PATCH 0564/3220] UPSTREAM: trace: Update documentation for mono, mono_raw and boot clock Documentation was missing for mono and mono_raw; add it, along with documentation for the boot clock introduced in this series. Bug: b/33184060 Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Reviewed-by: Thomas Gleixner Signed-off-by: Joel Fernandes Signed-off-by: John Stultz Acked-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index cf2f9e9dfe4d..fa16fb2302a5 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -357,6 +357,26 @@ of ftrace. Here is a list of some of the key files: to correlate events across hypervisor/guest if tb_offset is known. + mono: This uses the fast monotonic clock (CLOCK_MONOTONIC) + which is monotonic and is subject to NTP rate adjustments. + + mono_raw: + This is the raw monotonic clock (CLOCK_MONOTONIC_RAW) + which is monotonic but is not subject to any rate adjustments + and ticks at the same rate as the hardware clocksource. + + boot: This is the boot clock (CLOCK_BOOTTIME) and is based on the + fast monotonic clock, but also accounts for time spent in + suspend. Since the clock access is designed for use in + tracing in the suspend path, some side effects are possible + if the clock is accessed after the suspend time is accounted + for but before the fast mono clock is updated. In this case, + the clock update appears to happen slightly sooner than it + normally would have. Also on 32-bit systems, it's possible that + the 64-bit boot offset sees a partial update. These effects are + rare and post processing should be able to handle them. See the + comments on the ktime_get_boot_fast_ns() function for more + information. + To set a clock, simply echo the clock name into this file. echo global > trace_clock From 8e53f7c02371eeef39805cc998eb73ac19a1320d Mon Sep 17 00:00:00 2001 From: Ke Wang Date: Fri, 25 Nov 2016 13:38:45 +0800 Subject: [PATCH 0565/3220] sched: tune: Fix missing spinlock initialization The spinlock used by boost_groups in sched tune must be initialized.
This commit fixes this lack and the following errors: [ 0.384739] c2 BUG: spinlock bad magic on CPU#2, swapper/2/0 [ 0.390313] c2 lock: 0xffffffc15fe1fc80, .magic:00000000, .owner: /-1, .owner_cpu: 0 [ 0.398739] c2 CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.4.6+ #4 [ 0.404816] c2 Hardware name: Spreadtrum SP9860gBoard (DT) [ 0.410462] c2 Call trace: [ 0.413159] c2 [] dump_backtrace+0x0/0x210 [ 0.418803] c2 [] show_stack+0x20/0x28 [ 0.424100] c2 [] dump_stack+0xa8/0xe0 [ 0.429398] c2 [] spin_dump+0x78/0x9c [ 0.434608] c2 [] spin_bug+0x30/0x3c [ 0.439644] c2 [] do_raw_spin_lock+0xac/0x1b4 [ 0.445639] c2 [] _raw_spin_lock_irqsave+0x58/0x68 [ 0.451977] c2 [] schedtune_enqueue_task+0x84/0x3bc [ 0.458320] c2 [] enqueue_task_fair+0x438/0x208c [ 0.464487] c2 [] activate_task+0x70/0xd0 [ 0.470130] c2 [] ttwu_do_activate.constprop.131+0x4c/0x98 [ 0.477079] c2 [] try_to_wake_up+0x254/0x54c [ 0.482899] c2 [] default_wake_function+0x30/0x3c [ 0.489154] c2 [] autoremove_wake_function+0x3c/0x6c [ 0.495754] c2 [] __wake_up_common+0x64/0xa4 [ 0.501574] c2 [] __wake_up+0x48/0x60 [ 0.506788] c2 [] rcu_gp_kthread_wake+0x50/0x5c [ 0.512866] c2 [] note_gp_changes+0xac/0xd4 [ 0.518597] c2 [] rcu_process_callbacks+0xe8/0x93c [ 0.524940] c2 [] __do_softirq+0x24c/0x5b8 [ 0.530584] c2 [] irq_exit+0xc0/0xec [ 0.535623] c2 [] __handle_domain_irq+0x94/0xf8 [ 0.541789] c2 [] gic_handle_irq+0x64/0xc0 Signed-off-by: Ke Wang --- kernel/sched/tune.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 68a24a044b0a..079b18802f17 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -725,6 +725,7 @@ schedtune_init_cgroups(void) for_each_possible_cpu(cpu) { bg = &per_cpu(cpu_boost_groups, cpu); memset(bg, 0, sizeof(struct boost_groups)); + raw_spin_lock_init(&bg->lock); } pr_info("schedtune: configured to support %d boost groups\n", From 25dcb75878cd3b685a70c688758a4d3726e92703 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Mon, 12 Sep 2016 15:51:35 -0700 Subject: [PATCH 0566/3220] build: add build server configs for goldfish Change-Id: Icd7a8d44df2b09394be5c6230c64ecb374cae236 --- build.config.goldfish.arm | 12 ++++++++++++ build.config.goldfish.arm64 | 12 ++++++++++++ build.config.goldfish.mips | 11 +++++++++++ build.config.goldfish.mips64 | 11 +++++++++++ build.config.goldfish.x86 | 12 ++++++++++++ build.config.goldfish.x86_64 | 12 ++++++++++++ 6 files changed, 70 insertions(+) create mode 100644 build.config.goldfish.arm create mode 100644 build.config.goldfish.arm64 create mode 100644 build.config.goldfish.mips create mode 100644 build.config.goldfish.mips64 create mode 100644 build.config.goldfish.x86 create mode 100644 build.config.goldfish.x86_64 diff --git a/build.config.goldfish.arm b/build.config.goldfish.arm new file mode 100644 index 000000000000..bab53668e033 --- /dev/null +++ b/build.config.goldfish.arm @@ -0,0 +1,12 @@ +ARCH=arm +BRANCH=android-4.4 +CROSS_COMPILE=arm-linux-androidkernel- +DEFCONFIG=ranchu_defconfig +EXTRA_CMDS='' +KERNEL_DIR=goldfish +LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/arm/arm-linux-androideabi-4.9/bin +FILES=" +arch/arm/boot/zImage +vmlinux +System.map +" diff --git a/build.config.goldfish.arm64 b/build.config.goldfish.arm64 new file mode 100644 index 000000000000..0b4c40604b76 --- /dev/null +++ b/build.config.goldfish.arm64 @@ -0,0 +1,12 @@ +ARCH=arm64 +BRANCH=android-4.4 +CROSS_COMPILE=aarch64-linux-android- +DEFCONFIG=ranchu64_defconfig +EXTRA_CMDS='' +KERNEL_DIR=goldfish 
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/aarch64/aarch64-linux-android-4.9/bin +FILES=" +arch/arm64/boot/Image +vmlinux +System.map +" diff --git a/build.config.goldfish.mips b/build.config.goldfish.mips new file mode 100644 index 000000000000..5dcd8a181ec0 --- /dev/null +++ b/build.config.goldfish.mips @@ -0,0 +1,11 @@ +ARCH=mips +BRANCH=android-4.4 +CROSS_COMPILE=mips64el-linux-android- +DEFCONFIG=ranchu_defconfig +EXTRA_CMDS='' +KERNEL_DIR=goldfish +LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/mips/mips64el-linux-android-4.9/bin +FILES=" +vmlinux +System.map +" diff --git a/build.config.goldfish.mips64 b/build.config.goldfish.mips64 new file mode 100644 index 000000000000..9c0b6cbfdb9b --- /dev/null +++ b/build.config.goldfish.mips64 @@ -0,0 +1,11 @@ +ARCH=mips +BRANCH=android-4.4 +CROSS_COMPILE=mips64el-linux-android- +DEFCONFIG=ranchu64_defconfig +EXTRA_CMDS='' +KERNEL_DIR=goldfish +LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/mips/mips64el-linux-android-4.9/bin +FILES=" +vmlinux +System.map +" diff --git a/build.config.goldfish.x86 b/build.config.goldfish.x86 new file mode 100644 index 000000000000..2b8a9b75a14b --- /dev/null +++ b/build.config.goldfish.x86 @@ -0,0 +1,12 @@ +ARCH=x86 +BRANCH=android-4.4 +CROSS_COMPILE=x86_64-linux-android- +DEFCONFIG=i386_ranchu_defconfig +EXTRA_CMDS='' +KERNEL_DIR=goldfish +LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin +FILES=" +arch/x86/boot/bzImage +vmlinux +System.map +" diff --git a/build.config.goldfish.x86_64 b/build.config.goldfish.x86_64 new file mode 100644 index 000000000000..940caefc800f --- /dev/null +++ b/build.config.goldfish.x86_64 @@ -0,0 +1,12 @@ +ARCH=x86_64 +BRANCH=android-4.4 +CROSS_COMPILE=x86_64-linux-android- +DEFCONFIG=x86_64_ranchu_defconfig +EXTRA_CMDS='' +KERNEL_DIR=goldfish +LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin +FILES=" +arch/x86/boot/bzImage +vmlinux +System.map +" From ace74ccf82cfb2b73ce1df2e698d20c2fbc559dd Mon Sep 17 00:00:00 2001 From: Keun-young Park Date: Mon, 14 Nov 2016 18:25:15 -0800 Subject: [PATCH 0567/3220] ANDROID: dm verity: add minimum prefetch size - For devices like eMMC, reading more hash blocks at a time gives better performance. - For Android, set it to a default of 128. For other devices, set it to 1, which is the same as now. - Saved 300ms of boot-up time on the tested device bug: 32246564 Cc: Sami Tolvanen Signed-off-by: Keun-young Park --- drivers/md/Kconfig | 16 ++++++++++++++++ drivers/md/dm-verity-target.c | 9 ++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 6035794bc1f2..3d237a03dab3 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -458,6 +458,21 @@ config DM_VERITY If unsure, say N. +config DM_VERITY_HASH_PREFETCH_MIN_SIZE_128 + bool "Prefetch size 128" + +config DM_VERITY_HASH_PREFETCH_MIN_SIZE + int "Verity hash prefetch minimum size" + depends on DM_VERITY + range 1 4096 + default 128 if DM_VERITY_HASH_PREFETCH_MIN_SIZE_128 + default 1 + ---help--- + This sets the minimum number of hash blocks to prefetch for dm-verity. + For devices like eMMC, a larger prefetch size such as 128 can improve + performance, at the cost of increased memory consumption for keeping + more hashes in RAM.
+ config DM_VERITY_FEC bool "Verity forward error correction support" depends on DM_VERITY @@ -510,6 +525,7 @@ config DM_ANDROID_VERITY depends on ASYMMETRIC_KEY_TYPE depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE depends on MD_LINEAR + select DM_VERITY_HASH_PREFETCH_MIN_SIZE_128 ---help--- This device-mapper target is virtually a VERITY target. This target is setup by reading the metadata contents piggybacked diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 9d3d4b297201..c7e97cf6e7fb 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -501,6 +501,7 @@ static void verity_prefetch_io(struct work_struct *work) container_of(work, struct dm_verity_prefetch_work, work); struct dm_verity *v = pw->v; int i; + sector_t prefetch_size; for (i = v->levels - 2; i >= 0; i--) { sector_t hash_block_start; @@ -523,8 +524,14 @@ static void verity_prefetch_io(struct work_struct *work) hash_block_end = v->hash_blocks - 1; } no_prefetch_cluster: + /* For eMMC, it is more efficient to send one bigger read. */ + prefetch_size = max((sector_t)CONFIG_DM_VERITY_HASH_PREFETCH_MIN_SIZE, + hash_block_end - hash_block_start + 1); + if ((hash_block_start + prefetch_size) >= (v->hash_start + v->hash_blocks)) { + prefetch_size = hash_block_end - hash_block_start + 1; + } dm_bufio_prefetch(v->bufio, hash_block_start, - hash_block_end - hash_block_start + 1); + prefetch_size); } kfree(pw); From 4272b1a3c3f6a25f3c92f7fcbfcd6167cfbc1ced Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Wed, 7 Dec 2016 18:11:48 -0800 Subject: [PATCH 0568/3220] build: fix build config kernel_dir Change-Id: I88b87a9c85990b12dc8174349cfc14eddfb379d2 --- build.config.goldfish.arm | 2 +- build.config.goldfish.arm64 | 2 +- build.config.goldfish.mips | 2 +- build.config.goldfish.mips64 | 2 +- build.config.goldfish.x86 | 2 +- build.config.goldfish.x86_64 | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/build.config.goldfish.arm b/build.config.goldfish.arm index bab53668e033..866da9361b71 100644 --- a/build.config.goldfish.arm +++ b/build.config.goldfish.arm @@ -3,7 +3,7 @@ BRANCH=android-4.4 CROSS_COMPILE=arm-linux-androidkernel- DEFCONFIG=ranchu_defconfig EXTRA_CMDS='' -KERNEL_DIR=goldfish +KERNEL_DIR=common LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/arm/arm-linux-androideabi-4.9/bin FILES=" arch/arm/boot/zImage diff --git a/build.config.goldfish.arm64 b/build.config.goldfish.arm64 index 0b4c40604b76..9c963cf4a3d8 100644 --- a/build.config.goldfish.arm64 +++ b/build.config.goldfish.arm64 @@ -3,7 +3,7 @@ BRANCH=android-4.4 CROSS_COMPILE=aarch64-linux-android- DEFCONFIG=ranchu64_defconfig EXTRA_CMDS='' -KERNEL_DIR=goldfish +KERNEL_DIR=common LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/aarch64/aarch64-linux-android-4.9/bin FILES=" arch/arm64/boot/Image diff --git a/build.config.goldfish.mips b/build.config.goldfish.mips index 5dcd8a181ec0..8af53d2c2940 100644 --- a/build.config.goldfish.mips +++ b/build.config.goldfish.mips @@ -3,7 +3,7 @@ BRANCH=android-4.4 CROSS_COMPILE=mips64el-linux-android- DEFCONFIG=ranchu_defconfig EXTRA_CMDS='' -KERNEL_DIR=goldfish +KERNEL_DIR=common LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/mips/mips64el-linux-android-4.9/bin FILES=" vmlinux diff --git a/build.config.goldfish.mips64 b/build.config.goldfish.mips64 index 9c0b6cbfdb9b..2a33d36dc4c8 100644 --- a/build.config.goldfish.mips64 +++ b/build.config.goldfish.mips64 @@ -3,7 +3,7 @@ BRANCH=android-4.4 CROSS_COMPILE=mips64el-linux-android-
DEFCONFIG=ranchu64_defconfig
 EXTRA_CMDS=''
-KERNEL_DIR=goldfish
+KERNEL_DIR=common
 LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/mips/mips64el-linux-android-4.9/bin
 FILES="
 vmlinux
diff --git a/build.config.goldfish.x86 b/build.config.goldfish.x86
index 2b8a9b75a14b..f86253f58d4d 100644
--- a/build.config.goldfish.x86
+++ b/build.config.goldfish.x86
@@ -3,7 +3,7 @@ BRANCH=android-4.4
 CROSS_COMPILE=x86_64-linux-android-
 DEFCONFIG=i386_ranchu_defconfig
 EXTRA_CMDS=''
-KERNEL_DIR=goldfish
+KERNEL_DIR=common
 LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin
 FILES="
 arch/x86/boot/bzImage
diff --git a/build.config.goldfish.x86_64 b/build.config.goldfish.x86_64
index 940caefc800f..e1738861ec5c 100644
--- a/build.config.goldfish.x86_64
+++ b/build.config.goldfish.x86_64
@@ -3,7 +3,7 @@ BRANCH=android-4.4
 CROSS_COMPILE=x86_64-linux-android-
 DEFCONFIG=x86_64_ranchu_defconfig
 EXTRA_CMDS=''
-KERNEL_DIR=goldfish
+KERNEL_DIR=common
 LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin
 FILES="
 arch/x86/boot/bzImage

From 3a29814dae2302eda661514805d6fba32a8e3ba0 Mon Sep 17 00:00:00 2001
From: Ke Wang
Date: Thu, 8 Dec 2016 14:02:10 +0800
Subject: [PATCH 0569/3220] sched: fix wrong truncation of walt_avg

The result of "__entry->walt_avg = (__entry->demand << 10)" will exceed
the range of "unsigned int", so the value is truncated, which makes the
trace look like the following (for example, demand 9928307 shifted left
by 10 is 10166586368, well above the 32-bit maximum of 4294967295, so
the stored value wraps to 1576651776):

UnityMain-4588 [004] 6029.645672: walt_update_history: 4588(UnityMain): runtime 9928307 samples 1 event 4 demand 9928307 walt 157 pelt 870 (hist: 9928307 9604307 8440077 87392 34144328) cpu 4
UnityMain-4588 [004] 6029.653658: walt_update_history: 4588(UnityMain): runtime 10000000 samples 1 event 4 demand 10000000 walt 165 pelt 886 (hist: 10000000 9955691 6549308 64000 34144328) cpu 4

Fix this by using a u64 type instead of an unsigned int type, which
makes the trace look like the following:

UnityMain-4617 [004] 117.613558: walt_update_history: 4617(UnityMain): runtime 5770597 samples 1 event 4 demand 7038739 walt 720 pelt 680 (hist: 5770597 7680001 8904509 65596 156) cpu 4
UnityMain-4617 [004] 117.633560: walt_update_history: 4617(UnityMain): runtime 9911238 samples 1 event 4 demand 9911238 walt 1014 pelt 769 (hist: 9911238 5770597 7680001 0 1664188058) cpu 4

Signed-off-by: Ke Wang
---
 include/trace/events/sched.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index dffaffab4bc8..c18d8c89bd12 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -1044,7 +1044,7 @@ TRACE_EVENT(walt_update_history,
		__field(	int,	samples			)
		__field(	int,	evt			)
		__field(	u64,	demand			)
-		__field(unsigned int,	walt_avg		)
+		__field(	u64,	walt_avg		)
		__field(unsigned int,	pelt_avg		)
		__array(	u32,	hist, RAVG_HIST_SIZE_MAX)
		__field(	int,	cpu			)
@@ -1066,7 +1066,7 @@ TRACE_EVENT(walt_update_history,
	),

	TP_printk("%d (%s): runtime %u samples %d event %d demand %llu"
-		" walt %u pelt %u (hist: %u %u %u %u %u) cpu %d",
+		" walt %llu pelt %u (hist: %u %u %u %u %u) cpu %d",
		__entry->pid, __entry->comm,
		__entry->runtime, __entry->samples, __entry->evt,
		__entry->demand,

From e487a24793bbf6d1ff2ba1c20575a9adabc13698 Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Tue, 6 Dec 2016 11:50:53 +0000
Subject: [PATCH 0570/3220] sched/walt: kill {min,max}_capacity

{min,max}_capacity are static variables that are only updated from
__update_min_max_capacity(), but not used anywhere else.
Remove them together with the function updating them. This also has the
nice side effect of fixing a LOCKDEP warning related to locking all CPUs
in update_min_max_capacity(), as reported by Ke Wang:

[ 2.853595] c0 =============================================
[ 2.859219] c0 [ INFO: possible recursive locking detected ]
[ 2.864852] c0 4.4.6+ #5 Tainted: G W
[ 2.869604] c0 ---------------------------------------------
[ 2.875230] c0 swapper/0/1 is trying to acquire lock:
[ 2.880248] (&rq->lock){-.-.-.}, at: [] cpufreq_notifier_policy+0x2e8/0x37c
[ 2.888815] c0
[ 2.888815] c0 but task is already holding lock:
[ 2.895132] (&rq->lock){-.-.-.}, at: [] cpufreq_notifier_policy+0x2e8/0x37c
[ 2.903700] c0
[ 2.903700] c0 other info that might help us debug this:
[ 2.910710] c0 Possible unsafe locking scenario:
[ 2.910710] c0
[ 2.917112] c0 CPU0
[ 2.919795] c0 ----
[ 2.922478] lock(&rq->lock);
[ 2.925507] lock(&rq->lock);
[ 2.928536] c0
[ 2.928536] c0 *** DEADLOCK ***
[ 2.928536] c0
[ 2.935200] c0 May be due to missing lock nesting notation
[ 2.935200] c0
[ 2.942471] c0 7 locks held by swapper/0/1:
[ 2.946623] #0: (&dev->mutex){......}, at: [] __driver_attach+0x64/0xb8
[ 2.954931] #1: (&dev->mutex){......}, at: [] __driver_attach+0x74/0xb8
[ 2.963239] #2: (cpu_hotplug.lock){++++++}, at: [] get_online_cpus+0x48/0xa8
[ 2.971979] #3: (subsys mutex#6){+.+.+.}, at: [] subsys_interface_register+0x44/0xc0
[ 2.981411] #4: (&policy->rwsem){+.+.+.}, at: [] cpufreq_online+0x330/0x76c
[ 2.990065] #5: ((cpufreq_policy_notifier_list).rwsem){.+.+..}, at: [] blocking_notifier_call_chain+0x38/0xc4
[ 3.001661] #6: (&rq->lock){-.-.-.}, at: [] cpufreq_notifier_policy+0x2e8/0x37c
[ 3.010661] c0
[ 3.010661] c0 stack backtrace:
[ 3.015514] c0 CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 4.4.6+ #5
[ 3.022864] c0 Hardware name: Spreadtrum SP9860g Board (DT)
[ 3.028402] c0 Call trace:
[ 3.031092] c0 [] dump_backtrace+0x0/0x210
[ 3.036716] c0 [] show_stack+0x20/0x28
[ 3.041994] c0 [] dump_stack+0xa8/0xe0
[ 3.047273] c0 [] __lock_acquire+0x1e0c/0x2218
[ 3.053243] c0 [] lock_acquire+0xe0/0x280
[ 3.058784] c0 [] _raw_spin_lock+0x44/0x58
[ 3.064407] c0 [] cpufreq_notifier_policy+0x2e8/0x37c
[ 3.070983] c0 [] blocking_notifier_call_chain+0x78/0xc4
[ 3.077820] c0 [] cpufreq_online+0x28c/0x76c
[ 3.083618] c0 [] cpufreq_add_dev+0x98/0xdc
[ 3.089331] c0 [] subsys_interface_register+0x84/0xc0
[ 3.095907] c0 [] cpufreq_register_driver+0x168/0x28c
[ 3.102486] c0 [] sprd_cpufreq_probe+0x134/0x19c
[ 3.108629] c0 [] platform_drv_probe+0x58/0xd0
[ 3.114599] c0 [] driver_probe_device+0x1e8/0x470
[ 3.120830] c0 [] __driver_attach+0xb4/0xb8
[ 3.126541] c0 [] bus_for_each_dev+0x6c/0xac
[ 3.132339] c0 [] driver_attach+0x2c/0x34
[ 3.137877] c0 [] bus_add_driver+0x210/0x298
[ 3.143676] c0 [] driver_register+0x7c/0x114
[ 3.149476] c0 [] __platform_driver_register+0x60/0x6c
[ 3.156139] c0 [] sprd_cpufreq_platdrv_init+0x18/0x20
[ 3.162714] c0 [] do_one_initcall+0xd0/0x1d8
[ 3.168514] c0 [] kernel_init_freeable+0x1fc/0x29c
[ 3.174834] c0 [] kernel_init+0x20/0x12c
[ 3.180281] c0 [] ret_from_fork+0x10/0x40

Reported-by: Ke Wang
Signed-off-by: Juri Lelli
---
 kernel/sched/walt.c | 45 +--------------------------------------------
 1 file changed, 1 insertion(+), 44 deletions(-)

diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
index 2ffb1680b380..6e053bd9830c 100644
--- a/kernel/sched/walt.c
+++ b/kernel/sched/walt.c
@@ -62,8 +62,6 @@ static unsigned int max_possible_freq = 1;
 */
 static unsigned int min_max_freq = 1;

-static unsigned int max_capacity = 1024;
-static unsigned int min_capacity = 1024; static unsigned int max_load_scale_factor = 1024; static unsigned int max_possible_capacity = 1024; @@ -869,39 +867,6 @@ void walt_fixup_busy_time(struct task_struct *p, int new_cpu) double_rq_unlock(src_rq, dest_rq); } -/* Keep track of max/min capacity possible across CPUs "currently" */ -static void __update_min_max_capacity(void) -{ - int i; - int max = 0, min = INT_MAX; - - for_each_online_cpu(i) { - if (cpu_rq(i)->capacity > max) - max = cpu_rq(i)->capacity; - if (cpu_rq(i)->capacity < min) - min = cpu_rq(i)->capacity; - } - - max_capacity = max; - min_capacity = min; -} - -static void update_min_max_capacity(void) -{ - unsigned long flags; - int i; - - local_irq_save(flags); - for_each_possible_cpu(i) - raw_spin_lock(&cpu_rq(i)->lock); - - __update_min_max_capacity(); - - for_each_possible_cpu(i) - raw_spin_unlock(&cpu_rq(i)->lock); - local_irq_restore(flags); -} - /* * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that * least efficient cpu gets capacity of 1024 @@ -984,15 +949,9 @@ static int cpufreq_notifier_policy(struct notifier_block *nb, /* Initialized to policy->max in case policy->related_cpus is empty! */ unsigned int orig_max_freq = policy->max; - if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && - val != CPUFREQ_CREATE_POLICY) + if (val != CPUFREQ_NOTIFY) return 0; - if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { - update_min_max_capacity(); - return 0; - } - for_each_cpu(i, policy->related_cpus) { cpumask_copy(&cpu_rq(i)->freq_domain_cpumask, policy->related_cpus); @@ -1082,8 +1041,6 @@ static int cpufreq_notifier_policy(struct notifier_block *nb, max_load_scale_factor = highest_mplsf; } - __update_min_max_capacity(); - return 0; } From 3313d27976a9f1fb3aa96487fdf11724a7f08cb3 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Thu, 8 Dec 2016 17:06:03 -0800 Subject: [PATCH 0571/3220] goldfish: enable CONFIG_INET_DIAG_DESTROY Bug: 31648368 Change-Id: I3715cc6474129ba2176be62ed2c0a7d09a6f2ac7 --- arch/arm/configs/ranchu_defconfig | 1 + arch/arm64/configs/ranchu64_defconfig | 1 + arch/x86/configs/i386_ranchu_defconfig | 2 +- arch/x86/configs/x86_64_ranchu_defconfig | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm/configs/ranchu_defconfig b/arch/arm/configs/ranchu_defconfig index 35a90af941a4..49e7bbd5825a 100644 --- a/arch/arm/configs/ranchu_defconfig +++ b/arch/arm/configs/ranchu_defconfig @@ -48,6 +48,7 @@ CONFIG_UNIX=y CONFIG_XFRM_USER=y CONFIG_NET_KEY=y CONFIG_INET=y +CONFIG_INET_DIAG_DESTROY=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MULTIPLE_TABLES=y diff --git a/arch/arm64/configs/ranchu64_defconfig b/arch/arm64/configs/ranchu64_defconfig index 00eb346e0928..fc55008d8c4c 100644 --- a/arch/arm64/configs/ranchu64_defconfig +++ b/arch/arm64/configs/ranchu64_defconfig @@ -50,6 +50,7 @@ CONFIG_UNIX=y CONFIG_XFRM_USER=y CONFIG_NET_KEY=y CONFIG_INET=y +CONFIG_INET_DIAG_DESTROY=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MULTIPLE_TABLES=y diff --git a/arch/x86/configs/i386_ranchu_defconfig b/arch/x86/configs/i386_ranchu_defconfig index 0206eb8cfb61..65ed8c8f8444 100644 --- a/arch/x86/configs/i386_ranchu_defconfig +++ b/arch/x86/configs/i386_ranchu_defconfig @@ -89,7 +89,7 @@ CONFIG_SYN_COOKIES=y CONFIG_INET_ESP=y # CONFIG_INET_XFRM_MODE_BEET is not set # CONFIG_INET_LRO is not set -# CONFIG_INET_DIAG is not set +CONFIG_INET_DIAG_DESTROY=y CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y CONFIG_IPV6_OPTIMISTIC_DAD=y 
diff --git a/arch/x86/configs/x86_64_ranchu_defconfig b/arch/x86/configs/x86_64_ranchu_defconfig index dd389774bacb..d977bd91e390 100644 --- a/arch/x86/configs/x86_64_ranchu_defconfig +++ b/arch/x86/configs/x86_64_ranchu_defconfig @@ -87,7 +87,7 @@ CONFIG_SYN_COOKIES=y CONFIG_INET_ESP=y # CONFIG_INET_XFRM_MODE_BEET is not set # CONFIG_INET_LRO is not set -# CONFIG_INET_DIAG is not set +CONFIG_INET_DIAG_DESTROY=y CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y CONFIG_IPV6_OPTIMISTIC_DAD=y From fde8582a59f5b6968d81d770d1fb22d3487392a3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 3 Mar 2016 15:10:59 +0100 Subject: [PATCH 0572/3220] UPSTREAM: arm64: enable CONFIG_DEBUG_RODATA by default (Cherry picked from commit 57efac2f7108e3255d0dfe512290c9896f4ed55f) In spite of its name, CONFIG_DEBUG_RODATA is an important hardening feature for production kernels, and distros all enable it by default in their kernel configs. However, since enabling it used to result in more granular, and thus less efficient kernel mappings, it is not enabled by default for performance reasons. However, since commit 2f39b5f91eb4 ("arm64: mm: Mark .rodata as RO"), the various kernel segments (.text, .rodata, .init and .data) are already mapped individually, and the only effect of setting CONFIG_DEBUG_RODATA is that the existing .text and .rodata mappings are updated late in the boot sequence to have their read-only attributes set, which means that any performance concerns related to enabling CONFIG_DEBUG_RODATA are no longer valid. So from now on, make CONFIG_DEBUG_RODATA default to 'y' Signed-off-by: Ard Biesheuvel Acked-by: Mark Rutland Acked-by: Kees Cook Signed-off-by: Catalin Marinas Signed-off-by: Amit Pundir --- arch/arm64/Kconfig.debug | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug index 04fb73b973f1..8b0cd45de394 100644 --- a/arch/arm64/Kconfig.debug +++ b/arch/arm64/Kconfig.debug @@ -64,13 +64,13 @@ config DEBUG_SET_MODULE_RONX config DEBUG_RODATA bool "Make kernel text and rodata read-only" + default y help If this is set, kernel text and rodata will be made read-only. This is to help catch accidental or malicious attempts to change the - kernel's executable code. Additionally splits rodata from kernel - text so it can be made explicitly non-executable. + kernel's executable code. - If in doubt, say Y + If in doubt, say Y config DEBUG_ALIGN_RODATA depends on DEBUG_RODATA && ARM64_4K_PAGES From b571c4f0cf7e6dbcb2a51fbceefcf27518003b47 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 5 May 2016 10:43:59 +0100 Subject: [PATCH 0573/3220] UPSTREAM: arm64: Fix typo in the pmdp_huge_get_and_clear() definition (Cherry picked from commit 911f56eeb87ee378f5e215469268a7a2f68a5a8a) With hardware AF/DBM support, pmd modifications (transparent huge pages) should be performed atomically using load/store exclusive. The initial patches defined the get-and-clear function and __HAVE_ARCH_* macro without the "huge" word, leaving the pmdp_huge_get_and_clear() to the default, non-atomic implementation. 
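For context, a sketch of the generic fallback that ends up being used when an
architecture does not define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR (this mirrors
include/asm-generic/pgtable.h around v4.4 and is illustrative, not part of this
patch):

#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
					    unsigned long address,
					    pmd_t *pmdp)
{
	/* Plain load followed by a plain store: a hardware AF/DBM
	 * update that lands between the two steps is silently lost.
	 */
	pmd_t pmd = *pmdp;

	pmd_clear(pmdp);
	return pmd;
}
#endif

With the macro spelled correctly, the arm64 load/store-exclusive implementation
in the hunk below is selected instead, making the read-and-clear atomic with
respect to concurrent hardware updates.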
Fixes: 2f4b829c625e ("arm64: Add support for hardware updates of the access and dirty pte bits") Cc: # 4.3+ Reviewed-by: Will Deacon Signed-off-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Amit Pundir --- arch/arm64/include/asm/pgtable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 40017aa2fcbd..b420eee6026a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -652,9 +652,9 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, - unsigned long address, pmd_t *pmdp) +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) { return pte_pmd(ptep_get_and_clear(mm, address, (pte_t *)pmdp)); } From f82be531155c6ce5890df4ffe6fad9b1fa2eafac Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 5 May 2016 10:44:00 +0100 Subject: [PATCH 0574/3220] UPSTREAM: arm64: Implement pmdp_set_access_flags() for hardware AF/DBM (Cherry picked from commit 282aa7051b0169991b34716f0f22d9c2f59c46c4) The update to the accessed or dirty states for block mappings must be done atomically on hardware with support for automatic AF/DBM. The ptep_set_access_flags() function has been fixed as part of commit 66dbd6e61a52 ("arm64: Implement ptep_set_access_flags() for hardware AF/DBM"). This patch brings pmdp_set_access_flags() in line with the pte counterpart. Fixes: 2f4b829c625e ("arm64: Add support for hardware updates of the access and dirty pte bits") Cc: # 4.4.x: 66dbd6e61a52: arm64: Implement ptep_set_access_flags() for hardware AF/DBM Cc: # 4.3+ Reviewed-by: Will Deacon Signed-off-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Amit Pundir --- arch/arm64/include/asm/pgtable.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b420eee6026a..22dbef8a677b 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -600,6 +600,16 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +static inline int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty); +} +#endif + /* * Atomic pte/pmd modifications. */ From 18f41ad6976a772f283b6563963957b85d833e5b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 7 Jun 2016 17:55:15 +0100 Subject: [PATCH 0575/3220] UPSTREAM: arm64: mm: always take dirty state from new pte in ptep_set_access_flags (Cherry picked from commit 0106d456c4cb1770253fefc0ab23c9ca760b43f7) Commit 66dbd6e61a52 ("arm64: Implement ptep_set_access_flags() for hardware AF/DBM") ensured that pte flags are updated atomically in the face of potential concurrent, hardware-assisted updates. However, Alex reports that: | This patch breaks swapping for me. | In the broken case, you'll see either systemd cpu time spike (because | it's stuck in a page fault loop) or the system hang (because the | application owning the screen is stuck in a page fault loop). 
It turns out that this is because the 'dirty' argument to ptep_set_access_flags is always 0 for read faults, and so we can't use it to set PTE_RDONLY. The failing sequence is: 1. We put down a PTE_WRITE | PTE_DIRTY | PTE_AF pte 2. Memory pressure -> pte_mkold(pte) -> clear PTE_AF 3. A read faults due to the missing access flag 4. ptep_set_access_flags is called with dirty = 0, due to the read fault 5. pte is then made PTE_WRITE | PTE_DIRTY | PTE_AF | PTE_RDONLY (!) 6. A write faults, but pte_write is true so we get stuck The solution is to check the new page table entry (as would be done by the generic, non-atomic definition of ptep_set_access_flags that just calls set_pte_at) to establish the dirty state. Cc: # 4.3+ Fixes: 66dbd6e61a52 ("arm64: Implement ptep_set_access_flags() for hardware AF/DBM") Reviewed-by: Catalin Marinas Reported-by: Alexander Graf Tested-by: Alexander Graf Signed-off-by: Will Deacon Fixes: Change-Id: Id2a0b0d8eb6e7df6325ecb48b88b8401a5dd09e5 ("UPSTREAM: arm64: Implement ptep_set_access_flags() for hardware AF/DBM") Signed-off-by: Amit Pundir --- arch/arm64/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index f0c75fd6a3fa..9cedb10b1107 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -109,7 +109,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, * PTE_RDONLY is cleared by default in the asm below, so set it in * back if necessary (read-only or clean PTE). */ - if (!pte_write(entry) || !dirty) + if (!pte_write(entry) || !pte_sw_dirty(entry)) pte_val(entry) |= PTE_RDONLY; /* From 61f26de8c058833bb29aa4641717fc5b873724b5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 09:46:23 +0200 Subject: [PATCH 0576/3220] UPSTREAM: efi/arm64: Don't apply MEMBLOCK_NOMAP to UEFI memory map mapping (Cherry picked from commit 7cc8cbcf82d165dd658d89a7a287140948e76413) Commit 4dffbfc48d65 ("arm64/efi: mark UEFI reserved regions as MEMBLOCK_NOMAP") updated the mapping logic of both the RuntimeServices regions as well as the kernel's copy of the UEFI memory map to set the MEMBLOCK_NOMAP flag, which causes these regions to be omitted from the kernel direct mapping, and from being covered by a struct page. For the RuntimeServices regions, this is an obvious win, since the contents of these regions have significance to the firmware executable code itself, and are mapped in the EFI page tables using attributes that are described in the UEFI memory map, and which may differ from the attributes we use for mapping system RAM. It also prevents the contents from being modified inadvertently, since the EFI page tables are only live during runtime service invocations. None of these concerns apply to the allocation that covers the UEFI memory map, since it is entirely owned by the kernel. Setting the MEMBLOCK_NOMAP on the region did allow us to use ioremap_cache() to map it both on arm64 and on ARM, since the latter does not allow ioremap_cache() to be used on regions that are covered by a struct page. The ioremap_cache() on ARM restriction will be lifted in the v4.7 timeframe, but in the mean time, it has been reported that commit 4dffbfc48d65 causes a regression on 64k granule kernels. 
This is due to the fact that, given the 64 KB page size, the region that we end up removing from the kernel direct mapping is rounded up to 64 KB, and this 64 KB page frame may be shared with the initrd when booting via GRUB (which does not align its EFI_LOADER_DATA allocations to 64 KB like the stub does). This will crash the kernel as soon as it tries to access the initrd. Since the issue is specific to arm64, revert back to memblock_reserve()'ing the UEFI memory map when running on arm64. This is a temporary fix for v4.5 and v4.6, and will be superseded in the v4.7 timeframe when we will be able to move back to memblock_reserve() unconditionally. Fixes: 4dffbfc48d65 ("arm64/efi: mark UEFI reserved regions as MEMBLOCK_NOMAP") Reported-by: Mark Salter Signed-off-by: Ard Biesheuvel Acked-by: Will Deacon Cc: Leif Lindholm Cc: Mark Rutland Cc: Jeremy Linton Cc: Mark Langsdorf Cc: # v4.5 Signed-off-by: Matt Fleming Fixes: Change-Id: Ia3ce78f40f8d41a9afdd42238fe9cbfd81bbff08 ("UPSTREAM: arm64/efi: mark UEFI reserved regions as MEMBLOCK_NOMAP") Signed-off-by: Amit Pundir --- drivers/firmware/efi/arm-init.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index 9e15d571b53c..a76c35fc0b92 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -203,7 +203,19 @@ void __init efi_init(void) reserve_regions(); early_memunmap(memmap.map, params.mmap_size); - memblock_mark_nomap(params.mmap & PAGE_MASK, - PAGE_ALIGN(params.mmap_size + - (params.mmap & ~PAGE_MASK))); + + if (IS_ENABLED(CONFIG_ARM)) { + /* + * ARM currently does not allow ioremap_cache() to be called on + * memory regions that are covered by struct page. So remove the + * UEFI memory map from the linear mapping. + */ + memblock_mark_nomap(params.mmap & PAGE_MASK, + PAGE_ALIGN(params.mmap_size + + (params.mmap & ~PAGE_MASK))); + } else { + memblock_reserve(params.mmap & PAGE_MASK, + PAGE_ALIGN(params.mmap_size + + (params.mmap & ~PAGE_MASK))); + } } From 9789b697c6e4db733b4afe9572cd8e053c63e943 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Mon, 12 Dec 2016 11:41:11 +0900 Subject: [PATCH 0577/3220] Revert "net: core: Support UID-based routing." This reverts commit fd2cf795f3ab193752781be7372949ac1780d0ed. 
Bug: 16355602 Change-Id: I1ec2d1eb3d53f4186b60c6ca5d6a20fcca46d442 --- include/net/fib_rules.h | 4 --- include/net/flow.h | 9 +----- include/net/ip.h | 1 - include/net/ip6_route.h | 2 +- include/net/route.h | 6 ++-- include/uapi/linux/fib_rules.h | 2 -- include/uapi/linux/rtnetlink.h | 1 - net/core/fib_rules.c | 53 ++------------------------------ net/ipv4/fib_frontend.c | 1 - net/ipv4/inet_connection_sock.c | 4 +-- net/ipv4/ip_output.c | 3 +- net/ipv4/ping.c | 3 +- net/ipv4/raw.c | 3 +- net/ipv4/route.c | 25 ++++----------- net/ipv4/syncookies.c | 5 ++- net/ipv4/udp.c | 3 +- net/ipv6/af_inet6.c | 1 - net/ipv6/ah6.c | 2 +- net/ipv6/datagram.c | 1 - net/ipv6/esp6.c | 2 +- net/ipv6/icmp.c | 2 +- net/ipv6/inet6_connection_sock.c | 2 -- net/ipv6/ip6_vti.c | 2 +- net/ipv6/ipcomp6.c | 2 +- net/ipv6/ping.c | 1 - net/ipv6/raw.c | 1 - net/ipv6/route.c | 11 ++----- net/ipv6/syncookies.c | 1 - net/ipv6/tcp_ipv6.c | 1 - net/ipv6/udp.c | 1 - 30 files changed, 27 insertions(+), 128 deletions(-) diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index bdd985f41022..59160de702b6 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -29,8 +29,6 @@ struct fib_rule { int suppress_prefixlen; char iifname[IFNAMSIZ]; char oifname[IFNAMSIZ]; - kuid_t uid_start; - kuid_t uid_end; struct rcu_head rcu; }; @@ -89,8 +87,6 @@ struct fib_rules_ops { [FRA_FWMARK] = { .type = NLA_U32 }, \ [FRA_FWMASK] = { .type = NLA_U32 }, \ [FRA_TABLE] = { .type = NLA_U32 }, \ - [FRA_UID_START] = { .type = NLA_U32 }, \ - [FRA_UID_END] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ [FRA_GOTO] = { .type = NLA_U32 } diff --git a/include/net/flow.h b/include/net/flow.h index 833080732dec..83969eebebf3 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -11,7 +11,6 @@ #include #include #include -#include /* * ifindex generation is per-net namespace, and loopback is @@ -39,7 +38,6 @@ struct flowi_common { #define FLOWI_FLAG_SKIP_NH_OIF 0x08 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; - kuid_t flowic_uid; }; union flowi_uli { @@ -77,7 +75,6 @@ struct flowi4 { #define flowi4_flags __fl_common.flowic_flags #define flowi4_secid __fl_common.flowic_secid #define flowi4_tun_key __fl_common.flowic_tun_key -#define flowi4_uid __fl_common.flowic_uid /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; @@ -97,8 +94,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, __u32 mark, __u8 tos, __u8 scope, __u8 proto, __u8 flags, __be32 daddr, __be32 saddr, - __be16 dport, __be16 sport, - kuid_t uid) + __be16 dport, __be16 sport) { fl4->flowi4_oif = oif; fl4->flowi4_iif = LOOPBACK_IFINDEX; @@ -109,7 +105,6 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; fl4->flowi4_tun_key.tun_id = 0; - fl4->flowi4_uid = uid; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; @@ -138,7 +133,6 @@ struct flowi6 { #define flowi6_flags __fl_common.flowic_flags #define flowi6_secid __fl_common.flowic_secid #define flowi6_tun_key __fl_common.flowic_tun_key -#define flowi6_uid __fl_common.flowic_uid struct in6_addr daddr; struct in6_addr saddr; __be32 flowlabel; @@ -183,7 +177,6 @@ struct flowi { #define flowi_flags u.__fl_common.flowic_flags #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key -#define flowi_uid u.__fl_common.flowic_uid } __attribute__((__aligned__(BITS_PER_LONG/8))); 
static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) diff --git a/include/net/ip.h b/include/net/ip.h index 4f3ef345f4c2..1a98f1ca1638 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -170,7 +170,6 @@ struct ip_reply_arg { /* -1 if not needed */ int bound_dev_if; u8 tos; - kuid_t uid; }; #define IP_REPLY_ARG_NOSRCCHECK 1 diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 4bbd221637cd..877f682989b8 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -108,7 +108,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr); void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, - u32 mark, kuid_t uid); + u32 mark); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu); void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark); void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, diff --git a/include/net/route.h b/include/net/route.h index d016a8cb45cf..a3b9ef74a389 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -154,8 +154,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos, RT_SCOPE_UNIVERSE, proto, sk ? inet_sk_flowi_flags(sk) : 0, - daddr, saddr, dport, sport, - sk ? sock_i_uid(sk) : GLOBAL_ROOT_UID); + daddr, saddr, dport, sport); if (sk) security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); return ip_route_output_flow(net, fl4, sk); @@ -268,8 +267,7 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 flow_flags |= FLOWI_FLAG_ANYSRC; flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, - protocol, flow_flags, dst, src, dport, sport, - sock_i_uid(sk)); + protocol, flow_flags, dst, src, dport, sport); } static inline struct rtable *ip_route_connect(struct flowi4 *fl4, diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index ce19c5bf51f7..96161b8202b5 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -49,8 +49,6 @@ enum { FRA_TABLE, /* Extended table id */ FRA_FWMASK, /* mask for netfilter mark */ FRA_OIFNAME, - FRA_UID_START, /* UID range */ - FRA_UID_END, __FRA_MAX }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 3eb02a1d6d8c..123a5af4e8bb 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -306,7 +306,6 @@ enum rtattr_type_t { RTA_TABLE, RTA_MARK, RTA_MFC_STATS, - RTA_UID, RTA_VIA, RTA_NEWDST, RTA_PREF, diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 3fbd839f6d20..365de66436ac 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -33,8 +33,6 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->table = table; r->flags = flags; r->fr_net = ops->fro_net; - r->uid_start = INVALID_UID; - r->uid_end = INVALID_UID; r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; @@ -174,23 +172,6 @@ void fib_rules_unregister(struct fib_rules_ops *ops) } EXPORT_SYMBOL_GPL(fib_rules_unregister); -static inline kuid_t fib_nl_uid(struct nlattr *nla) -{ - return make_kuid(current_user_ns(), nla_get_u32(nla)); -} - -static int nla_put_uid(struct sk_buff *skb, int idx, kuid_t uid) -{ - return nla_put_u32(skb, idx, from_kuid_munged(current_user_ns(), uid)); -} - -static int fib_uid_range_match(struct flowi *fl, struct fib_rule *rule) -{ - return (!uid_valid(rule->uid_start) && !uid_valid(rule->uid_end)) || - (uid_gte(fl->flowi_uid, 
rule->uid_start) && - uid_lte(fl->flowi_uid, rule->uid_end)); -} - static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags) { @@ -208,9 +189,6 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) goto out; - if (!fib_uid_range_match(fl, rule)) - goto out; - ret = ops->match(rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; @@ -393,19 +371,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) } else if (rule->action == FR_ACT_GOTO) goto errout_free; - /* UID start and end must either both be valid or both unspecified. */ - rule->uid_start = rule->uid_end = INVALID_UID; - if (tb[FRA_UID_START] || tb[FRA_UID_END]) { - if (tb[FRA_UID_START] && tb[FRA_UID_END]) { - rule->uid_start = fib_nl_uid(tb[FRA_UID_START]); - rule->uid_end = fib_nl_uid(tb[FRA_UID_END]); - } - if (!uid_valid(rule->uid_start) || - !uid_valid(rule->uid_end) || - !uid_lte(rule->uid_start, rule->uid_end)) - goto errout_free; - } - err = ops->configure(rule, skb, frh, tb); if (err < 0) goto errout_free; @@ -518,14 +483,6 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID]))) continue; - if (tb[FRA_UID_START] && - !uid_eq(rule->uid_start, fib_nl_uid(tb[FRA_UID_START]))) - continue; - - if (tb[FRA_UID_END] && - !uid_eq(rule->uid_end, fib_nl_uid(tb[FRA_UID_END]))) - continue; - if (!ops->compare(rule, frh, tb)) continue; @@ -592,9 +549,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ - + nla_total_size(8) /* FRA_TUN_ID */ - + nla_total_size(4) /* FRA_UID_START */ - + nla_total_size(4); /* FRA_UID_END */ + + nla_total_size(8); /* FRA_TUN_ID */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -652,11 +607,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target)) || (rule->tun_id && - nla_put_be64(skb, FRA_TUN_ID, rule->tun_id)) || - (uid_valid(rule->uid_start) && - nla_put_uid(skb, FRA_UID_START, rule->uid_start)) || - (uid_valid(rule->uid_end) && - nla_put_uid(skb, FRA_UID_END, rule->uid_end))) + nla_put_be64(skb, FRA_TUN_ID, rule->tun_id))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index e10edb5e78b0..473447593060 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -629,7 +629,6 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_FLOW] = { .type = NLA_U32 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, - [RTA_UID] = { .type = NLA_U32 }, }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 728414dcea3b..46b9c887bede 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -420,7 +420,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? 
opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, - htons(ireq->ir_num), sock_i_uid((struct sock *)sk)); + htons(ireq->ir_num)); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) @@ -457,7 +457,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, - htons(ireq->ir_num), sock_i_uid((struct sock *)sk)); + htons(ireq->ir_num)); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 33bef2763c72..4233cbe47052 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1573,8 +1573,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), daddr, saddr, - tcp_hdr(skb)->source, tcp_hdr(skb)->dest, - arg->uid); + tcp_hdr(skb)->source, tcp_hdr(skb)->dest); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index b27e98010dea..e89094ab5ddb 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -789,8 +789,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, - sock_i_uid(sk)); + inet_sk_flowi_flags(sk), faddr, saddr, 0, 0); security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 865895d3fb27..bc35f1842512 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -599,8 +599,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk) | (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), - daddr, saddr, 0, 0, - sock_i_uid(sk)); + daddr, saddr, 0, 0); if (!saddr && ipc.oif) { err = l3mdev_get_saddr(net, ipc.oif, &fl4); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a0d842f4e9cf..85f184e429c6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -500,7 +500,7 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) } EXPORT_SYMBOL(__ip_select_ident); -static void __build_flow_key(struct flowi4 *fl4, struct sock *sk, +static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, const struct iphdr *iph, int oif, u8 tos, u8 prot, u32 mark, int flow_flags) @@ -516,12 +516,11 @@ static void __build_flow_key(struct flowi4 *fl4, struct sock *sk, flowi4_init_output(fl4, oif, mark, tos, RT_SCOPE_UNIVERSE, prot, flow_flags, - iph->daddr, iph->saddr, 0, 0, - sk ? 
sock_i_uid(sk) : GLOBAL_ROOT_UID); + iph->daddr, iph->saddr, 0, 0); } static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, - struct sock *sk) + const struct sock *sk) { const struct iphdr *iph = ip_hdr(skb); int oif = skb->dev->ifindex; @@ -532,7 +531,7 @@ static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); } -static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk) +static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); const struct ip_options_rcu *inet_opt; @@ -546,12 +545,11 @@ static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk) RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk), - daddr, inet->inet_saddr, 0, 0, - sock_i_uid(sk)); + daddr, inet->inet_saddr, 0, 0); rcu_read_unlock(); } -static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk, +static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, const struct sk_buff *skb) { if (skb) @@ -2424,11 +2422,6 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; - if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && - nla_put_u32(skb, RTA_UID, - from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) - goto nla_put_failure; - error = rt->dst.error; if (rt_is_input_route(rt)) { @@ -2480,7 +2473,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) int mark; struct sk_buff *skb; u32 table_id = RT_TABLE_MAIN; - kuid_t uid; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); if (err < 0) @@ -2508,10 +2500,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; - if (tb[RTA_UID]) - uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); - else - uid = (iif ? INVALID_UID : current_uid()); memset(&fl4, 0, sizeof(fl4)); fl4.daddr = dst; @@ -2519,7 +2507,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) fl4.flowi4_tos = rtm->rtm_tos; fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; - fl4.flowi4_uid = uid; if (netif_index_is_l3_master(net, fl4.flowi4_oif)) fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 31b6a4c9db32..4cbe9f0a4281 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -374,9 +374,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), - (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr, - ireq->ir_loc_addr, th->source, th->dest, - sock_i_uid(sk)); + opt->srr ? 
opt->faddr : ireq->ir_rmt_addr, + ireq->ir_loc_addr, th->source, th->dest); security_req_classify_flow(req, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); if (IS_ERR(rt)) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 660933edd2d2..f8b3842b9070 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1023,8 +1023,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, flow_flags, - faddr, saddr, dport, inet->inet_sport, - sock_i_uid(sk)); + faddr, saddr, dport, inet->inet_sport); if (!saddr && ipc.oif) { err = l3mdev_get_saddr(net, ipc.oif, fl4); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d9b25bd17bf1..9dbfacb6e0d9 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -678,7 +678,6 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; - fl6.flowi6_uid = sock_i_uid(sk); security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); rcu_read_lock(); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index c52b8fc904c9..0630a4d5daaa 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -664,7 +664,7 @@ static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0); else - ip6_update_pmtu(skb, net, info, 0, 0, INVALID_UID); + ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); return 0; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 0743a5f4c533..517c55b01ba8 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -161,7 +161,6 @@ ipv4_connected: fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; - fl6.flowi6_uid = sock_i_uid(sk); if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST)) fl6.flowi6_oif = np->mcast_oif; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index f921368c32c9..060a60b2f8a6 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -476,7 +476,7 @@ static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0); else - ip6_update_pmtu(skb, net, info, 0, 0, INVALID_UID); + ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); return 0; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 41e5c9520c7d..3697cd08c515 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -92,7 +92,7 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct net *net = dev_net(skb->dev); if (type == ICMPV6_PKT_TOOBIG) - ip6_update_pmtu(skb, net, info, 0, 0, INVALID_UID); + ip6_update_pmtu(skb, net, info, 0, 0); else if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 897bb6eb5751..a7ca2cde2ecb 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -86,7 +86,6 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, fl6->flowi6_mark = ireq->ir_mark; fl6->fl6_dport = ireq->ir_rmt_port; fl6->fl6_sport = htons(ireq->ir_num); - fl6->flowi6_uid = sock_i_uid((struct sock *)sk); security_req_classify_flow(req, flowi6_to_flowi(fl6)); dst = ip6_dst_lookup_flow(sk, fl6, final_p); @@ -135,7 +134,6 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, fl6->flowi6_mark = sk->sk_mark; fl6->fl6_sport = inet->inet_sport; fl6->fl6_dport = inet->inet_dport; - fl6->flowi6_uid = 
sock_i_uid(sk); security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); rcu_read_lock(); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index c76ebc7fc52d..0a8610b33d79 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -599,7 +599,7 @@ static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0); else - ip6_update_pmtu(skb, net, info, 0, 0, INVALID_UID); + ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); return 0; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index b247baceb797..1b9316e1386a 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -76,7 +76,7 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0); else - ip6_update_pmtu(skb, net, info, 0, 0, INVALID_UID); + ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); return 0; diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 9411c8d770a5..a5cf82ccf406 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -141,7 +141,6 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.daddr = *daddr; fl6.flowi6_oif = oif; fl6.flowi6_mark = sk->sk_mark; - fl6.flowi6_uid = sock_i_uid(sk); fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index d9ad71a01b4c..99140986e887 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -768,7 +768,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = sk->sk_mark; - fl6.flowi6_uid = sock_i_uid(sk); if (sin6) { if (addr_len < SIN6_LEN_RFC2133) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 02ba70201e05..60359bea6a16 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1383,7 +1383,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, } void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, - int oif, u32 mark, kuid_t uid) + int oif, u32 mark) { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; @@ -1395,7 +1395,6 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); - fl6.flowi6_uid = uid; dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) @@ -1407,7 +1406,7 @@ EXPORT_SYMBOL_GPL(ip6_update_pmtu); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) { ip6_update_pmtu(skb, sock_net(sk), mtu, - sk->sk_bound_dev_if, sk->sk_mark, sock_i_uid(sk)); + sk->sk_bound_dev_if, sk->sk_mark); } EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); @@ -2686,7 +2685,6 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_PREF] = { .type = NLA_U8 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, - [RTA_UID] = { .type = NLA_U32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -3248,11 +3246,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) if (tb[RTA_MARK]) fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); - if (tb[RTA_UID]) - fl6.flowi6_uid = make_kuid(current_user_ns(), - nla_get_u32(tb[RTA_UID])); - else - fl6.flowi6_uid = iif ? 
INVALID_UID : current_uid(); if (iif) { struct net_device *dev; int flags = 0; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index a22015fab95e..eaf7ac496d50 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -228,7 +228,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) fl6.flowi6_mark = ireq->ir_mark; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; - fl6.flowi6_uid = sock_i_uid(sk); security_req_classify_flow(req, flowi6_to_flowi(&fl6)); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e6b044480333..f6eb65a5d343 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -234,7 +234,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; - fl6.flowi6_uid = sock_i_uid(sk); opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); final_p = fl6_update_dst(&fl6, opt, &final); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 54235ef177bb..fcce7b4ba8ff 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1243,7 +1243,6 @@ do_udp_sendmsg: fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; fl6.flowi6_mark = sk->sk_mark; - fl6.flowi6_uid = sock_i_uid(sk); if (msg->msg_controllen) { opt = &opt_space; From eb964bdba79aee0f244efef0730d9d022ccc9ac8 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 4 Nov 2016 02:23:41 +0900 Subject: [PATCH 0578/3220] net: core: Add a UID field to struct sock. Protocol sockets (struct sock) don't have UIDs, but most of the time, they map 1:1 to userspace sockets (struct socket) which do. Various operations such as the iptables xt_owner match need access to the "UID of a socket", and do so by following the backpointer to the struct socket. This involves taking sk_callback_lock and doesn't work when there is no socket because userspace has already called close(). Simplify this by adding a sk_uid field to struct sock whose value matches the UID of the corresponding struct socket. The semantics are as follows: 1. Whenever sk_socket is non-null: sk_uid is the same as the UID in sk_socket, i.e., matches the return value of sock_i_uid. Specifically, the UID is set when userspace calls socket(), fchown(), or accept(). 2. When sk_socket is NULL, sk_uid is defined as follows: - For a socket that no longer has a sk_socket because userspace has called close(): the previous UID. - For a cloned socket (e.g., an incoming connection that is established but on which userspace has not yet called accept): the UID of the socket it was cloned from. - For a socket that has never had an sk_socket: UID 0 inside the user namespace corresponding to the network namespace the socket belongs to. Kernel sockets created by sock_create_kern are a special case of #1 and sk_uid is the user that created them. For kernel sockets created at network namespace creation time, such as the per-processor ICMP and TCP sockets, this is the user that created the network namespace. Bug: 16355602 Change-Id: Idbc3e9a0cec91c4c6e01916b967b6237645ebe59 Signed-off-by: Lorenzo Colitti Signed-off-by: David S. 
Miller
---
 include/net/sock.h |  7 +++++++
 net/core/sock.c    |  5 ++++-
 net/socket.c       | 14 ++++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 2d663ee8494d..5d8e0049b71c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -446,6 +446,7 @@ struct sock {
	void		*sk_security;
 #endif
	__u32		sk_mark;
+	kuid_t		sk_uid;
 #ifdef CONFIG_CGROUP_NET_CLASSID
	u32		sk_classid;
 #endif
@@ -1682,6 +1683,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
	sk->sk_wq = parent->wq;
	parent->sk = sk;
	sk_set_socket(sk, parent);
+	sk->sk_uid = SOCK_INODE(parent)->i_uid;
	security_sock_graft(sk, parent);
	write_unlock_bh(&sk->sk_callback_lock);
 }
@@ -1689,6 +1691,11 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
 kuid_t sock_i_uid(struct sock *sk);
 unsigned long sock_i_ino(struct sock *sk);

+static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
+{
+	return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
+}
+
 static inline u32 net_tx_rndhash(void)
 {
	u32 v = prandom_u32();
diff --git a/net/core/sock.c b/net/core/sock.c
index 0d91f7dca751..d0f83260cddd 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2383,8 +2383,11 @@ void sock_init_data(struct socket *sock, struct sock *sk)
		sk->sk_type	=	sock->type;
		sk->sk_wq	=	sock->wq;
		sock->sk	=	sk;
-	} else
+		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
+	} else {
		sk->sk_wq	=	NULL;
+		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
+	}

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
diff --git a/net/socket.c b/net/socket.c
index 263b334ec5e4..1012991fb560 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -520,9 +520,23 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
	return used;
 }

+int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	int err = simple_setattr(dentry, iattr);
+
+	if (!err) {
+		struct socket *sock = SOCKET_I(d_inode(dentry));
+
+		sock->sk->sk_uid = iattr->ia_uid;
+	}
+
+	return err;
+}
+
 static const struct inode_operations sockfs_inode_ops = {
	.getxattr = sockfs_getxattr,
	.listxattr = sockfs_listxattr,
+	.setattr = sockfs_setattr,
 };

 /**

From 03441d56d878c40acd8e595548a68996199c2135 Mon Sep 17 00:00:00 2001
From: Lorenzo Colitti
Date: Fri, 4 Nov 2016 02:23:42 +0900
Subject: [PATCH 0579/3220] net: core: add UID to flows, rules, and routes

- Define a new FIB rule attribute, FRA_UID_RANGE, to describe a range
  of UIDs.
- Define a RTA_UID attribute for per-UID route lookups and dumps.
- Support passing these attributes to and from userspace via rtnetlink.
  The value INVALID_UID indicates no UID was specified.
- Add a UID field to the flow structures.

Bug: 16355602
Change-Id: Iea98e6fedd0fd4435a1f4efa3deb3629505619ab
Signed-off-by: Lorenzo Colitti
Signed-off-by: David S.
Miller --- include/net/fib_rules.h | 9 ++++- include/net/flow.h | 5 +++ include/uapi/linux/fib_rules.h | 8 ++++ include/uapi/linux/rtnetlink.h | 3 ++ net/core/fib_rules.c | 72 +++++++++++++++++++++++++++++++++- net/ipv4/fib_frontend.c | 1 + net/ipv4/route.c | 11 ++++++ net/ipv6/route.c | 7 ++++ 8 files changed, 114 insertions(+), 2 deletions(-) diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 59160de702b6..bd2b5c007561 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -8,6 +8,11 @@ #include #include +struct fib_kuid_range { + kuid_t start; + kuid_t end; +}; + struct fib_rule { struct list_head list; int iifindex; @@ -29,6 +34,7 @@ struct fib_rule { int suppress_prefixlen; char iifname[IFNAMSIZ]; char oifname[IFNAMSIZ]; + struct fib_kuid_range uid_range; struct rcu_head rcu; }; @@ -89,7 +95,8 @@ struct fib_rules_ops { [FRA_TABLE] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ - [FRA_GOTO] = { .type = NLA_U32 } + [FRA_GOTO] = { .type = NLA_U32 }, \ + [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) } static inline void fib_rule_get(struct fib_rule *rule) { diff --git a/include/net/flow.h b/include/net/flow.h index 83969eebebf3..8913962d7d25 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -11,6 +11,7 @@ #include #include #include +#include /* * ifindex generation is per-net namespace, and loopback is @@ -38,6 +39,7 @@ struct flowi_common { #define FLOWI_FLAG_SKIP_NH_OIF 0x08 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; + kuid_t flowic_uid; }; union flowi_uli { @@ -75,6 +77,7 @@ struct flowi4 { #define flowi4_flags __fl_common.flowic_flags #define flowi4_secid __fl_common.flowic_secid #define flowi4_tun_key __fl_common.flowic_tun_key +#define flowi4_uid __fl_common.flowic_uid /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; @@ -133,6 +136,7 @@ struct flowi6 { #define flowi6_flags __fl_common.flowic_flags #define flowi6_secid __fl_common.flowic_secid #define flowi6_tun_key __fl_common.flowic_tun_key +#define flowi6_uid __fl_common.flowic_uid struct in6_addr daddr; struct in6_addr saddr; __be32 flowlabel; @@ -177,6 +181,7 @@ struct flowi { #define flowi_flags u.__fl_common.flowic_flags #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key +#define flowi_uid u.__fl_common.flowic_uid } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 96161b8202b5..bbf02a63a011 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -29,6 +29,11 @@ struct fib_rule_hdr { __u32 flags; }; +struct fib_rule_uid_range { + __u32 start; + __u32 end; +}; + enum { FRA_UNSPEC, FRA_DST, /* destination address */ @@ -49,6 +54,9 @@ enum { FRA_TABLE, /* Extended table id */ FRA_FWMASK, /* mask for netfilter mark */ FRA_OIFNAME, + FRA_PAD, + FRA_L3MDEV, /* iif or oif is l3mdev goto its table */ + FRA_UID_RANGE, /* UID range */ __FRA_MAX }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 123a5af4e8bb..d66101789bfd 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -311,6 +311,9 @@ enum rtattr_type_t { RTA_PREF, RTA_ENCAP_TYPE, RTA_ENCAP, + RTA_EXPIRES, + RTA_PAD, + RTA_UID, __RTA_MAX }; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 365de66436ac..cb744a352167 
100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -18,6 +18,11 @@ #include #include +static const struct fib_kuid_range fib_kuid_range_unset = { + KUIDT_INIT(0), + KUIDT_INIT(~0), +}; + int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table, u32 flags) { @@ -33,6 +38,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->table = table; r->flags = flags; r->fr_net = ops->fro_net; + r->uid_range = fib_kuid_range_unset; r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; @@ -172,6 +178,34 @@ void fib_rules_unregister(struct fib_rules_ops *ops) } EXPORT_SYMBOL_GPL(fib_rules_unregister); +static int uid_range_set(struct fib_kuid_range *range) +{ + return uid_valid(range->start) && uid_valid(range->end); +} + +static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb) +{ + struct fib_rule_uid_range *in; + struct fib_kuid_range out; + + in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]); + + out.start = make_kuid(current_user_ns(), in->start); + out.end = make_kuid(current_user_ns(), in->end); + + return out; +} + +static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) +{ + struct fib_rule_uid_range out = { + from_kuid_munged(current_user_ns(), range->start), + from_kuid_munged(current_user_ns(), range->end) + }; + + return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); +} + static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags) { @@ -189,6 +223,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) goto out; + if (uid_lt(fl->flowi_uid, rule->uid_range.start) || + uid_gt(fl->flowi_uid, rule->uid_range.end)) + goto out; + ret = ops->match(rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? 
!ret : ret; @@ -371,6 +409,21 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) } else if (rule->action == FR_ACT_GOTO) goto errout_free; + if (tb[FRA_UID_RANGE]) { + if (current_user_ns() != net->user_ns) { + err = -EPERM; + goto errout_free; + } + + rule->uid_range = nla_get_kuid_range(tb); + + if (!uid_range_set(&rule->uid_range) || + !uid_lte(rule->uid_range.start, rule->uid_range.end)) + goto errout_free; + } else { + rule->uid_range = fib_kuid_range_unset; + } + err = ops->configure(rule, skb, frh, tb); if (err < 0) goto errout_free; @@ -432,6 +485,7 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) struct fib_rules_ops *ops = NULL; struct fib_rule *rule, *tmp; struct nlattr *tb[FRA_MAX+1]; + struct fib_kuid_range range; int err = -EINVAL; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) @@ -451,6 +505,14 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (err < 0) goto errout; + if (tb[FRA_UID_RANGE]) { + range = nla_get_kuid_range(tb); + if (!uid_range_set(&range)) + goto errout; + } else { + range = fib_kuid_range_unset; + } + list_for_each_entry(rule, &ops->rules_list, list) { if (frh->action && (frh->action != rule->action)) continue; @@ -483,6 +545,11 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID]))) continue; + if (uid_range_set(&range) && + (!uid_eq(rule->uid_range.start, range.start) || + !uid_eq(rule->uid_range.end, range.end))) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -550,6 +617,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ + nla_total_size(8); /* FRA_TUN_ID */ + + nla_total_size(sizeof(struct fib_kuid_range)); if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -607,7 +675,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target)) || (rule->tun_id && - nla_put_be64(skb, FRA_TUN_ID, rule->tun_id))) + nla_put_be64(skb, FRA_TUN_ID, rule->tun_id)) || + (uid_range_set(&rule->uid_range) && + nla_put_uid_range(skb, &rule->uid_range))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 473447593060..e10edb5e78b0 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -629,6 +629,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_FLOW] = { .type = NLA_U32 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, + [RTA_UID] = { .type = NLA_U32 }, }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 85f184e429c6..25ca8d4b6565 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2422,6 +2422,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; + if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && + nla_put_u32(skb, RTA_UID, + from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) + goto nla_put_failure; + error = rt->dst.error; if (rt_is_input_route(rt)) { @@ -2473,6 +2478,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) int mark; struct sk_buff *skb; u32 table_id = RT_TABLE_MAIN; + kuid_t uid; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); if (err < 0) @@ -2500,6 +2506,10 @@ static int 
inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; + if (tb[RTA_UID]) + uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); + else + uid = (iif ? INVALID_UID : current_uid()); memset(&fl4, 0, sizeof(fl4)); fl4.daddr = dst; @@ -2507,6 +2517,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) fl4.flowi4_tos = rtm->rtm_tos; fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; + fl4.flowi4_uid = uid; if (netif_index_is_l3_master(net, fl4.flowi4_oif)) fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 60359bea6a16..a1297c41d147 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2685,6 +2685,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_PREF] = { .type = NLA_U8 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, + [RTA_UID] = { .type = NLA_U32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -3246,6 +3247,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) if (tb[RTA_MARK]) fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); + if (tb[RTA_UID]) + fl6.flowi6_uid = make_kuid(current_user_ns(), + nla_get_u32(tb[RTA_UID])); + else + fl6.flowi6_uid = iif ? INVALID_UID : current_uid(); + if (iif) { struct net_device *dev; int flags = 0; From 344afd627cca0223464079135926f611fdbb0574 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 4 Nov 2016 02:23:43 +0900 Subject: [PATCH 0580/3220] net: inet: Support UID-based routing in IP protocols. - Use the UID in routing lookups made by protocol connect() and sendmsg() functions. - Make sure that routing lookups triggered by incoming packets (e.g., Path MTU discovery) take the UID of the socket into account. - For packets not associated with a userspace socket (e.g., ping replies), use UID 0 inside the user namespace corresponding to the network namespace the socket belongs to. This allows all namespaces to apply routing and iptables rules to kernel-originated traffic in that namespace by matching UID 0. This is better than using the UID of the kernel socket that is sending the traffic, because the UID of kernel sockets created at namespace creation time (e.g., the per-processor ICMP and TCP sockets) is the UID of the user that created the socket, which might not be mapped in the namespace. Bug: 16355602 Change-Id: I910504b508948057912bc188fd1e8aca28294de3 Tested: compiles allnoconfig, allyesconfig, allmodconfig Tested: https://android-review.googlesource.com/253302 Signed-off-by: Lorenzo Colitti Signed-off-by: David S.
Miller --- include/net/flow.h | 4 +++- include/net/ip.h | 1 + include/net/ip6_route.h | 5 +++-- include/net/route.h | 5 +++-- net/ipv4/icmp.c | 2 ++ net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/ip_output.c | 3 ++- net/ipv4/ping.c | 3 ++- net/ipv4/raw.c | 2 +- net/ipv4/route.c | 26 +++++++++++++++----------- net/ipv4/syncookies.c | 2 +- net/ipv4/tcp_ipv4.c | 11 +++++++---- net/ipv4/udp.c | 3 ++- net/ipv6/af_inet6.c | 1 + net/ipv6/ah6.c | 5 +++-- net/ipv6/datagram.c | 1 + net/ipv6/esp6.c | 5 +++-- net/ipv6/icmp.c | 7 +++++-- net/ipv6/inet6_connection_sock.c | 2 ++ net/ipv6/ip6_gre.c | 4 ++++ net/ipv6/ip6_tunnel.c | 3 +++ net/ipv6/ip6_vti.c | 5 +++-- net/ipv6/ipcomp6.c | 5 +++-- net/ipv6/netfilter.c | 1 + net/ipv6/ping.c | 1 + net/ipv6/raw.c | 1 + net/ipv6/route.c | 13 +++++++++---- net/ipv6/syncookies.c | 1 + net/ipv6/tcp_ipv6.c | 2 ++ net/ipv6/udp.c | 1 + net/l2tp/l2tp_ip6.c | 1 + 31 files changed, 89 insertions(+), 41 deletions(-) diff --git a/include/net/flow.h b/include/net/flow.h index 8913962d7d25..833080732dec 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -97,7 +97,8 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, __u32 mark, __u8 tos, __u8 scope, __u8 proto, __u8 flags, __be32 daddr, __be32 saddr, - __be16 dport, __be16 sport) + __be16 dport, __be16 sport, + kuid_t uid) { fl4->flowi4_oif = oif; fl4->flowi4_iif = LOOPBACK_IFINDEX; @@ -108,6 +109,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; fl4->flowi4_tun_key.tun_id = 0; + fl4->flowi4_uid = uid; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; diff --git a/include/net/ip.h b/include/net/ip.h index 1a98f1ca1638..4f3ef345f4c2 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -170,6 +170,7 @@ struct ip_reply_arg { /* -1 if not needed */ int bound_dev_if; u8 tos; + kuid_t uid; }; #define IP_REPLY_ARG_NOSRCCHECK 1 diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 877f682989b8..55eea0bd2010 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -108,9 +108,10 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr); void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, - u32 mark); + u32 mark, kuid_t uid); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu); -void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark); +void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, + kuid_t uid); void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, u32 mark); void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk); diff --git a/include/net/route.h b/include/net/route.h index a3b9ef74a389..3adb9c724818 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -154,7 +154,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos, RT_SCOPE_UNIVERSE, proto, sk ? 
inet_sk_flowi_flags(sk) : 0, - daddr, saddr, dport, sport); + daddr, saddr, dport, sport, sock_net_uid(net, sk)); if (sk) security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); return ip_route_output_flow(net, fl4, sk); @@ -267,7 +267,8 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 flow_flags |= FLOWI_FLAG_ANYSRC; flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, - protocol, flow_flags, dst, src, dport, sport); + protocol, flow_flags, dst, src, dport, sport, + sk->sk_uid); } static inline struct rtable *ip_route_connect(struct flowi4 *fl4, diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 36e26977c908..ef2d4322aba7 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -425,6 +425,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_mark = mark; + fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); @@ -473,6 +474,7 @@ static struct rtable *icmp_route_lookup(struct net *net, param->replyopts.opt.opt.faddr : iph->saddr); fl4->saddr = saddr; fl4->flowi4_mark = mark; + fl4->flowi4_uid = sock_net_uid(net, NULL); fl4->flowi4_tos = RT_TOS(tos); fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 46b9c887bede..7e4a9c1fb615 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -420,7 +420,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, - htons(ireq->ir_num)); + htons(ireq->ir_num), sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) @@ -457,7 +457,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? 
opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, - htons(ireq->ir_num)); + htons(ireq->ir_num), sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 4233cbe47052..33bef2763c72 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1573,7 +1573,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), daddr, saddr, - tcp_hdr(skb)->source, tcp_hdr(skb)->dest); + tcp_hdr(skb)->source, tcp_hdr(skb)->dest, + arg->uid); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index e89094ab5ddb..d5d3f0f5e5ad 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -789,7 +789,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - inet_sk_flowi_flags(sk), faddr, saddr, 0, 0); + inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, + sk->sk_uid); security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index bc35f1842512..fbf73f0130a7 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -599,7 +599,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk) | (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), - daddr, saddr, 0, 0); + daddr, saddr, 0, 0, sk->sk_uid); if (!saddr && ipc.oif) { err = l3mdev_get_saddr(net, ipc.oif, &fl4); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 25ca8d4b6565..9ed5f69f84e2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -500,7 +500,8 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) } EXPORT_SYMBOL(__ip_select_ident); -static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, +static void __build_flow_key(const struct net *net, struct flowi4 *fl4, + const struct sock *sk, const struct iphdr *iph, int oif, u8 tos, u8 prot, u32 mark, int flow_flags) @@ -516,7 +517,8 @@ static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, flowi4_init_output(fl4, oif, mark, tos, RT_SCOPE_UNIVERSE, prot, flow_flags, - iph->daddr, iph->saddr, 0, 0); + iph->daddr, iph->saddr, 0, 0, + sock_net_uid(net, sk)); } static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, @@ -528,7 +530,7 @@ static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, u8 prot = iph->protocol; u32 mark = skb->mark; - __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); + __build_flow_key(sock_net(sk), fl4, sk, iph, oif, tos, prot, mark, 0); } static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) @@ -545,7 +547,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, inet->hdrincl ? 
IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk), - daddr, inet->inet_saddr, 0, 0); + daddr, inet->inet_saddr, 0, 0, sk->sk_uid); rcu_read_unlock(); } @@ -793,7 +795,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf rt = (struct rtable *) dst; - __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0); + __build_flow_key(sock_net(sk), &fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); } @@ -1011,7 +1013,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, if (!mark) mark = IP4_REPLY_MARK(net, skb->mark); - __build_flow_key(&fl4, NULL, iph, oif, + __build_flow_key(net, &fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, mark, flow_flags); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { @@ -1027,7 +1029,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) struct flowi4 fl4; struct rtable *rt; - __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0); if (!fl4.flowi4_mark) fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); @@ -1046,6 +1048,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) struct rtable *rt; struct dst_entry *odst = NULL; bool new = false; + struct net *net = sock_net(sk); bh_lock_sock(sk); @@ -1059,7 +1062,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) goto out; } - __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = (struct rtable *)odst; if (odst->obsolete && !odst->ops->check(odst, 0)) { @@ -1099,7 +1102,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net, struct flowi4 fl4; struct rtable *rt; - __build_flow_key(&fl4, NULL, iph, oif, + __build_flow_key(net, &fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, mark, flow_flags); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { @@ -1114,9 +1117,10 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) const struct iphdr *iph = (const struct iphdr *) skb->data; struct flowi4 fl4; struct rtable *rt; + struct net *net = sock_net(sk); - __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); - rt = __ip_route_output_key(sock_net(sk), &fl4); + __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); + rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 4cbe9f0a4281..2dc982b15df8 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -375,7 +375,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), opt->srr ? opt->faddr : ireq->ir_rmt_addr, - ireq->ir_loc_addr, th->source, th->dest); + ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); if (IS_ERR(rt)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 205e6745393f..cf1efa823af6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -684,6 +684,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) arg.bound_dev_if = sk->sk_bound_dev_if; arg.tos = ip_hdr(skb)->tos; + arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? 
sk : NULL); ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, @@ -705,7 +706,8 @@ release_sk1: outside socket context is ugly, certainly. What can I do? */ -static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, +static void tcp_v4_send_ack(const struct sock *sk, struct sk_buff *skb, + u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, int reply_flags, u8 tos) @@ -720,7 +722,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ]; } rep; struct ip_reply_arg arg; - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = sock_net(sk); memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); @@ -769,6 +771,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; arg.tos = tos; + arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, @@ -782,7 +785,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp + tcptw->tw_ts_offset, tcptw->tw_ts_recent, @@ -801,7 +804,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ - tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? + tcp_v4_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? 
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, tcp_time_stamp, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f8b3842b9070..8acf544794a1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1023,7 +1023,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, flow_flags, - faddr, saddr, dport, inet->inet_sport); + faddr, saddr, dport, inet->inet_sport, + sk->sk_uid); if (!saddr && ipc.oif) { err = l3mdev_get_saddr(net, ipc.oif, fl4); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 9dbfacb6e0d9..1604163c2850 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -678,6 +678,7 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; + fl6.flowi6_uid = sk->sk_uid; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); rcu_read_lock(); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 0630a4d5daaa..189eb10b742d 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -662,9 +662,10 @@ static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; if (type == NDISC_REDIRECT) - ip6_redirect(skb, net, skb->dev->ifindex, 0); + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); else - ip6_update_pmtu(skb, net, info, 0, 0); + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 517c55b01ba8..b0fa0dcf33d5 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -161,6 +161,7 @@ ipv4_connected: fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; + fl6.flowi6_uid = sk->sk_uid; if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST)) fl6.flowi6_oif = np->mcast_oif; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 060a60b2f8a6..218f0cba231c 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -474,9 +474,10 @@ static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; if (type == NDISC_REDIRECT) - ip6_redirect(skb, net, skb->dev->ifindex, 0); + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); else - ip6_update_pmtu(skb, net, info, 0, 0); + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 3697cd08c515..3ae2fbe07b25 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -92,9 +92,10 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct net *net = dev_net(skb->dev); if (type == ICMPV6_PKT_TOOBIG) - ip6_update_pmtu(skb, net, info, 0, 0); + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); else if (type == NDISC_REDIRECT) - ip6_redirect(skb, net, skb->dev->ifindex, 0); + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); if (!(type & ICMPV6_INFOMSG_MASK)) if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) @@ -478,6 +479,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) fl6.flowi6_oif = iif; fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; + fl6.flowi6_uid = sock_net_uid(net, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); @@ -585,6 +587,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.flowi6_oif = l3mdev_fib_oif(skb->dev); fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; fl6.flowi6_mark = mark; + 
fl6.flowi6_uid = sock_net_uid(net, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index a7ca2cde2ecb..dc79ebc14189 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -86,6 +86,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, fl6->flowi6_mark = ireq->ir_mark; fl6->fl6_dport = ireq->ir_rmt_port; fl6->fl6_sport = htons(ireq->ir_num); + fl6->flowi6_uid = sk->sk_uid; security_req_classify_flow(req, flowi6_to_flowi(fl6)); dst = ip6_dst_lookup_flow(sk, fl6, final_p); @@ -134,6 +135,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, fl6->flowi6_mark = sk->sk_mark; fl6->fl6_sport = inet->inet_sport; fl6->fl6_dport = inet->inet_dport; + fl6->flowi6_uid = sk->sk_uid; security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); rcu_read_lock(); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index e5ea177d34c6..ae8510a5f604 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -792,6 +792,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; + fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); if (err != 0) { /* XXX: send ICMP error even if DF is not set. */ @@ -842,6 +844,8 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; + fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); if (err != 0) { if (err == -EMSGSIZE) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 137fca42aaa6..fec9b8c5622c 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1190,6 +1190,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_IPIP; + fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + dsfield = ipv4_get_dsfield(iph); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) @@ -1243,6 +1245,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_IPV6; + fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 0a8610b33d79..24fb9c0efd00 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -597,9 +597,10 @@ static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; if (type == NDISC_REDIRECT) - ip6_redirect(skb, net, skb->dev->ifindex, 0); + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); else - ip6_update_pmtu(skb, net, info, 0, 0); + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 1b9316e1386a..54d165b9845a 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -74,9 +74,10 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; if (type == NDISC_REDIRECT) - ip6_redirect(skb, net, skb->dev->ifindex, 0); + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); else - ip6_update_pmtu(skb, net, info, 0, 0); + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); 
xfrm_state_put(x); return 0; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index d11c46833d61..39970e212ad5 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -26,6 +26,7 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb) struct flowi6 fl6 = { .flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, .flowi6_mark = skb->mark, + .flowi6_uid = sock_net_uid(net, skb->sk), .daddr = iph->daddr, .saddr = iph->saddr, }; diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index a5cf82ccf406..dc338f876514 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -141,6 +141,7 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.daddr = *daddr; fl6.flowi6_oif = oif; fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_uid = sk->sk_uid; fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 99140986e887..58eb8ee19f34 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -768,6 +768,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_uid = sk->sk_uid; if (sin6) { if (addr_len < SIN6_LEN_RFC2133) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a1297c41d147..30835d96278e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1383,7 +1383,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, } void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, - int oif, u32 mark) + int oif, u32 mark, kuid_t uid) { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; @@ -1395,6 +1395,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); + fl6.flowi6_uid = uid; dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) @@ -1406,7 +1407,7 @@ EXPORT_SYMBOL_GPL(ip6_update_pmtu); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) { ip6_update_pmtu(skb, sock_net(sk), mtu, - sk->sk_bound_dev_if, sk->sk_mark); + sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); } EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); @@ -1487,7 +1488,8 @@ static struct dst_entry *ip6_route_redirect(struct net *net, flags, __ip6_route_redirect); } -void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) +void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, + kuid_t uid) { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; @@ -1500,6 +1502,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); + fl6.flowi6_uid = uid; dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr); rt6_do_redirect(dst, NULL, skb); @@ -1521,6 +1524,7 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, fl6.flowi6_mark = mark; fl6.daddr = msg->dest; fl6.saddr = iph->daddr; + fl6.flowi6_uid = sock_net_uid(net, NULL); dst = ip6_route_redirect(net, &fl6, &iph->saddr); rt6_do_redirect(dst, NULL, skb); @@ -1529,7 +1533,8 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) { - ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark); + ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, + sk->sk_uid); } 
EXPORT_SYMBOL_GPL(ip6_sk_redirect); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index eaf7ac496d50..336843ca4e6b 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -228,6 +228,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) fl6.flowi6_mark = ireq->ir_mark; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; + fl6.flowi6_uid = sk->sk_uid; security_req_classify_flow(req, flowi6_to_flowi(&fl6)); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f6eb65a5d343..1f5505ecf7f7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -234,6 +234,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; + fl6.flowi6_uid = sk->sk_uid; opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); final_p = fl6_update_dst(&fl6, opt, &final); @@ -810,6 +811,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; + fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); /* Pass a socket to ip6_dst_lookup either it is for RST diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index fcce7b4ba8ff..0890fd6d4248 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1243,6 +1243,7 @@ do_udp_sendmsg: fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_uid = sk->sk_uid; if (msg->msg_controllen) { opt = &opt_space; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index a2c8747d2936..c14c59f18c59 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -515,6 +515,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_uid = sk->sk_uid; if (lsa) { if (addr_len < SIN6_LEN_RFC2133) From 341965cf103dc4d60a829e9a55941c618a81358f Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 30 Nov 2016 02:56:47 +0900 Subject: [PATCH 0581/3220] net: ipv4: Don't crash if passing a null sk to ip_rt_update_pmtu. Commit e2d118a1cb5e ("net: inet: Support UID-based routing in IP protocols.") made __build_flow_key call sock_net(sk) to determine the network namespace of the passed-in socket. This crashes if sk is NULL. Fix this by getting the network namespace from the skb instead. Bug: 16355602 Change-Id: I27161b70f448bb95adce3994a97920d54987ce4e Fixes: e2d118a1cb5e ("net: inet: Support UID-based routing in IP protocols.") Reported-by: Erez Shitrit Signed-off-by: Lorenzo Colitti Signed-off-by: David S. 
Miller --- net/ipv4/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 9ed5f69f84e2..ad5ce6652623 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -524,13 +524,14 @@ static void __build_flow_key(const struct net *net, struct flowi4 *fl4, static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, const struct sock *sk) { + const struct net *net = dev_net(skb->dev); const struct iphdr *iph = ip_hdr(skb); int oif = skb->dev->ifindex; u8 tos = RT_TOS(iph->tos); u8 prot = iph->protocol; u32 mark = skb->mark; - __build_flow_key(sock_net(sk), fl4, sk, iph, oif, tos, prot, mark, 0); + __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0); } static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) From 0aac1fcfa327de74251daf85a496e3e863f6c004 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Tue, 20 Dec 2016 11:08:34 -0800 Subject: [PATCH 0582/3220] ANDROID: android-base: Enable QUOTA related configs Bug: 33757366 Change-Id: Iec4f55c3ca4a16dbc8695054f481d9261c56d0f6 --- android/configs/android-base.cfg | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 8531a7a79e33..f10371a981b7 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -139,7 +139,11 @@ CONFIG_PPP_DEFLATE=y CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y CONFIG_PROFILING=y +CONFIG_QFMT_V2=y CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +CONFIG_QUOTA_TREE=y +CONFIG_QUOTACTL=y CONFIG_RANDOMIZE_BASE=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y From 676b8efcce52d582153d3df3bc90689cab07a058 Mon Sep 17 00:00:00 2001 From: mukesh agrawal Date: Tue, 12 Jul 2016 11:28:05 -0700 Subject: [PATCH 0583/3220] ANDROID: trace: net: use %pK for kernel pointers We want to use network trace events in production builds, to help diagnose Wifi problems. However, we don't want to expose raw kernel pointers in such builds. Change the format specifier for the skbaddr field, so that, if kptr_restrict is enabled, the pointers will be reported as 0. 
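As a rough illustration, consider a hypothetical test module (pk_demo is an invented name, not part of this patch):

  #include <linux/module.h>
  #include <linux/printk.h>

  static int dummy;

  static int __init pk_demo_init(void)
  {
          /* With kptr_restrict set to 2, %pK always prints 0, while
           * %p still prints the raw pointer on kernels of this vintage. */
          pr_info("raw=%p restricted=%pK\n", &dummy, &dummy);
          return 0;
  }
  module_init(pk_demo_init);

  static void __exit pk_demo_exit(void) { }
  module_exit(pk_demo_exit);

  MODULE_LICENSE("GPL");

After "echo 2 > /proc/sys/kernel/kptr_restrict", the skbaddr values in the events below are likewise reported as 0.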
Bug: 30090733 Change-Id: Ic4bd583d37af6637343601feca875ee24479ddff Signed-off-by: mukesh agrawal --- include/trace/events/net.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/events/net.h b/include/trace/events/net.h index 49cc7c3de252..89d009e10938 100644 --- a/include/trace/events/net.h +++ b/include/trace/events/net.h @@ -57,7 +57,7 @@ TRACE_EVENT(net_dev_start_xmit, __entry->gso_type = skb_shinfo(skb)->gso_type; ), - TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x", + TP_printk("dev=%s queue_mapping=%u skbaddr=%pK vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x", __get_str(name), __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, __entry->len, @@ -90,7 +90,7 @@ TRACE_EVENT(net_dev_xmit, __assign_str(name, dev->name); ), - TP_printk("dev=%s skbaddr=%p len=%u rc=%d", + TP_printk("dev=%s skbaddr=%pK len=%u rc=%d", __get_str(name), __entry->skbaddr, __entry->len, __entry->rc) ); @@ -112,7 +112,7 @@ DECLARE_EVENT_CLASS(net_dev_template, __assign_str(name, skb->dev->name); ), - TP_printk("dev=%s skbaddr=%p len=%u", + TP_printk("dev=%s skbaddr=%pK len=%u", __get_str(name), __entry->skbaddr, __entry->len) ) @@ -191,7 +191,7 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template, __entry->gso_type = skb_shinfo(skb)->gso_type; ), - TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x", + TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%pK vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x", __get_str(name), __entry->napi_id, __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, From ee2bcbd8eddaca598b47284d3c52b593a9d09d7e Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 2 Jan 2017 20:18:05 +0530 Subject: [PATCH 0584/3220] ANDROID: sched/walt: fix build failure if FAIR_GROUP_SCHED=n Fix SCHED_WALT dependency on FAIR_GROUP_SCHED, otherwise we run into the following build failure: CC kernel/sched/walt.o kernel/sched/walt.c: In function 'walt_inc_cfs_cumulative_runnable_avg': kernel/sched/walt.c:148:8: error: 'struct cfs_rq' has no member named 'cumulative_runnable_avg' cfs_rq->cumulative_runnable_avg += p->ravg.demand; ^ kernel/sched/walt.c: In function 'walt_dec_cfs_cumulative_runnable_avg': kernel/sched/walt.c:154:8: error: 'struct cfs_rq' has no member named 'cumulative_runnable_avg' cfs_rq->cumulative_runnable_avg -= p->ravg.demand; ^ Reported-at: https://bugs.linaro.org/show_bug.cgi?id=2793 Signed-off-by: Amit Pundir --- init/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/init/Kconfig b/init/Kconfig index acb6645ffda5..445af1262134 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -395,6 +395,7 @@ endchoice config SCHED_WALT bool "Support window
based load tracking" depends on SMP + depends on FAIR_GROUP_SCHED help This feature will allow the scheduler to maintain a tunable window based set of metrics for tasks and runqueues. These metrics can be From 9ce0ba91cecce1377d9fca96eb097c428a44054f Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 14 Dec 2016 12:31:55 -0800 Subject: [PATCH 0585/3220] Revert "FROMLIST: arm64: Enable CONFIG_ARM64_SW_TTBR0_PAN" This reverts commit 67cd3bda54dadba4f8892105adf9c2f3982bfa0a. Bug: 31432001 Change-Id: I1e5836ce0b41b2262d95c5c4c49ace3b96ae0b1f Signed-off-by: Sami Tolvanen --- arch/arm64/Kconfig | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a4d3c3cd7b23..095a3afb1e9d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -706,14 +706,6 @@ config SETEND_EMULATION If unsure, say Y endif -config ARM64_SW_TTBR0_PAN - bool "Emulate Priviledged Access Never using TTBR0_EL1 switching" - help - Enabling this option prevents the kernel from accessing - user-space memory directly by pointing TTBR0_EL1 to a reserved - zeroed area and reserved ASID. The user access routines - restore the valid TTBR0_EL1 temporarily. - menu "ARMv8.1 architectural features" config ARM64_HW_AFDBM From c558527eedcd5e166fb91db26ed183e5b88e036f Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 14 Dec 2016 12:32:07 -0800 Subject: [PATCH 0586/3220] Revert "FROMLIST: arm64: xen: Enable user access before a privcmd hvc call" This reverts commit 4dbc88bd2b6a74fd33483ee2593dcf2bd858eabe. Bug: 31432001 Change-Id: I2c3d591a2c631e7ff02c0bcb91624735e8c12f0a Signed-off-by: Sami Tolvanen --- arch/arm64/xen/hypercall.S | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index 6d6e4af1a4bf..8bbe9401f4f0 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -49,7 +49,6 @@ #include #include -#include #include @@ -90,24 +89,6 @@ ENTRY(privcmd_call) mov x2, x3 mov x3, x4 mov x4, x5 -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Privcmd calls are issued by the userspace. The kernel needs to - * enable access to TTBR0_EL1 as the hypervisor would issue stage 1 - * translations to user memory via AT instructions. Since AT - * instructions are not affected by the PAN bit (ARMv8.1), we only - * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation - * is enabled (it implies that hardware UAO and PAN disabled). - */ - uaccess_enable_not_uao x6, x7 -#endif hvc XEN_IMM - -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Disable userspace access from kernel once the hyp call completed. - */ - uaccess_disable_not_uao x6 -#endif ret ENDPROC(privcmd_call); From 1f150a5e5a9630492742def51ebbf665051fbada Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 14 Dec 2016 12:32:16 -0800 Subject: [PATCH 0587/3220] Revert "FROMLIST: arm64: Handle faults caused by inadvertent user access with PAN enabled" This reverts commit 5dc2b7c7bb33138270ff9494be6cf334bd3d20e1. 
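For orientation while reading the flattened diff below: the variant being deleted treated a kernel-mode translation fault taken with the emulated PAN bit set as a blocked user access. Reassembled from the hunk itself (no new code), it read:

  static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs)
  {
          unsigned int ec = ESR_ELx_EC(esr);
          unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE;

          if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR)
                  return false;

          /* Under TTBR0 PAN emulation, a translation fault taken with
           * PSR_PAN_BIT set means a blocked user access. */
          if (system_uses_ttbr0_pan())
                  return fsc_type == ESR_ELx_FSC_FAULT &&
                         (regs->pstate & PSR_PAN_BIT);
          else
                  return fsc_type == ESR_ELx_FSC_PERM;
  }

The revert goes back to classifying only genuine permission faults, independent of PSTATE.PAN.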
Bug: 31432001 Change-Id: I384a9af199f502f8fa3ae3733db67a4c547dbd55 Signed-off-by: Sami Tolvanen --- arch/arm64/mm/fault.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 9cedb10b1107..0aacbd763e6b 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -246,19 +246,13 @@ out: return fault; } -static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs) +static inline bool is_permission_fault(unsigned int esr) { unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR) - return false; - - if (system_uses_ttbr0_pan()) - return fsc_type == ESR_ELx_FSC_FAULT && - (regs->pstate & PSR_PAN_BIT); - else - return fsc_type == ESR_ELx_FSC_PERM; + return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) || + (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM); } static bool is_el0_instruction_abort(unsigned int esr) @@ -299,7 +293,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - if (addr < USER_DS && is_permission_fault(esr, regs)) { + if (is_permission_fault(esr) && (addr < USER_DS)) { /* regs->orig_addr_limit may be 0 if we entered from EL0 */ if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); @@ -484,10 +478,10 @@ static const struct fault_info { { do_bad, SIGBUS, 0, "unknown 17" }, { do_bad, SIGBUS, 0, "unknown 18" }, { do_bad, SIGBUS, 0, "unknown 19" }, - { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, { do_bad, SIGBUS, 0, "synchronous parity error" }, { do_bad, SIGBUS, 0, "unknown 25" }, { do_bad, SIGBUS, 0, "unknown 26" }, From 77a4773a3b6e5b031ad3ecb68c217c45ed964fa3 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 14 Dec 2016 12:32:25 -0800 Subject: [PATCH 0588/3220] Revert "FROMLIST: arm64: Disable TTBR0_EL1 during normal kernel execution" This reverts commit 5775ca34829caf0664c8ccc02fd0e93cb6022e0f. 
Bug: 31432001 Change-Id: I9b07c2f01bc2bcfed51f60ab487034639f5e1960 Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/efi.h | 26 +---------- arch/arm64/include/asm/mmu_context.h | 51 ++++++--------------- arch/arm64/include/asm/ptrace.h | 2 - arch/arm64/kernel/entry.S | 67 ---------------------------- arch/arm64/kernel/setup.c | 9 ---- arch/arm64/mm/context.c | 7 +-- 6 files changed, 16 insertions(+), 146 deletions(-) diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 932f5a56d1a6..8e88a696c9cb 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -1,7 +1,6 @@ #ifndef _ASM_EFI_H #define _ASM_EFI_H -#include #include #include #include @@ -70,30 +69,7 @@ int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); static inline void efi_set_pgd(struct mm_struct *mm) { - __switch_mm(mm); - - if (system_uses_ttbr0_pan()) { - if (mm != current->active_mm) { - /* - * Update the current thread's saved ttbr0 since it is - * restored as part of a return from exception. Set - * the hardware TTBR0_EL1 using cpu_switch_mm() - * directly to enable potential errata workarounds. - */ - update_saved_ttbr0(current, mm); - cpu_switch_mm(mm->pgd, mm); - } else { - /* - * Defer the switch to the current thread's TTBR0_EL1 - * until uaccess_enable(). Restore the current - * thread's saved ttbr0 corresponding to its active_mm - * (if different from init_mm). - */ - cpu_set_reserved_ttbr0(); - if (current->active_mm != &init_mm) - update_saved_ttbr0(current, current->active_mm); - } - } + switch_mm(NULL, mm, NULL); } void efi_virtmap_load(void); diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 4a32fd5f101d..a00f7cf35bbd 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -23,7 +23,6 @@ #include #include -#include #include #include #include @@ -114,7 +113,7 @@ static inline void cpu_uninstall_idmap(void) local_flush_tlb_all(); cpu_set_default_tcr_t0sz(); - if (mm != &init_mm && !system_uses_ttbr0_pan()) + if (mm != &init_mm) cpu_switch_mm(mm->pgd, mm); } @@ -174,27 +173,21 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { } -#ifdef CONFIG_ARM64_SW_TTBR0_PAN -static inline void update_saved_ttbr0(struct task_struct *tsk, - struct mm_struct *mm) -{ - if (system_uses_ttbr0_pan()) { - BUG_ON(mm->pgd == swapper_pg_dir); - task_thread_info(tsk)->ttbr0 = - virt_to_phys(mm->pgd) | ASID(mm) << 48; - } -} -#else -static inline void update_saved_ttbr0(struct task_struct *tsk, - struct mm_struct *mm) -{ -} -#endif - -static inline void __switch_mm(struct mm_struct *next) +/* + * This is the actual mm switch as far as the scheduler + * is concerned. No registers are touched. We avoid + * calling the CPU specific function when the mm hasn't + * actually changed. + */ +static inline void +switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) { unsigned int cpu = smp_processor_id(); + if (prev == next) + return; + /* * init_mm.pgd does not contain any user mappings and it is always * active for kernel addresses in TTBR1. Just set the reserved TTBR0. 
@@ -207,23 +200,7 @@ static inline void __switch_mm(struct mm_struct *next) check_and_switch_context(next, cpu); } -static inline void -switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) -{ - if (prev != next) - __switch_mm(next); - - /* - * Update the saved TTBR0_EL1 of the scheduled-in task as the previous - * value may have not been initialised yet (activate_mm caller) or the - * ASID has changed since the last run (following the context switch - * of another thread of the same process). - */ - update_saved_ttbr0(tsk, next); -} - #define deactivate_mm(tsk,mm) do { } while (0) -#define activate_mm(prev,next) switch_mm(prev, next, current) +#define activate_mm(prev,next) switch_mm(prev, next, NULL) #endif diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index dd4257e286b1..200d5c32b38f 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -21,8 +21,6 @@ #include -#define _PSR_PAN_BIT 22 - /* Current Exception Level values, as contained in CurrentEL */ #define CurrentEL_EL1 (1 << 2) #define CurrentEL_EL2 (2 << 2) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index eb185c93dfc6..8eb8eb085036 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -29,9 +29,7 @@ #include #include #include -#include #include -#include #include /* @@ -111,34 +109,6 @@ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] - -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Set the TTBR0 PAN bit in SPSR. When the exception is taken from - * EL0, there is no need to check the state of TTBR0_EL1 since - * accesses are always enabled. - * Note that the meaning of this bit differs from the ARMv8.1 PAN - * feature as all TTBR0_EL1 accesses are disabled, not just those to - * user mappings. - */ -alternative_if_not ARM64_HAS_PAN - nop -alternative_else - b 1f // skip TTBR0 PAN -alternative_endif - - .if \el != 0 - mrs x21, ttbr0_el1 - tst x21, #0xffff << 48 // Check for the reserved ASID - orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR - b.eq 1f // TTBR0 access already disabled - and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR - .endif - - uaccess_ttbr0_disable x21 -1: -#endif - stp x22, x23, [sp, #S_PC] /* @@ -177,42 +147,6 @@ alternative_endif ldp x21, x22, [sp, #S_PC] // load ELR, SPSR .if \el == 0 ct_user_enter - .endif - -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR - * PAN bit checking. - */ -alternative_if_not ARM64_HAS_PAN - nop -alternative_else - b 2f // skip TTBR0 PAN -alternative_endif - - .if \el != 0 - tbnz x22, #_PSR_PAN_BIT, 1f // Skip re-enabling TTBR0 access if previously disabled - .endif - - uaccess_ttbr0_enable x0 - - .if \el == 0 - /* - * Enable errata workarounds only if returning to user. The only - * workaround currently required for TTBR0_EL1 changes are for the - * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache - * corruption). 
- */ - post_ttbr0_update_workaround - .endif -1: - .if \el != 0 - and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit - .endif -2: -#endif - - .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 #ifdef CONFIG_ARM64_ERRATUM_845719 @@ -234,7 +168,6 @@ alternative_else alternative_endif #endif .endif - msr elr_el1, x21 // set up the return data msr spsr_el1, x22 ldp x0, x1, [sp, #16 * 0] diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 6591bf23422b..29b8c247d56f 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -347,15 +347,6 @@ void __init setup_arch(char **cmdline_p) smp_init_cpus(); smp_build_mpidr_hash(); -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Make sure init_thread_info.ttbr0 always generates translation - * faults in case uaccess_enable() is inadvertently called by the init - * thread. - */ - init_thread_info.ttbr0 = virt_to_phys(empty_zero_page); -#endif - #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) conswitchp = &vga_con; diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 25128089c386..7275628ba59f 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -182,12 +182,7 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: - /* - * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when - * emulating PAN. - */ - if (!system_uses_ttbr0_pan()) - cpu_switch_mm(mm->pgd, mm); + cpu_switch_mm(mm->pgd, mm); } static int asids_init(void) From b4ae8c8946b279fd142b8f85b0631606bc714a42 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 14 Dec 2016 12:32:37 -0800 Subject: [PATCH 0589/3220] Revert "FROMLIST: arm64: Introduce uaccess_{disable,enable} functionality based on TTBR0_EL1" This reverts commit 1911d36b27ba58ee18592df25b7ee636d4d4c41d. Bug: 31432001 Change-Id: Iee77eed8454f379b948dbbaf65c105952ea30bef Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 16 ----- arch/arm64/include/asm/cpufeature.h | 6 -- arch/arm64/include/asm/kernel-pgtable.h | 7 -- arch/arm64/include/asm/thread_info.h | 3 - arch/arm64/include/asm/uaccess.h | 96 ++----------------------- arch/arm64/kernel/asm-offsets.c | 3 - arch/arm64/kernel/cpufeature.c | 1 - arch/arm64/kernel/entry.S | 4 ++ arch/arm64/kernel/head.S | 6 +- arch/arm64/kernel/vmlinux.lds.S | 5 -- 10 files changed, 13 insertions(+), 134 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index aeb4554b3af3..9d3e77a5cf07 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -53,15 +53,6 @@ msr daifclr, #2 .endm - .macro save_and_disable_irq, flags - mrs \flags, daif - msr daifset, #2 - .endm - - .macro restore_irq, flags - msr daif, \flags - .endm - /* * Enable and disable debug exceptions. */ @@ -371,13 +362,6 @@ alternative_endif movk \reg, :abs_g0_nc:\val .endm -/* - * Return the current thread_info. - */ - .macro get_thread_info, rd - mrs \rd, sp_el0 - .endm - /* * Errata workaround post TTBR0_EL1 update. 
*/ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index f125c03ab2e1..727e594ac5c2 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -188,12 +188,6 @@ static inline bool system_supports_mixed_endian_el0(void) return id_aa64mmfr0_mixed_endian_el0(read_system_reg(SYS_ID_AA64MMFR0_EL1)); } -static inline bool system_uses_ttbr0_pan(void) -{ - return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) && - !cpus_have_cap(ARM64_HAS_PAN); -} - #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 7803343e5881..7e51d1b57c0c 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -19,7 +19,6 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H -#include #include /* @@ -55,12 +54,6 @@ #define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE) #define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE) -#ifdef CONFIG_ARM64_SW_TTBR0_PAN -#define RESERVED_TTBR0_SIZE (PAGE_SIZE) -#else -#define RESERVED_TTBR0_SIZE (0) -#endif - /* Initial memory map size */ #if ARM64_SWAPPER_USES_SECTION_MAPS #define SWAPPER_BLOCK_SHIFT SECTION_SHIFT diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index b3325a9cb90f..abd64bd1f6d9 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -47,9 +47,6 @@ typedef unsigned long mm_segment_t; struct thread_info { unsigned long flags; /* low level flags */ mm_segment_t addr_limit; /* address limit */ -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - u64 ttbr0; /* saved TTBR0_EL1 */ -#endif struct task_struct *task; /* main task structure */ int preempt_count; /* 0 => preemptable, <0 => bug */ int cpu; /* cpu */ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index c37c064d7cdd..c8ef22a9a83b 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -28,7 +28,6 @@ #include #include -#include #include #include #include @@ -129,57 +128,16 @@ static inline void set_fs(mm_segment_t fs) /* * User access enabling/disabling. */ -#ifdef CONFIG_ARM64_SW_TTBR0_PAN -static inline void uaccess_ttbr0_disable(void) -{ - unsigned long ttbr; - - /* reserved_ttbr0 placed at the end of swapper_pg_dir */ - ttbr = read_sysreg(ttbr1_el1) + SWAPPER_DIR_SIZE; - write_sysreg(ttbr, ttbr0_el1); - isb(); -} - -static inline void uaccess_ttbr0_enable(void) -{ - unsigned long flags; - - /* - * Disable interrupts to avoid preemption between reading the 'ttbr0' - * variable and the MSR. A context switch could trigger an ASID - * roll-over and an update of 'ttbr0'. 
- */ - local_irq_save(flags); - write_sysreg(current_thread_info()->ttbr0, ttbr0_el1); - isb(); - local_irq_restore(flags); -} -#else -static inline void uaccess_ttbr0_disable(void) -{ -} - -static inline void uaccess_ttbr0_enable(void) -{ -} -#endif - #define __uaccess_disable(alt) \ do { \ - if (system_uses_ttbr0_pan()) \ - uaccess_ttbr0_disable(); \ - else \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ - CONFIG_ARM64_PAN)); \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ + CONFIG_ARM64_PAN)); \ } while (0) #define __uaccess_enable(alt) \ do { \ - if (system_uses_ttbr0_pan()) \ - uaccess_ttbr0_enable(); \ - else \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ - CONFIG_ARM64_PAN)); \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ + CONFIG_ARM64_PAN)); \ } while (0) static inline void uaccess_disable(void) @@ -409,39 +367,12 @@ extern __must_check long strnlen_user(const char __user *str, long n); #include #include -#include /* - * User access enabling/disabling macros. - */ - .macro uaccess_ttbr0_disable, tmp1 - mrs \tmp1, ttbr1_el1 // swapper_pg_dir - add \tmp1, \tmp1, #SWAPPER_DIR_SIZE // reserved_ttbr0 at the end of swapper_pg_dir - msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 - isb - .endm - - .macro uaccess_ttbr0_enable, tmp1 - get_thread_info \tmp1 - ldr \tmp1, [\tmp1, #TI_TTBR0] // load saved TTBR0_EL1 - msr ttbr0_el1, \tmp1 // set the non-PAN TTBR0_EL1 - isb - .endm - -/* - * These macros are no-ops when UAO is present. + * User access enabling/disabling macros. These are no-ops when UAO is + * present. */ .macro uaccess_disable_not_uao, tmp1 -#ifdef CONFIG_ARM64_SW_TTBR0_PAN -alternative_if_not ARM64_HAS_PAN - uaccess_ttbr0_disable \tmp1 -alternative_else - nop - nop - nop - nop -alternative_endif -#endif alternative_if_not ARM64_ALT_PAN_NOT_UAO nop alternative_else @@ -450,21 +381,6 @@ alternative_endif .endm .macro uaccess_enable_not_uao, tmp1, tmp2 -#ifdef CONFIG_ARM64_SW_TTBR0_PAN -alternative_if_not ARM64_HAS_PAN - save_and_disable_irq \tmp2 // avoid preemption - uaccess_ttbr0_enable \tmp1 - restore_irq \tmp2 -alternative_else - nop - nop - nop - nop - nop - nop - nop -alternative_endif -#endif alternative_if_not ARM64_ALT_PAN_NOT_UAO nop alternative_else diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index d0ec987dba5b..087cf9a65359 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -36,9 +36,6 @@ int main(void) DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - DEFINE(TI_TTBR0, offsetof(struct thread_info, ttbr0)); -#endif DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); BLANK(); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 40ee3f2933e7..7566cad9fa1d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -43,7 +43,6 @@ unsigned int compat_elf_hwcap2 __read_mostly; #endif DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); -EXPORT_SYMBOL(cpu_hwcaps); #define __ARM64_FTR_BITS(SIGNED, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \ { \ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8eb8eb085036..533e1c9fd5a6 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -190,6 +190,10 @@ alternative_endif eret // return to kernel .endm + .macro get_thread_info, rd + mrs \rd, sp_el0 + 
.endm + .macro irq_stack_entry mov x19, sp // preserve the original sp diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 461d6cc258dd..16c62dea7934 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -322,14 +322,14 @@ __create_page_tables: * dirty cache lines being evicted. */ mov x0, x25 - add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE + add x1, x26, #SWAPPER_DIR_SIZE bl __inval_cache_range /* * Clear the idmap and swapper page tables. */ mov x0, x25 - add x6, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE + add x6, x26, #SWAPPER_DIR_SIZE 1: stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 @@ -407,7 +407,7 @@ __create_page_tables: * tables again to remove any speculatively loaded cache lines. */ mov x0, x25 - add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE + add x1, x26, #SWAPPER_DIR_SIZE dmb sy bl __inval_cache_range diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index f1d6c49dcc5f..c241ea5359b9 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -185,11 +185,6 @@ SECTIONS swapper_pg_dir = .; . += SWAPPER_DIR_SIZE; -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - reserved_ttbr0 = .; - . += RESERVED_TTBR0_SIZE; -#endif - _end = .; STABS_DEBUG From 81688dbb3294585e137270f84275e6e94cbc9d7e Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 14 Dec 2016 12:32:46 -0800 Subject: [PATCH 0590/3220] Revert "FROMLIST: arm64: Factor out TTBR0_EL1 post-update workaround into a specific asm macro" This reverts commit 3b66929169de053042d47e482dd5748794756153. Bug: 31432001 Change-Id: Ib38fcf553ca2077531cbf550fbaa75378a8723c5 Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 17 ----------------- arch/arm64/mm/proc.S | 11 ++++++++++- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 9d3e77a5cf07..9e8ac1e73457 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -362,21 +362,4 @@ alternative_endif movk \reg, :abs_g0_nc:\val .endm -/* - * Errata workaround post TTBR0_EL1 update. - */ - .macro post_ttbr0_update_workaround -#ifdef CONFIG_CAVIUM_ERRATUM_27456 -alternative_if_not ARM64_WORKAROUND_CAVIUM_27456 - nop - nop - nop -alternative_else - ic iallu - dsb nsh - isb -alternative_endif -#endif - .endm - #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 765713702625..9f6deacf41d2 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -136,8 +136,17 @@ ENTRY(cpu_do_switch_mm) bfi x0, x1, #48, #16 // set the ASID msr ttbr0_el1, x0 // set TTBR0 isb - post_ttbr0_update_workaround +alternative_if_not ARM64_WORKAROUND_CAVIUM_27456 ret + nop + nop + nop +alternative_else + ic iallu + dsb nsh + isb + ret +alternative_endif ENDPROC(cpu_do_switch_mm) .pushsection ".idmap.text", "ax" From c86266edfaa4e39a5f784e1c9ee8ec6ba2803cc1 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 14 Dec 2016 12:32:56 -0800 Subject: [PATCH 0591/3220] Revert "FROMLIST: arm64: Factor out PAN enabling/disabling into separate uaccess_* macros" This reverts commit 23368b642deb01ac6ce668ec1dedfcc0cab25c71. 
Bug: 31432001 Change-Id: Ia59e5fc75ef905b89d5f9194f1e762c1e5eff5bf Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/futex.h | 14 ++--- arch/arm64/include/asm/uaccess.h | 79 +++------------------------- arch/arm64/kernel/armv8_deprecated.c | 10 ++-- arch/arm64/lib/clear_user.S | 8 +-- arch/arm64/lib/copy_from_user.S | 8 +-- arch/arm64/lib/copy_in_user.S | 8 +-- arch/arm64/lib/copy_to_user.S | 8 +-- 7 files changed, 40 insertions(+), 95 deletions(-) diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 71dfa3b42313..f2585cdd32c2 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -27,9 +27,9 @@ #include #define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \ -do { \ - uaccess_enable(); \ asm volatile( \ + ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) \ " prfm pstl1strm, %2\n" \ "1: ldxr %w1, %2\n" \ insn "\n" \ @@ -44,11 +44,11 @@ do { \ " .popsection\n" \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ + ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) \ : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \ : "r" (oparg), "Ir" (-EFAULT) \ - : "memory"); \ - uaccess_disable(); \ -} while (0) + : "memory") static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) @@ -118,8 +118,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; - uaccess_enable(); asm volatile("// futex_atomic_cmpxchg_inatomic\n" +ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" " sub %w3, %w1, %w4\n" @@ -134,10 +134,10 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, " .popsection\n" _ASM_EXTABLE(1b, 4b) _ASM_EXTABLE(2b, 4b) +ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) : "memory"); - uaccess_disable(); *uval = val; return ret; diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index c8ef22a9a83b..c3d445b42351 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -18,8 +18,6 @@ #ifndef __ASM_UACCESS_H #define __ASM_UACCESS_H -#ifndef __ASSEMBLY__ - /* * User space memory access functions */ @@ -125,44 +123,6 @@ static inline void set_fs(mm_segment_t fs) " .long (" #from " - .), (" #to " - .)\n" \ " .popsection\n" -/* - * User access enabling/disabling. - */ -#define __uaccess_disable(alt) \ -do { \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ - CONFIG_ARM64_PAN)); \ -} while (0) - -#define __uaccess_enable(alt) \ -do { \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ - CONFIG_ARM64_PAN)); \ -} while (0) - -static inline void uaccess_disable(void) -{ - __uaccess_disable(ARM64_HAS_PAN); -} - -static inline void uaccess_enable(void) -{ - __uaccess_enable(ARM64_HAS_PAN); -} - -/* - * These functions are no-ops when UAO is present. 
- */ -static inline void uaccess_disable_not_uao(void) -{ - __uaccess_disable(ARM64_ALT_PAN_NOT_UAO); -} - -static inline void uaccess_enable_not_uao(void) -{ - __uaccess_enable(ARM64_ALT_PAN_NOT_UAO); -} - /* * The "__xxx" versions of the user access functions do not verify the address * space - it must have been done previously with a separate "access_ok()" @@ -190,7 +150,8 @@ static inline void uaccess_enable_not_uao(void) do { \ unsigned long __gu_val; \ __chk_user_ptr(ptr); \ - uaccess_enable_not_uao(); \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ + CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ @@ -211,8 +172,9 @@ do { \ default: \ BUILD_BUG(); \ } \ - uaccess_disable_not_uao(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ + CONFIG_ARM64_PAN)); \ } while (0) #define __get_user(x, ptr) \ @@ -257,7 +219,8 @@ do { \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ __chk_user_ptr(ptr); \ - uaccess_enable_not_uao(); \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ + CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ @@ -278,7 +241,8 @@ do { \ default: \ BUILD_BUG(); \ } \ - uaccess_disable_not_uao(); \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ + CONFIG_ARM64_PAN)); \ } while (0) #define __put_user(x, ptr) \ @@ -363,31 +327,4 @@ extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strlen_user(const char __user *str); extern __must_check long strnlen_user(const char __user *str, long n); -#else /* __ASSEMBLY__ */ - -#include -#include - -/* - * User access enabling/disabling macros. These are no-ops when UAO is - * present. 
- */ - .macro uaccess_disable_not_uao, tmp1 -alternative_if_not ARM64_ALT_PAN_NOT_UAO - nop -alternative_else - SET_PSTATE_PAN(1) -alternative_endif - .endm - - .macro uaccess_enable_not_uao, tmp1, tmp2 -alternative_if_not ARM64_ALT_PAN_NOT_UAO - nop -alternative_else - SET_PSTATE_PAN(0) -alternative_endif - .endm - -#endif /* __ASSEMBLY__ */ - #endif /* __ASM_UACCESS_H */ diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index 562f7ddb158b..c37202c0c838 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -281,9 +281,9 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) * Error-checking SWP macros implemented using ldxr{b}/stxr{b} */ #define __user_swpX_asm(data, addr, res, temp, B) \ -do { \ - uaccess_enable(); \ __asm__ __volatile__( \ + ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) \ "0: ldxr"B" %w2, [%3]\n" \ "1: stxr"B" %w0, %w1, [%3]\n" \ " cbz %w0, 2f\n" \ @@ -299,11 +299,11 @@ do { \ " .popsection" \ _ASM_EXTABLE(0b, 4b) \ _ASM_EXTABLE(1b, 4b) \ + ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) \ : "=&r" (res), "+r" (data), "=&r" (temp) \ : "r" (addr), "i" (-EAGAIN), "i" (-EFAULT) \ - : "memory"); \ - uaccess_disable(); \ -} while (0) + : "memory") #define __user_swp_asm(data, addr, res, temp) \ __user_swpX_asm(data, addr, res, temp, "") diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 08b5f18ba604..5d1cad3ce6d6 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -17,10 +17,10 @@ */ #include +#include #include #include #include -#include .text @@ -33,7 +33,8 @@ * Alignment fixed up by hardware. */ ENTRY(__clear_user) - uaccess_enable_not_uao x2, x3 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) mov x2, x1 // save the size for fixup return subs x1, x1, #8 b.mi 2f @@ -53,7 +54,8 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 b.mi 5f uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 - uaccess_disable_not_uao x2 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) ret ENDPROC(__clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 6505ec81f1da..0b90497d4424 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -16,11 +16,11 @@ #include +#include #include #include #include #include -#include /* * Copy from user space to a kernel buffer (alignment handled by the hardware) @@ -67,10 +67,12 @@ end .req x5 ENTRY(__arch_copy_from_user) - uaccess_enable_not_uao x3, x4 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" - uaccess_disable_not_uao x3 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) mov x0, #0 // Nothing to copy ret ENDPROC(__arch_copy_from_user) diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index 9b04ff3ab610..f7292dd08c84 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -18,11 +18,11 @@ #include +#include #include #include #include #include -#include /* * Copy from user space to user space (alignment handled by the hardware) @@ -68,10 +68,12 @@ end .req x5 ENTRY(__copy_in_user) - uaccess_enable_not_uao x3, x4 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ 
+ CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" - uaccess_disable_not_uao x3 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) mov x0, #0 ret ENDPROC(__copy_in_user) diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 8077e4f34d56..7a7efe255034 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -16,11 +16,11 @@ #include +#include #include #include #include #include -#include /* * Copy to user space from a kernel buffer (alignment handled by the hardware) @@ -66,10 +66,12 @@ end .req x5 ENTRY(__arch_copy_to_user) - uaccess_enable_not_uao x3, x4 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" - uaccess_disable_not_uao x3 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) mov x0, #0 ret ENDPROC(__arch_copy_to_user) From 12ac5b67a13dabdb6e9cd9c0811bcebe12a23b1c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 6 Sep 2016 16:40:23 +0100 Subject: [PATCH 0592/3220] UPSTREAM: arm64: barriers: introduce nops and __nops macros for NOP sequences NOP sequences tend to get used for padding out alternative sections and uarch-specific pipeline flushes in errata workarounds. This patch adds macros for generating these sequences as both inline asm blocks, but also as strings suitable for embedding in other asm blocks directly. Signed-off-by: Will Deacon Bug: 31432001 Change-Id: I7f82b677a065ede302a763d39ffcc3fef83f8fbe (cherry picked from commit f99a250cb6a3b301b101b4c0f5fcb80593bba6dc) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 9 +++++++++ arch/arm64/include/asm/barrier.h | 3 +++ 2 files changed, 12 insertions(+) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 9e8ac1e73457..bacc75b6c78c 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -98,6 +98,15 @@ dmb \opt .endm +/* + * NOP sequence + */ + .macro nops, num + .rept \num + nop + .endr + .endm + /* * Emit an entry into the exception table */ diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 9622eb48f894..c5dbc5cb8f10 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -20,6 +20,9 @@ #ifndef __ASSEMBLY__ +#define __nops(n) ".rept " #n "\nnop\n.endr\n" +#define nops(n) asm volatile(__nops(n)) + #define sev() asm volatile("sev" : : : "memory") #define wfe() asm volatile("wfe" : : : "memory") #define wfi() asm volatile("wfi" : : : "memory") From 50874355b1f54648e28504df38e772f9e52bc2f9 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 7 Sep 2016 11:07:08 +0100 Subject: [PATCH 0593/3220] UPSTREAM: arm64: alternative: add auto-nop infrastructure In some cases, one side of an alternative sequence is simply a number of NOPs used to balance the other side. Keeping track of this manually is tedious, and the presence of large chains of NOPs makes the code more painful to read than necessary. To ameliorate matters, this patch adds a new alternative_else_nop_endif, which automatically balances an alternative sequence with a trivial NOP sled. In many cases, we would like a NOP-sled in the default case, and instructions patched in in the presence of a feature. 
To enable the NOPs to be generated automatically for this case, this
patch also adds a new alternative_if, and updates alternative_else and
alternative_endif to work with either alternative_if or
alternative_if_not.

Cc: Andre Przywara
Cc: Catalin Marinas
Cc: Dave Martin
Cc: James Morse
Signed-off-by: Mark Rutland
[will: use new nops macro to generate nop sequences]
Signed-off-by: Will Deacon
Bug: 31432001
Change-Id: I28d8aae073e113048577c41cfe27c91215fb4cf3
(cherry picked from commit 792d47379f4d4c76692f1795f33d38582f8907fa)
Signed-off-by: Sami Tolvanen
---
 arch/arm64/include/asm/alternative.h | 72 +++++++++++++++++++++-------
 1 file changed, 54 insertions(+), 18 deletions(-)

diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
index 8746ff6abd77..55101bd86b98 100644
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -2,6 +2,7 @@
 #define __ASM_ALTERNATIVE_H
 
 #include
+#include
 
 #ifndef __ASSEMBLY__
 
@@ -90,24 +91,15 @@
 .endm
 
 /*
- * Begin an alternative code sequence.
+ * Alternative sequences
  *
- * The code that follows this macro will be assembled and linked as
- * normal. There are no restrictions on this code.
- */
-.macro alternative_if_not cap
- .pushsection .altinstructions, "a"
- altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f
- .popsection
-661:
-.endm
-
-/*
- * Provide the alternative code sequence.
+ * The code for the case where the capability is not present will be
+ * assembled and linked as normal. There are no restrictions on this
+ * code.
  *
- * The code that follows this macro is assembled into a special
- * section to be used for dynamic patching. Code that follows this
- * macro must:
+ * The code for the case where the capability is present will be
+ * assembled into a special section to be used for dynamic patching.
+ * Code for that case must:
  *
  * 1. Be exactly the same length (in bytes) as the default code
  *    sequence.
@@ -116,8 +108,38 @@
  *    alternative sequence it is defined in (branches into an
  *    alternative sequence are not fixed up).
  */
+
+/*
+ * Begin an alternative code sequence.
+ */
+.macro alternative_if_not cap
+ .set .Lasm_alt_mode, 0
+ .pushsection .altinstructions, "a"
+ altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f
+ .popsection
+661:
+.endm
+
+.macro alternative_if cap
+ .set .Lasm_alt_mode, 1
+ .pushsection .altinstructions, "a"
+ altinstruction_entry 663f, 661f, \cap, 664f-663f, 662f-661f
+ .popsection
+ .pushsection .altinstr_replacement, "ax"
+ .align 2 /* So GAS knows label 661 is suitably aligned */
+661:
+.endm
+
+/*
+ * Provide the other half of the alternative code sequence.
+ */
 .macro alternative_else
-662: .pushsection .altinstr_replacement, "ax"
+662:
+ .if .Lasm_alt_mode==0
+ .pushsection .altinstr_replacement, "ax"
+ .else
+ .popsection
+ .endif
663:
 .endm
 
@@ -125,11 +147,25 @@
  * Complete an alternative code sequence.
  */
 .macro alternative_endif
-664: .popsection
+664:
+ .if .Lasm_alt_mode==0
+ .popsection
+ .endif
 .org . - (664b-663b) + (662b-661b)
 .org . - (662b-661b) + (664b-663b)
 .endm
 
+/*
+ * Provides a trivial alternative or default sequence consisting solely
+ * of NOPs. The number of NOPs is chosen automatically to match the
+ * previous case.
+ */ +.macro alternative_else_nop_endif +alternative_else + nops (662b-661b) / AARCH64_INSN_SIZE +alternative_endif +.endm + #define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...) \ alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) From a858462e1f9fdd5e67cf4cc94fa3fc3fa9846228 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 14:58:21 +0100 Subject: [PATCH 0594/3220] BACKPORT: arm64: Factor out PAN enabling/disabling into separate uaccess_* macros This patch moves the directly coded alternatives for turning PAN on/off into separate uaccess_{enable,disable} macros or functions. The asm macros take a few arguments which will be used in subsequent patches. Note that any (unlikely) access that the compiler might generate between uaccess_enable() and uaccess_disable(), other than those explicitly specified by the user access code, will not be protected by PAN. Cc: Will Deacon Cc: James Morse Cc: Kees Cook Reviewed-by: Mark Rutland Signed-off-by: Catalin Marinas Bug: 31432001 Change-Id: I75a410139d0756edab3210ee091fa5d047a22e04 (cherry picked from commit bd38967d406fb4f9fca67d612db71b5d74cfb0f5) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/futex.h | 17 +++--- arch/arm64/include/asm/uaccess.h | 79 ++++++++++++++++++++++++---- arch/arm64/kernel/armv8_deprecated.c | 11 ++-- arch/arm64/lib/clear_user.S | 11 ++-- arch/arm64/lib/copy_from_user.S | 11 ++-- arch/arm64/lib/copy_in_user.S | 11 ++-- arch/arm64/lib/copy_to_user.S | 11 ++-- 7 files changed, 93 insertions(+), 58 deletions(-) diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index f2585cdd32c2..85c4a8981d47 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -21,15 +21,12 @@ #include #include -#include -#include #include -#include #define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \ +do { \ + uaccess_enable(); \ asm volatile( \ - ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ " prfm pstl1strm, %2\n" \ "1: ldxr %w1, %2\n" \ insn "\n" \ @@ -44,11 +41,11 @@ " .popsection\n" \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ - ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \ : "r" (oparg), "Ir" (-EFAULT) \ - : "memory") + : "memory"); \ + uaccess_disable(); \ +} while (0) static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) @@ -118,8 +115,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; + uaccess_enable(); asm volatile("// futex_atomic_cmpxchg_inatomic\n" -ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" " sub %w3, %w1, %w4\n" @@ -134,10 +131,10 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " .popsection\n" _ASM_EXTABLE(1b, 4b) _ASM_EXTABLE(2b, 4b) -ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) : "memory"); + uaccess_disable(); *uval = val; return ret; diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index c3d445b42351..8259eded53dc 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -18,16 +18,19 @@ #ifndef __ASM_UACCESS_H #define __ASM_UACCESS_H +#include +#include + +#ifndef __ASSEMBLY__ + /* * User space memory access functions */ #include 
#include -#include #include #include -#include #include #include #include @@ -123,6 +126,44 @@ static inline void set_fs(mm_segment_t fs) " .long (" #from " - .), (" #to " - .)\n" \ " .popsection\n" +/* + * User access enabling/disabling. + */ +#define __uaccess_disable(alt) \ +do { \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ + CONFIG_ARM64_PAN)); \ +} while (0) + +#define __uaccess_enable(alt) \ +do { \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ + CONFIG_ARM64_PAN)); \ +} while (0) + +static inline void uaccess_disable(void) +{ + __uaccess_disable(ARM64_HAS_PAN); +} + +static inline void uaccess_enable(void) +{ + __uaccess_enable(ARM64_HAS_PAN); +} + +/* + * These functions are no-ops when UAO is present. + */ +static inline void uaccess_disable_not_uao(void) +{ + __uaccess_disable(ARM64_ALT_PAN_NOT_UAO); +} + +static inline void uaccess_enable_not_uao(void) +{ + __uaccess_enable(ARM64_ALT_PAN_NOT_UAO); +} + /* * The "__xxx" versions of the user access functions do not verify the address * space - it must have been done previously with a separate "access_ok()" @@ -150,8 +191,7 @@ static inline void set_fs(mm_segment_t fs) do { \ unsigned long __gu_val; \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ @@ -172,9 +212,8 @@ do { \ default: \ BUILD_BUG(); \ } \ + uaccess_disable_not_uao(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ } while (0) #define __get_user(x, ptr) \ @@ -219,8 +258,7 @@ do { \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ @@ -241,8 +279,7 @@ do { \ default: \ BUILD_BUG(); \ } \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_disable_not_uao(); \ } while (0) #define __put_user(x, ptr) \ @@ -327,4 +364,26 @@ extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strlen_user(const char __user *str); extern __must_check long strnlen_user(const char __user *str, long n); +#else /* __ASSEMBLY__ */ + +#include + +/* + * User access enabling/disabling macros. These are no-ops when UAO is + * present. 
+ */ + .macro uaccess_disable_not_uao, tmp1 +alternative_if ARM64_ALT_PAN_NOT_UAO + SET_PSTATE_PAN(1) +alternative_else_nop_endif + .endm + + .macro uaccess_enable_not_uao, tmp1, tmp2 +alternative_if ARM64_ALT_PAN_NOT_UAO + SET_PSTATE_PAN(0) +alternative_else_nop_endif + .endm + +#endif /* __ASSEMBLY__ */ + #endif /* __ASM_UACCESS_H */ diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index c37202c0c838..74dc6c1d97ee 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -14,7 +14,6 @@ #include #include -#include #include #include #include @@ -281,9 +280,9 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) * Error-checking SWP macros implemented using ldxr{b}/stxr{b} */ #define __user_swpX_asm(data, addr, res, temp, B) \ +do { \ + uaccess_enable(); \ __asm__ __volatile__( \ - ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ "0: ldxr"B" %w2, [%3]\n" \ "1: stxr"B" %w0, %w1, [%3]\n" \ " cbz %w0, 2f\n" \ @@ -299,11 +298,11 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) " .popsection" \ _ASM_EXTABLE(0b, 4b) \ _ASM_EXTABLE(1b, 4b) \ - ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ : "=&r" (res), "+r" (data), "=&r" (temp) \ : "r" (addr), "i" (-EAGAIN), "i" (-EFAULT) \ - : "memory") + : "memory"); \ + uaccess_disable(); \ +} while (0) #define __user_swp_asm(data, addr, res, temp) \ __user_swpX_asm(data, addr, res, temp, "") diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 5d1cad3ce6d6..d7150e30438a 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -17,10 +17,7 @@ */ #include -#include -#include -#include -#include +#include .text @@ -33,8 +30,7 @@ * Alignment fixed up by hardware. 
*/ ENTRY(__clear_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x2, x3 mov x2, x1 // save the size for fixup return subs x1, x1, #8 b.mi 2f @@ -54,8 +50,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 b.mi 5f uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x2 ret ENDPROC(__clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 0b90497d4424..90154f3f7f2a 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -16,11 +16,8 @@ #include -#include -#include #include -#include -#include +#include /* * Copy from user space to a kernel buffer (alignment handled by the hardware) @@ -67,12 +64,10 @@ end .req x5 ENTRY(__arch_copy_from_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 // Nothing to copy ret ENDPROC(__arch_copy_from_user) diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index f7292dd08c84..718b1c4e2f85 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -18,11 +18,8 @@ #include -#include -#include #include -#include -#include +#include /* * Copy from user space to user space (alignment handled by the hardware) @@ -68,12 +65,10 @@ end .req x5 ENTRY(__copy_in_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 ret ENDPROC(__copy_in_user) diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 7a7efe255034..e99e31c9acac 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -16,11 +16,8 @@ #include -#include -#include #include -#include -#include +#include /* * Copy to user space from a kernel buffer (alignment handled by the hardware) @@ -66,12 +63,10 @@ end .req x5 ENTRY(__arch_copy_to_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 ret ENDPROC(__arch_copy_to_user) From 4fe5f2cb94341423c484483625215628134906ca Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 15:48:55 +0100 Subject: [PATCH 0595/3220] BACKPORT: arm64: Factor out TTBR0_EL1 post-update workaround into a specific asm macro This patch takes the errata workaround code out of cpu_do_switch_mm into a dedicated post_ttbr0_update_workaround macro which will be reused in a subsequent patch. 
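For illustration, the macro's effect can be sketched in C using the
alternatives framework (a hedged sketch only: the helper name is made
up, and it assumes the ALTERNATIVE() macro and the
ARM64_WORKAROUND_CAVIUM_27456 capability bit already used elsewhere in
this series):

static inline void post_ttbr0_update_workaround_sketch(void)
{
	/*
	 * Three NOPs by default; the I-cache invalidation sequence is
	 * patched in on CPUs affected by Cavium erratum 27456. Both
	 * sides are three instructions, satisfying the equal-length
	 * rule for alternatives.
	 */
	asm(ALTERNATIVE("nop\n\tnop\n\tnop",
			"ic iallu\n\tdsb nsh\n\tisb",
			ARM64_WORKAROUND_CAVIUM_27456,
			CONFIG_CAVIUM_ERRATUM_27456));
}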
Cc: Will Deacon Cc: James Morse Cc: Kees Cook Reviewed-by: Mark Rutland Signed-off-by: Catalin Marinas Bug: 31432001 Change-Id: I2b45b11ab7390c3545b9e162532109c1526bef14 (cherry picked from commit f33bcf03e6079668da6bf4eec4a7dcf9289131d0) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 13 +++++++++++++ arch/arm64/mm/proc.S | 11 +---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index bacc75b6c78c..7b2a8925ac86 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -371,4 +371,17 @@ alternative_endif movk \reg, :abs_g0_nc:\val .endm +/* + * Errata workaround post TTBR0_EL1 update. + */ + .macro post_ttbr0_update_workaround +#ifdef CONFIG_CAVIUM_ERRATUM_27456 +alternative_if ARM64_WORKAROUND_CAVIUM_27456 + ic iallu + dsb nsh + isb +alternative_else_nop_endif +#endif + .endm + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 9f6deacf41d2..765713702625 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -136,17 +136,8 @@ ENTRY(cpu_do_switch_mm) bfi x0, x1, #48, #16 // set the ASID msr ttbr0_el1, x0 // set TTBR0 isb -alternative_if_not ARM64_WORKAROUND_CAVIUM_27456 + post_ttbr0_update_workaround ret - nop - nop - nop -alternative_else - ic iallu - dsb nsh - isb - ret -alternative_endif ENDPROC(cpu_do_switch_mm) .pushsection ".idmap.text", "ax" From b83fbf1e77257a9436b27f487925c7d3347f901f Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 16:53:00 +0100 Subject: [PATCH 0596/3220] BACKPORT: arm64: Introduce uaccess_{disable,enable} functionality based on TTBR0_EL1 This patch adds the uaccess macros/functions to disable access to user space by setting TTBR0_EL1 to a reserved zeroed page. Since the value written to TTBR0_EL1 must be a physical address, for simplicity this patch introduces a reserved_ttbr0 page at a constant offset from swapper_pg_dir. The uaccess_disable code uses the ttbr1_el1 value adjusted by the reserved_ttbr0 offset. Enabling access to user is done by restoring TTBR0_EL1 with the value from the struct thread_info ttbr0 variable. Interrupts must be disabled during the uaccess_ttbr0_enable code to ensure the atomicity of the thread_info.ttbr0 read and TTBR0_EL1 write. This patch also moves the get_thread_info asm macro from entry.S to assembler.h for reuse in the uaccess_ttbr0_* macros. 
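As a rough usage sketch (hypothetical code, not from this patch; it
assumes the uaccess_enable()/uaccess_disable() helpers introduced
earlier in this series and omits the exception table fixup a real user
accessor needs), every user access window is bracketed so that
TTBR0_EL1 points at the user page tables only while they are in use:

static inline unsigned long load_user_word_sketch(const unsigned long __user *src)
{
	unsigned long val;

	uaccess_enable();	/* restore thread_info->ttbr0 into TTBR0_EL1 */
	asm volatile("ldtr %0, [%1]"	/* unprivileged load, no fault fixup here */
		     : "=r" (val) : "r" (src));
	uaccess_disable();	/* TTBR0_EL1 back to the reserved zeroed page */

	return val;
}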
Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Bug: 31432001 Change-Id: I54ada623160cb47f5762e0e39a5e84a75252dbfd (cherry picked from commit 4b65a5db362783ab4b04ca1c1d2ad70ed9b0ba2a) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 16 ++++ arch/arm64/include/asm/cpufeature.h | 6 ++ arch/arm64/include/asm/kernel-pgtable.h | 7 ++ arch/arm64/include/asm/thread_info.h | 3 + arch/arm64/include/asm/uaccess.h | 108 ++++++++++++++++++++++-- arch/arm64/kernel/asm-offsets.c | 3 + arch/arm64/kernel/cpufeature.c | 1 + arch/arm64/kernel/entry.S | 4 - arch/arm64/kernel/head.S | 6 +- arch/arm64/kernel/vmlinux.lds.S | 5 ++ 10 files changed, 146 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 7b2a8925ac86..d8855ca6068a 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -53,6 +53,15 @@ msr daifclr, #2 .endm + .macro save_and_disable_irq, flags + mrs \flags, daif + msr daifset, #2 + .endm + + .macro restore_irq, flags + msr daif, \flags + .endm + /* * Enable and disable debug exceptions. */ @@ -371,6 +380,13 @@ alternative_endif movk \reg, :abs_g0_nc:\val .endm +/* + * Return the current thread_info. + */ + .macro get_thread_info, rd + mrs \rd, sp_el0 + .endm + /* * Errata workaround post TTBR0_EL1 update. */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 727e594ac5c2..f125c03ab2e1 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -188,6 +188,12 @@ static inline bool system_supports_mixed_endian_el0(void) return id_aa64mmfr0_mixed_endian_el0(read_system_reg(SYS_ID_AA64MMFR0_EL1)); } +static inline bool system_uses_ttbr0_pan(void) +{ + return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) && + !cpus_have_cap(ARM64_HAS_PAN); +} + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 7e51d1b57c0c..7803343e5881 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -19,6 +19,7 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H +#include #include /* @@ -54,6 +55,12 @@ #define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE) #define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE) +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +#define RESERVED_TTBR0_SIZE (PAGE_SIZE) +#else +#define RESERVED_TTBR0_SIZE (0) +#endif + /* Initial memory map size */ #if ARM64_SWAPPER_USES_SECTION_MAPS #define SWAPPER_BLOCK_SHIFT SECTION_SHIFT diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index abd64bd1f6d9..794d22603f04 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -48,6 +48,9 @@ struct thread_info { unsigned long flags; /* low level flags */ mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + u64 ttbr0; /* saved TTBR0_EL1 */ +#endif int preempt_count; /* 0 => preemptable, <0 => bug */ int cpu; /* cpu */ }; diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 8259eded53dc..955c6e58a624 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -19,6 +19,7 @@ #define __ASM_UACCESS_H #include +#include #include #ifndef __ASSEMBLY__ @@ -129,16 +130,71 @@ static inline void set_fs(mm_segment_t fs) /* * User access 
enabling/disabling. */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +static inline void __uaccess_ttbr0_disable(void) +{ + unsigned long ttbr; + + /* reserved_ttbr0 placed at the end of swapper_pg_dir */ + ttbr = read_sysreg(ttbr1_el1) + SWAPPER_DIR_SIZE; + write_sysreg(ttbr, ttbr0_el1); + isb(); +} + +static inline void __uaccess_ttbr0_enable(void) +{ + unsigned long flags; + + /* + * Disable interrupts to avoid preemption between reading the 'ttbr0' + * variable and the MSR. A context switch could trigger an ASID + * roll-over and an update of 'ttbr0'. + */ + local_irq_save(flags); + write_sysreg(current_thread_info()->ttbr0, ttbr0_el1); + isb(); + local_irq_restore(flags); +} + +static inline bool uaccess_ttbr0_disable(void) +{ + if (!system_uses_ttbr0_pan()) + return false; + __uaccess_ttbr0_disable(); + return true; +} + +static inline bool uaccess_ttbr0_enable(void) +{ + if (!system_uses_ttbr0_pan()) + return false; + __uaccess_ttbr0_enable(); + return true; +} +#else +static inline bool uaccess_ttbr0_disable(void) +{ + return false; +} + +static inline bool uaccess_ttbr0_enable(void) +{ + return false; +} +#endif + #define __uaccess_disable(alt) \ do { \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ - CONFIG_ARM64_PAN)); \ + if (!uaccess_ttbr0_disable()) \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ + CONFIG_ARM64_PAN)); \ } while (0) #define __uaccess_enable(alt) \ do { \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ - CONFIG_ARM64_PAN)); \ + if (uaccess_ttbr0_enable()) \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ + CONFIG_ARM64_PAN)); \ } while (0) static inline void uaccess_disable(void) @@ -369,16 +425,56 @@ extern __must_check long strnlen_user(const char __user *str, long n); #include /* - * User access enabling/disabling macros. These are no-ops when UAO is - * present. + * User access enabling/disabling macros. + */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + .macro __uaccess_ttbr0_disable, tmp1 + mrs \tmp1, ttbr1_el1 // swapper_pg_dir + add \tmp1, \tmp1, #SWAPPER_DIR_SIZE // reserved_ttbr0 at the end of swapper_pg_dir + msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 + isb + .endm + + .macro __uaccess_ttbr0_enable, tmp1 + get_thread_info \tmp1 + ldr \tmp1, [\tmp1, #TSK_TI_TTBR0] // load saved TTBR0_EL1 + msr ttbr0_el1, \tmp1 // set the non-PAN TTBR0_EL1 + isb + .endm + + .macro uaccess_ttbr0_disable, tmp1 +alternative_if_not ARM64_HAS_PAN + __uaccess_ttbr0_disable \tmp1 +alternative_else_nop_endif + .endm + + .macro uaccess_ttbr0_enable, tmp1, tmp2 +alternative_if_not ARM64_HAS_PAN + save_and_disable_irq \tmp2 // avoid preemption + __uaccess_ttbr0_enable \tmp1 + restore_irq \tmp2 +alternative_else_nop_endif + .endm +#else + .macro uaccess_ttbr0_disable, tmp1 + .endm + + .macro uaccess_ttbr0_enable, tmp1, tmp2 + .endm +#endif + +/* + * These macros are no-ops when UAO is present. 
*/ .macro uaccess_disable_not_uao, tmp1 + uaccess_ttbr0_disable \tmp1 alternative_if ARM64_ALT_PAN_NOT_UAO SET_PSTATE_PAN(1) alternative_else_nop_endif .endm .macro uaccess_enable_not_uao, tmp1, tmp2 + uaccess_ttbr0_enable \tmp1, \tmp2 alternative_if ARM64_ALT_PAN_NOT_UAO SET_PSTATE_PAN(0) alternative_else_nop_endif diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 087cf9a65359..40ef661bd3a5 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -38,6 +38,9 @@ int main(void) DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + DEFINE(TSK_TI_TTBR0, offsetof(struct thread_info, ttbr0)); +#endif BLANK(); DEFINE(THREAD_CPU_CONTEXT, offsetof(struct task_struct, thread.cpu_context)); BLANK(); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 7566cad9fa1d..40ee3f2933e7 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -43,6 +43,7 @@ unsigned int compat_elf_hwcap2 __read_mostly; #endif DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); +EXPORT_SYMBOL(cpu_hwcaps); #define __ARM64_FTR_BITS(SIGNED, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \ { \ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 533e1c9fd5a6..8eb8eb085036 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -190,10 +190,6 @@ alternative_endif eret // return to kernel .endm - .macro get_thread_info, rd - mrs \rd, sp_el0 - .endm - .macro irq_stack_entry mov x19, sp // preserve the original sp diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 16c62dea7934..461d6cc258dd 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -322,14 +322,14 @@ __create_page_tables: * dirty cache lines being evicted. */ mov x0, x25 - add x1, x26, #SWAPPER_DIR_SIZE + add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE bl __inval_cache_range /* * Clear the idmap and swapper page tables. */ mov x0, x25 - add x6, x26, #SWAPPER_DIR_SIZE + add x6, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE 1: stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 @@ -407,7 +407,7 @@ __create_page_tables: * tables again to remove any speculatively loaded cache lines. */ mov x0, x25 - add x1, x26, #SWAPPER_DIR_SIZE + add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE dmb sy bl __inval_cache_range diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index c241ea5359b9..f1d6c49dcc5f 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -185,6 +185,11 @@ SECTIONS swapper_pg_dir = .; . += SWAPPER_DIR_SIZE; +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + reserved_ttbr0 = .; + . += RESERVED_TTBR0_SIZE; +#endif + _end = .; STABS_DEBUG From 02ef7a8c3c9dedda83188574cd4d3071d307b814 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 2 Sep 2016 14:54:03 +0100 Subject: [PATCH 0597/3220] BACKPORT: arm64: Disable TTBR0_EL1 during normal kernel execution When the TTBR0 PAN feature is enabled, the kernel entry points need to disable access to TTBR0_EL1. The PAN status of the interrupted context is stored as part of the saved pstate, reusing the PSR_PAN_BIT (22). Restoring access to TTBR0_EL1 is done on exception return if returning to user or returning to a context where PAN was disabled. 
Context switching via switch_mm() must defer the update of TTBR0_EL1 until a return to user or an explicit uaccess_enable() call. Special care needs to be taken for two cases where TTBR0_EL1 is set outside the normal kernel context switch operation: EFI run-time services (via efi_set_pgd) and CPU suspend (via cpu_(un)install_idmap). Code has been added to avoid deferred TTBR0_EL1 switching as in switch_mm() and restore the reserved TTBR0_EL1 when uninstalling the special TTBR0_EL1. User cache maintenance (user_cache_maint_handler and __flush_cache_user_range) needs the TTBR0_EL1 re-instated since the operations are performed by user virtual address. This patch also removes a stale comment on the switch_mm() function. Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Bug: 31432001 Change-Id: I85a49f70e13b153b9903851edf56f6531c14e6de (cherry picked from commit 39bc88e5e38e9b213bd7d833ce0df6ec029761ad) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/efi.h | 26 +++++++++++- arch/arm64/include/asm/mmu_context.h | 53 ++++++++++++++++------- arch/arm64/kernel/entry.S | 63 ++++++++++++++++++++++++++++ arch/arm64/kernel/setup.c | 9 ++++ arch/arm64/mm/cache.S | 6 ++- arch/arm64/mm/context.c | 7 +++- 6 files changed, 147 insertions(+), 17 deletions(-) diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 8e88a696c9cb..932f5a56d1a6 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -1,6 +1,7 @@ #ifndef _ASM_EFI_H #define _ASM_EFI_H +#include #include #include #include @@ -69,7 +70,30 @@ int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); static inline void efi_set_pgd(struct mm_struct *mm) { - switch_mm(NULL, mm, NULL); + __switch_mm(mm); + + if (system_uses_ttbr0_pan()) { + if (mm != current->active_mm) { + /* + * Update the current thread's saved ttbr0 since it is + * restored as part of a return from exception. Set + * the hardware TTBR0_EL1 using cpu_switch_mm() + * directly to enable potential errata workarounds. + */ + update_saved_ttbr0(current, mm); + cpu_switch_mm(mm->pgd, mm); + } else { + /* + * Defer the switch to the current thread's TTBR0_EL1 + * until uaccess_enable(). Restore the current + * thread's saved ttbr0 corresponding to its active_mm + * (if different from init_mm). + */ + cpu_set_reserved_ttbr0(); + if (current->active_mm != &init_mm) + update_saved_ttbr0(current, current->active_mm); + } + } } void efi_virtmap_load(void); diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index a00f7cf35bbd..e53d30c6f779 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -113,7 +114,7 @@ static inline void cpu_uninstall_idmap(void) local_flush_tlb_all(); cpu_set_default_tcr_t0sz(); - if (mm != &init_mm) + if (mm != &init_mm && !system_uses_ttbr0_pan()) cpu_switch_mm(mm->pgd, mm); } @@ -173,21 +174,27 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { } -/* - * This is the actual mm switch as far as the scheduler - * is concerned. No registers are touched. We avoid - * calling the CPU specific function when the mm hasn't - * actually changed. 
- */ -static inline void -switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +static inline void update_saved_ttbr0(struct task_struct *tsk, + struct mm_struct *mm) +{ + if (system_uses_ttbr0_pan()) { + BUG_ON(mm->pgd == swapper_pg_dir); + task_thread_info(tsk)->ttbr0 = + virt_to_phys(mm->pgd) | ASID(mm) << 48; + } +} +#else +static inline void update_saved_ttbr0(struct task_struct *tsk, + struct mm_struct *mm) +{ +} +#endif + +static inline void __switch_mm(struct mm_struct *next) { unsigned int cpu = smp_processor_id(); - if (prev == next) - return; - /* * init_mm.pgd does not contain any user mappings and it is always * active for kernel addresses in TTBR1. Just set the reserved TTBR0. @@ -200,7 +207,25 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, check_and_switch_context(next, cpu); } +static inline void +switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + if (prev != next) + __switch_mm(next); + + /* + * Update the saved TTBR0_EL1 of the scheduled-in task as the previous + * value may have not been initialised yet (activate_mm caller) or the + * ASID has changed since the last run (following the context switch + * of another thread of the same process). Avoid setting the reserved + * TTBR0_EL1 to swapper_pg_dir (init_mm; e.g. via idle_task_exit). + */ + if (next != &init_mm) + update_saved_ttbr0(tsk, next); +} + #define deactivate_mm(tsk,mm) do { } while (0) -#define activate_mm(prev,next) switch_mm(prev, next, NULL) +#define activate_mm(prev,next) switch_mm(prev, next, current) #endif diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8eb8eb085036..d6f10ab36c15 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -29,7 +29,9 @@ #include #include #include +#include #include +#include #include /* @@ -109,6 +111,32 @@ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Set the TTBR0 PAN bit in SPSR. When the exception is taken from + * EL0, there is no need to check the state of TTBR0_EL1 since + * accesses are always enabled. + * Note that the meaning of this bit differs from the ARMv8.1 PAN + * feature as all TTBR0_EL1 accesses are disabled, not just those to + * user mappings. + */ +alternative_if ARM64_HAS_PAN + b 1f // skip TTBR0 PAN +alternative_else_nop_endif + + .if \el != 0 + mrs x21, ttbr0_el1 + tst x21, #0xffff << 48 // Check for the reserved ASID + orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR + b.eq 1f // TTBR0 access already disabled + and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR + .endif + + __uaccess_ttbr0_disable x21 +1: +#endif + stp x22, x23, [sp, #S_PC] /* @@ -147,6 +175,40 @@ ldp x21, x22, [sp, #S_PC] // load ELR, SPSR .if \el == 0 ct_user_enter + .endif + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR + * PAN bit checking. + */ +alternative_if ARM64_HAS_PAN + b 2f // skip TTBR0 PAN +alternative_else_nop_endif + + .if \el != 0 + tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set + .endif + + __uaccess_ttbr0_enable x0 + + .if \el == 0 + /* + * Enable errata workarounds only if returning to user. The only + * workaround currently required for TTBR0_EL1 changes are for the + * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache + * corruption). 
+ */ + post_ttbr0_update_workaround + .endif +1: + .if \el != 0 + and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit + .endif +2: +#endif + + .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 #ifdef CONFIG_ARM64_ERRATUM_845719 @@ -168,6 +230,7 @@ alternative_else alternative_endif #endif .endif + msr elr_el1, x21 // set up the return data msr spsr_el1, x22 ldp x0, x1, [sp, #16 * 0] diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 29b8c247d56f..6591bf23422b 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -347,6 +347,15 @@ void __init setup_arch(char **cmdline_p) smp_init_cpus(); smp_build_mpidr_hash(); +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Make sure init_thread_info.ttbr0 always generates translation + * faults in case uaccess_enable() is inadvertently called by the init + * thread. + */ + init_thread_info.ttbr0 = virt_to_phys(empty_zero_page); +#endif + #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) conswitchp = &vga_con; diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 07d7352d7c38..3be2cda5dbda 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -23,6 +23,7 @@ #include #include #include +#include /* * flush_icache_range(start,end) @@ -48,6 +49,7 @@ ENTRY(flush_icache_range) * - end - virtual end address of region */ ENTRY(__flush_cache_user_range) + uaccess_ttbr0_enable x2, x3 dcache_line_size x2, x3 sub x3, x2, #1 bic x4, x0, x3 @@ -69,10 +71,12 @@ USER(9f, ic ivau, x4 ) // invalidate I line PoU dsb ish isb mov x0, #0 +1: + uaccess_ttbr0_disable x1 ret 9: mov x0, #-EFAULT - ret + b 1b ENDPROC(flush_icache_range) ENDPROC(__flush_cache_user_range) diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 7275628ba59f..25128089c386 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -182,7 +182,12 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: - cpu_switch_mm(mm->pgd, mm); + /* + * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when + * emulating PAN. + */ + if (!system_uses_ttbr0_pan()) + cpu_switch_mm(mm->pgd, mm); } static int asids_init(void) From 17080bcf2c6ec3e34b36f68d7e4b37b55ff9d16f Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 18:22:39 +0100 Subject: [PATCH 0598/3220] UPSTREAM: arm64: Handle faults caused by inadvertent user access with PAN enabled When TTBR0_EL1 is set to the reserved page, an erroneous kernel access to user space would generate a translation fault. This patch adds the checks for the software-set PSR_PAN_BIT to emulate a permission fault and report it accordingly. 
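Put differently, with TTBR0_EL1 pointing at the reserved zeroed page a
stray kernel access misses in translation rather than on permissions,
so the check amounts to the following (an illustrative helper only;
the real logic lives in is_permission_fault() in the hunk below):

static bool is_emulated_pan_fault_sketch(unsigned int esr, struct pt_regs *regs)
{
	/* the reserved TTBR0 page can only yield translation faults */
	if ((esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT)
		return false;

	/* which count as PAN hits only if the emulated PAN bit was set */
	return !!(regs->pstate & PSR_PAN_BIT);
}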
Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Bug: 31432001 Change-Id: I87e48f6075f84878e4d26d4fadf6eaac49d2cb4e (cherry picked from commit 786889636ad75296c213547d1ca656af4c59f390) Signed-off-by: Sami Tolvanen --- arch/arm64/mm/fault.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 0aacbd763e6b..bff871f5b10a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -246,13 +246,19 @@ out: return fault; } -static inline bool is_permission_fault(unsigned int esr) +static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs) { unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) || - (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM); + if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR) + return false; + + if (system_uses_ttbr0_pan()) + return fsc_type == ESR_ELx_FSC_FAULT && + (regs->pstate & PSR_PAN_BIT); + else + return fsc_type == ESR_ELx_FSC_PERM; } static bool is_el0_instruction_abort(unsigned int esr) @@ -293,7 +299,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - if (is_permission_fault(esr) && (addr < USER_DS)) { + if (addr < USER_DS && is_permission_fault(esr, regs)) { /* regs->orig_addr_limit may be 0 if we entered from EL0 */ if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); From 9fcab0c5b4280c6fc860b23ae6335712a4eb77c6 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 5 Jul 2016 12:25:15 +0100 Subject: [PATCH 0599/3220] UPSTREAM: arm64: xen: Enable user access before a privcmd hvc call Privcmd calls are issued by the userspace. The kernel needs to enable access to TTBR0_EL1 as the hypervisor would issue stage 1 translations to user memory via AT instructions. Since AT instructions are not affected by the PAN bit (ARMv8.1), we only need the explicit uaccess_enable/disable if the TTBR0 PAN option is enabled. Reviewed-by: Julien Grall Acked-by: Stefano Stabellini Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Bug: 31432001 Change-Id: I64d827923d869c1868702c8a18efa99ea91d3151 (cherry picked from commit 9cf09d68b89ae5fe0261dcc69464bcc676900af6) Signed-off-by: Sami Tolvanen --- arch/arm64/xen/hypercall.S | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index 8bbe9401f4f0..b96db5dafec4 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -49,6 +49,7 @@ #include #include +#include #include @@ -89,6 +90,20 @@ ENTRY(privcmd_call) mov x2, x3 mov x3, x4 mov x4, x5 + /* + * Privcmd calls are issued by the userspace. The kernel needs to + * enable access to TTBR0_EL1 as the hypervisor would issue stage 1 + * translations to user memory via AT instructions. Since AT + * instructions are not affected by the PAN bit (ARMv8.1), we only + * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation + * is enabled (it implies that hardware UAO and PAN disabled). + */ + uaccess_ttbr0_enable x6, x7 hvc XEN_IMM + + /* + * Disable userspace access from kernel once the hyp call completed. 
+ */ + uaccess_ttbr0_disable x6 ret ENDPROC(privcmd_call); From 7f89f7225caf80f6d23a3396ca4cc1fd3ccd4685 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 18:25:31 +0100 Subject: [PATCH 0600/3220] UPSTREAM: arm64: Enable CONFIG_ARM64_SW_TTBR0_PAN This patch adds the Kconfig option to enable support for TTBR0 PAN emulation. The option is default off because of a slight performance hit when enabled, caused by the additional TTBR0_EL1 switching during user access operations or exception entry/exit code. Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Bug: 31432001 Change-Id: I2f0b5f332e3c56ea0453ff69826525dec49f034b (cherry picked from commit ba42822af1c287f038aa550f3578c61c212a892e) Signed-off-by: Sami Tolvanen --- arch/arm64/Kconfig | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 095a3afb1e9d..a9089fa82221 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -706,6 +706,14 @@ config SETEND_EMULATION If unsure, say Y endif +config ARM64_SW_TTBR0_PAN + bool "Emulate Privileged Access Never using TTBR0_EL1 switching" + help + Enabling this option prevents the kernel from accessing + user-space memory directly by pointing TTBR0_EL1 to a reserved + zeroed area and reserved ASID. The user access routines + restore the valid TTBR0_EL1 temporarily. + menu "ARMv8.1 architectural features" config ARM64_HW_AFDBM From 5937c0601163e53fe5595692a8fedd428034b017 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 12 Dec 2016 13:50:26 +0000 Subject: [PATCH 0601/3220] UPSTREAM: arm64: Disable PAN on uaccess_enable() Commit 4b65a5db3627 ("arm64: Introduce uaccess_{disable,enable} functionality based on TTBR0_EL1") added conditional user access enable/disable. Unfortunately, a typo prevents the PAN bit from being cleared for user access functions. Restore the PAN functionality by adding the missing '!'. 
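In a simplified sketch of the resulting logic (the macro body is quoted
from the hunk below; "alt" is the macro's capability argument):
uaccess_ttbr0_enable() returns false when the TTBR0_EL1 switching
scheme is not in use, which is exactly the case that must fall back to
clearing PSTATE.PAN:

	/* buggy: the PAN-clearing alternative ran only on the TTBR0 path */
	if (uaccess_ttbr0_enable())
		asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, CONFIG_ARM64_PAN));

	/* fixed: clear PAN whenever the TTBR0 path is not taken */
	if (!uaccess_ttbr0_enable())
		asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, CONFIG_ARM64_PAN));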
Fixes: 4b65a5db3627 ("arm64: Introduce uaccess_{disable,enable} functionality based on TTBR0_EL1") Reported-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Catalin Marinas Bug: 31432001 Change-Id: If61cb6cc756affc7df7fa06213723a8b96eb1e80 (cherry picked from commit 75037120e62b58c536999eb23d70cfcb6d6c0bcc) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 955c6e58a624..efafdf39cb3b 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -192,7 +192,7 @@ do { \ #define __uaccess_enable(alt) \ do { \ - if (uaccess_ttbr0_enable()) \ + if (!uaccess_ttbr0_enable()) \ asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ CONFIG_ARM64_PAN)); \ } while (0) From d5dce523ee59d843a8292d887333fe2fd1c7277a Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 4 Jan 2017 09:11:04 -0800 Subject: [PATCH 0602/3220] ANDROID: configs: CONFIG_ARM64_SW_TTBR0_PAN=y Bug: 31432001 Change-Id: Ia72c3aa70a463d3a7f52b76e5082520aa328d29b Signed-off-by: Sami Tolvanen --- android/configs/android-recommended.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index 3fd0b13488a1..70aaae17ad29 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -8,6 +8,7 @@ # CONFIG_VT is not set CONFIG_ANDROID_TIMED_GPIO=y CONFIG_ARM_KERNMEM_PERMS=y +CONFIG_ARM64_SW_TTBR0_PAN=y CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y From d5ed6f6f5f92d98f1f4329ff1c5a19fbb0f25550 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 30 Dec 2016 17:42:32 -0600 Subject: [PATCH 0603/3220] net: socket: don't set sk_uid to garbage value in ->setattr() ->setattr() was recently implemented for socket files to sync the socket inode's uid to the new 'sk_uid' member of struct sock. It does this by copying over the ia_uid member of struct iattr. However, ia_uid is actually only valid when ATTR_UID is set in ia_valid, indicating that the uid is being changed, e.g. by chown. Other metadata operations such as chmod or utimes leave ia_uid uninitialized. Therefore, sk_uid could be set to a "garbage" value from the stack. Fix this by only copying the uid over when ATTR_UID is set. [cherry-pick of net e1a3a60a2ebe991605acb14cd58e39c0545e174e] Bug: 16355602 Change-Id: I20e53848e54282b72a388ce12bfa88da5e3e9efe Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.") Signed-off-by: Eric Biggers Tested-by: Lorenzo Colitti Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/socket.c b/net/socket.c index 1012991fb560..29c20061704e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -524,7 +524,7 @@ int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) { int err = simple_setattr(dentry, iattr); - if (!err) { + if (!err && (iattr->ia_valid & ATTR_UID)) { struct socket *sock = SOCKET_I(d_inode(dentry)); sock->sk->sk_uid = iattr->ia_uid; From 3e61a942c7eb5378c2a1c31bc3236995b3bdf7f2 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 21 Apr 2016 18:04:53 +0100 Subject: [PATCH 0604/3220] MIPS: Prevent "restoration" of MSA context in non-MSA kernels commit 6533af4d4831c421cd9aa4dce7cfc19a3514cc09 upstream. If a kernel doesn't support MSA context (ie.
CONFIG_CPU_HAS_MSA=n) then it will only keep 64 bits per FP register in thread context, and the calls to set_fpr64 in restore_msa_extcontext will overrun the end of the FP register context into the FCSR & MSACSR values. GCC 6.x has become smart enough to detect this & complain like so: arch/mips/kernel/signal.c: In function 'protected_restore_fp_context': ./arch/mips/include/asm/processor.h:114:17: error: array subscript is above array bounds [-Werror=array-bounds] fpr->val##width[FPR_IDX(width, idx)] = val; \ ~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~ ./arch/mips/include/asm/processor.h:118:1: note: in expansion of macro 'BUILD_FPR_ACCESS' BUILD_FPR_ACCESS(64) The only way to trigger this code to run would be for a program to set up an artificial extended MSA context structure following a sigframe & execute sigreturn. Whilst this doesn't allow a program to write to any state that it couldn't already, it makes little sense to allow this "restoration" of MSA context in a system that doesn't support MSA. Fix this by killing a program with SIGSYS if it tries something as crazy as "restoring" fake MSA context in this way, also fixing the build error & allowing for most of restore_msa_extcontext to be optimised out of kernels without support for MSA. Signed-off-by: Paul Burton Reported-by: Michal Toman Fixes: bf82cb30c7e5 ("MIPS: Save MSA extended context around signals") Tested-by: Aaro Koskinen Cc: James Hogan Cc: Michal Toman Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/13164/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/signal.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index bf792e2839a6..fc7c1f0b3d8d 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -195,6 +195,9 @@ static int restore_msa_extcontext(void __user *buf, unsigned int size) unsigned int csr; int i, err; + if (!config_enabled(CONFIG_CPU_HAS_MSA)) + return SIGSYS; + if (size != sizeof(*msa)) return -EINVAL; @@ -398,8 +401,8 @@ int protected_restore_fp_context(void __user *sc) } fp_done: - if (used & USED_EXTCONTEXT) - err |= restore_extcontext(sc_to_extcontext(sc)); + if (!err && (used & USED_EXTCONTEXT)) + err = restore_extcontext(sc_to_extcontext(sc)); return err ?: sig; } From 1475f79c87f07664f9af64f5d2e77d360855ac28 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Dec 2016 09:44:53 -0800 Subject: [PATCH 0605/3220] UPSTREAM: net: avoid signed overflows for SO_{SND|RCV}BUFFORCE (cherry picked from commit b98b0bc8c431e3ceb4b26b0dfc8db509518fb290) CAP_NET_ADMIN users should not be allowed to set negative sk_sndbuf or sk_rcvbuf values, as it can lead to various memory corruptions, crashes, OOM... Note that before commit 82981930125a ("net: cleanups in sock_setsockopt()"), the bug was even more serious, since SO_SNDBUF and SO_RCVBUF were vulnerable. This needs to be backported to all known linux kernels. Again, many thanks to syzkaller team for discovering this gem. Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Signed-off-by: David S. 
Miller Change-Id: I2b621c28c02267af5b34a379b2970fe5fb61a4f6 Bug: 33363517 --- net/core/sock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index d0f83260cddd..3334c4bf62cd 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -745,7 +745,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, val = min_t(u32, val, sysctl_wmem_max); set_sndbuf: sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF); + sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); /* Wake up sending tasks if we upped the value. */ sk->sk_write_space(sk); break; @@ -781,7 +781,7 @@ set_rcvbuf: * returning the value we actually used in getsockopt * is the most desirable behavior. */ - sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF); + sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); break; case SO_RCVBUFFORCE: From 56ea977b0fc3d2e7e68e5a93a2b44aa3e25e34ed Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 16 May 2016 17:28:16 +0800 Subject: [PATCH 0606/3220] UPSTREAM: netlink: Fix dump skb leak/double free (cherry picked from commit 92964c79b357efd980812c4de5c1fd2ec8bb5520) When we free cb->skb after a dump, we do it after releasing the lock. This means that a new dump could have started in the time being and we'll end up freeing their skb instead of ours. This patch saves the skb and module before we unlock so we free the right memory. Fixes: 16b304f3404f ("netlink: Eliminate kmalloc in netlink dump operation.") Reported-by: Baozeng Ding Signed-off-by: Herbert Xu Acked-by: Cong Wang Signed-off-by: David S. Miller Change-Id: Ie2db6a32a49686c6d22c4a88c251b288343c7813 Bug: 33393474 --- net/netlink/af_netlink.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 59651af8cc27..ff36e88d19d9 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2784,6 +2784,7 @@ static int netlink_dump(struct sock *sk) struct netlink_callback *cb; struct sk_buff *skb = NULL; struct nlmsghdr *nlh; + struct module *module; int len, err = -ENOBUFS; int alloc_min_size; int alloc_size; @@ -2863,9 +2864,11 @@ static int netlink_dump(struct sock *sk) cb->done(cb); nlk->cb_running = false; + module = cb->module; + skb = cb->skb; mutex_unlock(nlk->cb_mutex); - module_put(cb->module); - consume_skb(cb->skb); + module_put(module); + consume_skb(skb); return 0; errout_skb: From a11f71fcd94d5315bc6fbfe74ec6c5cbb19104d1 Mon Sep 17 00:00:00 2001 From: Philip Pettersson Date: Wed, 30 Nov 2016 14:55:36 -0800 Subject: [PATCH 0607/3220] UPSTREAM: packet: fix race condition in packet_set_ring (cherry picked from commit 84ac7260236a49c79eede91617700174c2c19b0c) When packet_set_ring creates a ring buffer it will initialize a struct timer_list if the packet version is TPACKET_V3. This value can then be raced by a different thread calling setsockopt to set the version to TPACKET_V1 before packet_set_ring has finished. This leads to a use-after-free on a function pointer in the struct timer_list when the socket is closed as the previously initialized timer will not be deleted. The bug is fixed by taking lock_sock(sk) in packet_setsockopt when changing the packet version while also taking the lock at the start of packet_set_ring. Fixes: f6fb8f100b80 ("af-packet: TPACKET_V3 flexible buffer implementation.") Signed-off-by: Philip Pettersson Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller Change-Id: Ice451620ecf2c2a5ba3709f45fbb5f3f5c5bb389 Bug: 33358926 --- net/packet/af_packet.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 992396aa635c..caffff01f410 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3577,19 +3577,25 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) - return -EBUSY; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; switch (val) { case TPACKET_V1: case TPACKET_V2: case TPACKET_V3: - po->tp_version = val; - return 0; + break; default: return -EINVAL; } + lock_sock(sk); + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { + ret = -EBUSY; + } else { + po->tp_version = val; + ret = 0; + } + release_sock(sk); + return ret; } case PACKET_RESERVE: { @@ -4071,6 +4077,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, /* Added to avoid minimal code churn */ struct tpacket_req *req = &req_u->req; + lock_sock(sk); /* Opening a Tx-ring is NOT supported in TPACKET_V3 */ if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) { WARN(1, "Tx-ring is not supported.\n"); @@ -4152,7 +4159,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, goto out; } - lock_sock(sk); /* Detach socket from network */ spin_lock(&po->bind_lock); @@ -4201,11 +4207,11 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, if (!tx_ring) prb_shutdown_retire_blk_timer(po, rb_queue); } - release_sock(sk); if (pg_vec) free_pg_vec(pg_vec, order, req->tp_block_nr); out: + release_sock(sk); return err; } From f4e6542320b9ce98a4598bf168b0b4d12c7c2b65 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 18 Nov 2016 22:13:00 +0100 Subject: [PATCH 0608/3220] UPSTREAM: l2tp: fix racy SOCK_ZAPPED flag check in l2tp_ip{,6}_bind() (cherry picked from commit 32c231164b762dddefa13af5a0101032c70b50ef) Lock socket before checking the SOCK_ZAPPED flag in l2tp_ip6_bind(). Without lock, a concurrent call could modify the socket flags between the sock_flag(sk, SOCK_ZAPPED) test and the lock_sock() call. This way, a socket could be inserted twice in l2tp_ip6_bind_table. Releasing it would then leave a stale pointer there, generating use-after-free errors when walking through the list or modifying adjacent entries. 
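The race window is easiest to see as a sketch of the old check-then-lock ordering against the fixed lock-then-check ordering (simplified from the diff below):

    /* old, racy: flag read outside the socket lock */
    if (!sock_flag(sk, SOCK_ZAPPED))        /* another thread may bind here */
            return -EINVAL;
    lock_sock(sk);
    /* ... insert into the bind table: the socket can now be added twice */

    /* fixed: the flag is read and acted on inside one critical section */
    lock_sock(sk);
    if (!sock_flag(sk, SOCK_ZAPPED))
            goto out_unlock;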
BUG: KASAN: use-after-free in l2tp_ip6_close+0x22e/0x290 at addr ffff8800081b0ed8 Write of size 8 by task syz-executor/10987 CPU: 0 PID: 10987 Comm: syz-executor Not tainted 4.8.0+ #39 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014 ffff880031d97838 ffffffff829f835b ffff88001b5a1640 ffff8800081b0ec0 ffff8800081b15a0 ffff8800081b6d20 ffff880031d97860 ffffffff8174d3cc ffff880031d978f0 ffff8800081b0e80 ffff88001b5a1640 ffff880031d978e0 Call Trace: [] dump_stack+0xb3/0x118 lib/dump_stack.c:15 [] kasan_object_err+0x1c/0x70 mm/kasan/report.c:156 [< inline >] print_address_description mm/kasan/report.c:194 [] kasan_report_error+0x1f6/0x4d0 mm/kasan/report.c:283 [< inline >] kasan_report mm/kasan/report.c:303 [] __asan_report_store8_noabort+0x3e/0x40 mm/kasan/report.c:329 [< inline >] __write_once_size ./include/linux/compiler.h:249 [< inline >] __hlist_del ./include/linux/list.h:622 [< inline >] hlist_del_init ./include/linux/list.h:637 [] l2tp_ip6_close+0x22e/0x290 net/l2tp/l2tp_ip6.c:239 [] inet_release+0xed/0x1c0 net/ipv4/af_inet.c:415 [] inet6_release+0x50/0x70 net/ipv6/af_inet6.c:422 [] sock_release+0x8d/0x1d0 net/socket.c:570 [] sock_close+0x16/0x20 net/socket.c:1017 [] __fput+0x28c/0x780 fs/file_table.c:208 [] ____fput+0x15/0x20 fs/file_table.c:244 [] task_work_run+0xf9/0x170 [] do_exit+0x85e/0x2a00 [] do_group_exit+0x108/0x330 [] get_signal+0x617/0x17a0 kernel/signal.c:2307 [] do_signal+0x7f/0x18f0 [] exit_to_usermode_loop+0xbf/0x150 arch/x86/entry/common.c:156 [< inline >] prepare_exit_to_usermode arch/x86/entry/common.c:190 [] syscall_return_slowpath+0x1a0/0x1e0 arch/x86/entry/common.c:259 [] entry_SYSCALL_64_fastpath+0xc4/0xc6 Object at ffff8800081b0ec0, in cache L2TP/IPv6 size: 1448 Allocated: PID = 10987 [ 1116.897025] [] save_stack_trace+0x16/0x20 [ 1116.897025] [] save_stack+0x46/0xd0 [ 1116.897025] [] kasan_kmalloc+0xad/0xe0 [ 1116.897025] [] kasan_slab_alloc+0x12/0x20 [ 1116.897025] [< inline >] slab_post_alloc_hook mm/slab.h:417 [ 1116.897025] [< inline >] slab_alloc_node mm/slub.c:2708 [ 1116.897025] [< inline >] slab_alloc mm/slub.c:2716 [ 1116.897025] [] kmem_cache_alloc+0xc8/0x2b0 mm/slub.c:2721 [ 1116.897025] [] sk_prot_alloc+0x69/0x2b0 net/core/sock.c:1326 [ 1116.897025] [] sk_alloc+0x38/0xae0 net/core/sock.c:1388 [ 1116.897025] [] inet6_create+0x2d7/0x1000 net/ipv6/af_inet6.c:182 [ 1116.897025] [] __sock_create+0x37b/0x640 net/socket.c:1153 [ 1116.897025] [< inline >] sock_create net/socket.c:1193 [ 1116.897025] [< inline >] SYSC_socket net/socket.c:1223 [ 1116.897025] [] SyS_socket+0xef/0x1b0 net/socket.c:1203 [ 1116.897025] [] entry_SYSCALL_64_fastpath+0x23/0xc6 Freed: PID = 10987 [ 1116.897025] [] save_stack_trace+0x16/0x20 [ 1116.897025] [] save_stack+0x46/0xd0 [ 1116.897025] [] kasan_slab_free+0x71/0xb0 [ 1116.897025] [< inline >] slab_free_hook mm/slub.c:1352 [ 1116.897025] [< inline >] slab_free_freelist_hook mm/slub.c:1374 [ 1116.897025] [< inline >] slab_free mm/slub.c:2951 [ 1116.897025] [] kmem_cache_free+0xc8/0x330 mm/slub.c:2973 [ 1116.897025] [< inline >] sk_prot_free net/core/sock.c:1369 [ 1116.897025] [] __sk_destruct+0x32b/0x4f0 net/core/sock.c:1444 [ 1116.897025] [] sk_destruct+0x44/0x80 net/core/sock.c:1452 [ 1116.897025] [] __sk_free+0x53/0x220 net/core/sock.c:1460 [ 1116.897025] [] sk_free+0x23/0x30 net/core/sock.c:1471 [ 1116.897025] [] sk_common_release+0x28c/0x3e0 ./include/net/sock.h:1589 [ 1116.897025] [] l2tp_ip6_close+0x1fe/0x290 net/l2tp/l2tp_ip6.c:243 [ 1116.897025] [] 
inet_release+0xed/0x1c0 net/ipv4/af_inet.c:415 [ 1116.897025] [] inet6_release+0x50/0x70 net/ipv6/af_inet6.c:422 [ 1116.897025] [] sock_release+0x8d/0x1d0 net/socket.c:570 [ 1116.897025] [] sock_close+0x16/0x20 net/socket.c:1017 [ 1116.897025] [] __fput+0x28c/0x780 fs/file_table.c:208 [ 1116.897025] [] ____fput+0x15/0x20 fs/file_table.c:244 [ 1116.897025] [] task_work_run+0xf9/0x170 [ 1116.897025] [] do_exit+0x85e/0x2a00 [ 1116.897025] [] do_group_exit+0x108/0x330 [ 1116.897025] [] get_signal+0x617/0x17a0 kernel/signal.c:2307 [ 1116.897025] [] do_signal+0x7f/0x18f0 [ 1116.897025] [] exit_to_usermode_loop+0xbf/0x150 arch/x86/entry/common.c:156 [ 1116.897025] [< inline >] prepare_exit_to_usermode arch/x86/entry/common.c:190 [ 1116.897025] [] syscall_return_slowpath+0x1a0/0x1e0 arch/x86/entry/common.c:259 [ 1116.897025] [] entry_SYSCALL_64_fastpath+0xc4/0xc6 Memory state around the buggy address: ffff8800081b0d80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff8800081b0e00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff8800081b0e80: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb ^ ffff8800081b0f00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8800081b0f80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== The same issue exists with l2tp_ip_bind() and l2tp_ip_bind_table. Fixes: c51ce49735c1 ("l2tp: fix oops in L2TP IP sockets for connect() AF_UNSPEC case") Reported-by: Baozeng Ding Reported-by: Andrey Konovalov Tested-by: Baozeng Ding Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller Change-Id: I74b0e6bf0d0a5e0e2f4d8a3c6e52ea75a572b114 Bug: 33753815 --- net/l2tp/l2tp_ip.c | 5 +++-- net/l2tp/l2tp_ip6.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index ec22078b0914..70a0f8083a3b 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -249,8 +249,6 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) int ret; int chk_addr_ret; - if (!sock_flag(sk, SOCK_ZAPPED)) - return -EINVAL; if (addr_len < sizeof(struct sockaddr_l2tpip)) return -EINVAL; if (addr->l2tp_family != AF_INET) @@ -265,6 +263,9 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) read_unlock_bh(&l2tp_ip_lock); lock_sock(sk); + if (!sock_flag(sk, SOCK_ZAPPED)) + goto out; + if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip)) goto out; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index c14c59f18c59..b70e70776563 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -264,8 +264,6 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) int addr_type; int err; - if (!sock_flag(sk, SOCK_ZAPPED)) - return -EINVAL; if (addr->l2tp_family != AF_INET6) return -EINVAL; if (addr_len < sizeof(*addr)) @@ -291,6 +289,9 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) lock_sock(sk); err = -EINVAL; + if (!sock_flag(sk, SOCK_ZAPPED)) + goto out_unlock; + if (sk->sk_state != TCP_CLOSE) goto out_unlock; From 7f18f0963d81a096e741cfc14ec9c2915f633e0a Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Mon, 9 Jan 2017 17:20:11 +0000 Subject: [PATCH 0609/3220] DEBUG: sched/fair: Fix missing sched_load_avg_cpu events update_cfs_rq_load_avg is called from update_blocked_averages without triggering the sched_load_avg_cpu event. Move the event trigger to inside update_cfs_rq_load_avg to avoid this missing event. 
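A sketch of the call paths involved (as described above, simplified) shows why relocating the tracepoint closes the gap:

    /*
     * update_load_avg() ------------.
     *                               +--> update_cfs_rq_load_avg()
     * update_blocked_averages() ----'              |
     *                                              +--> trace_sched_load_avg_cpu()
     *
     * Emitting the event inside update_cfs_rq_load_avg() covers both
     * callers; emitting it only from update_load_avg() missed updates
     * driven by update_blocked_averages().
     */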
Change-Id: I6c4f66f687a644e4e7f798db122d28a8f5919b7b Signed-off-by: Brendan Jackman --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b881cf81d79..aeb9b550470b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2726,6 +2726,8 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif + trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); + return decayed || removed; } @@ -2749,7 +2751,6 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (entity_is_task(se)) trace_sched_load_avg_task(task_of(se), &se->avg); - trace_sched_load_avg_cpu(cpu, cfs_rq); } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) From 1cb392e10307ba3ef7d9a602e59e54ae3b6399ad Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Tue, 10 Jan 2017 11:31:01 +0000 Subject: [PATCH 0610/3220] DEBUG: sched/fair: Fix sched_load_avg_cpu events for task_groups The current sched_load_avg_cpu event traces the load for any cfs_rq that is updated. This is not representative of the CPU load - instead we should only trace this event when the cfs_rq being updated is in the root_task_group. Change-Id: I345c2f13f6b5718cb4a89beb247f7887ce97ed6b Signed-off-by: Brendan Jackman --- kernel/sched/fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aeb9b550470b..7d4151601860 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2726,7 +2726,9 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); + /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */ + if (cfs_rq == &rq_of(cfs_rq)->cfs) + trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); return decayed || removed; } From 82a6fedb6ea53c9f3eb00ab1e0b839d3fcfe5fc2 Mon Sep 17 00:00:00 2001 From: Luca Stefani Date: Fri, 13 Jan 2017 16:09:57 +0100 Subject: [PATCH 0611/3220] arm64: kernel: Fix build warning * Issue: After da643dc .enable expects an int -> Change cpu_enable_uao prototype to int Change-Id: I581a1afd13cd90637e6a7a315ede2af8950b401f Signed-off-by: Luca Stefani --- arch/arm64/include/asm/processor.h | 2 +- arch/arm64/mm/fault.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 83d0aa97b577..4be934fde409 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -191,6 +191,6 @@ static inline void spin_lock_prefetch(const void *ptr) #endif int cpu_enable_pan(void *__unused); -void cpu_enable_uao(void *__unused); +int cpu_enable_uao(void *__unused); #endif /* __ASM_PROCESSOR_H */ diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 2c0a80ca3536..2581ede3075a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -658,8 +658,9 @@ int cpu_enable_pan(void *__unused) * We need to enable the feature at runtime (instead of adding it to * PSR_MODE_EL1h) as the feature may not be implemented by the cpu.
*/ -void cpu_enable_uao(void *__unused) +int cpu_enable_uao(void *__unused) { asm(SET_PSTATE_UAO(1)); + return 0; } #endif /* CONFIG_ARM64_UAO */ From 0ec03f8457996ad0067ae1f62b70a092f726bb95 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 13 Apr 2016 16:38:34 -0700 Subject: [PATCH 0612/3220] ANDROID: sdcardfs: override umask on mkdir and create The mode on files created on the lower fs should not be affected by the umask of the calling task's fs_struct. Instead, we create a copy and modify it as needed. This also lets us avoid the string shenanigans around .nomedia files. Bug: 27992761 Change-Id: Ia3a6e56c24c6e19b3b01c1827e46403bb71c2f4c Signed-off-by: Daniel Rosenberg --- fs/fs_struct.c | 1 + fs/sdcardfs/inode.c | 70 ++++++++++++++++++++++----------------------- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 7dca743b2ce1..005dcb401369 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -127,6 +127,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) } return fs; } +EXPORT_SYMBOL_GPL(copy_fs_struct); int unshare_fs_struct(void) { diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 2528da0d3ae1..4b140ba86955 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -19,6 +19,7 @@ */ #include "sdcardfs.h" +#include /* Do not directly use this function. Use OVERRIDE_CRED() instead. */ const struct cred * override_fsids(struct sdcardfs_sb_info* sbi) @@ -56,6 +57,8 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, struct dentry *lower_parent_dentry = NULL; struct path lower_path; const struct cred *saved_cred = NULL; + struct fs_struct *saved_fs; + struct fs_struct *copied_fs; if(!check_caller_access_to_name(dir, dentry->d_name.name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" @@ -74,6 +77,16 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, /* set last 16bytes of mode field to 0664 */ mode = (mode & S_IFMT) | 00664; + + /* temporarily change umask for lower fs write */ + saved_fs = current->fs; + copied_fs = copy_fs_struct(current->fs); + if (!copied_fs) { + err = -ENOMEM; + goto out_unlock; + } + current->fs = copied_fs; + current->fs->umask = 0; err = vfs_create(d_inode(lower_parent_dentry), lower_dentry, mode, want_excl); if (err) goto out; @@ -85,6 +98,9 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry)); out: + current->fs = saved_fs; + free_fs_struct(copied_fs); +out_unlock: unlock_dir(lower_parent_dentry); sdcardfs_put_lower_path(dentry, &lower_path); REVERT_CRED(saved_cred); @@ -245,11 +261,9 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); const struct cred *saved_cred = NULL; struct sdcardfs_inode_info *pi = SDCARDFS_I(dir); - char *page_buf; - char *nomedia_dir_name; - char *nomedia_fullpath; - int fullpath_namelen; int touch_err = 0; + struct fs_struct *saved_fs; + struct fs_struct *copied_fs; if(!check_caller_access_to_name(dir, dentry->d_name.name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" @@ -276,6 +290,16 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode /* set last 16bytes of mode field to 0775 */ mode = (mode & S_IFMT) | 00775; + + /* temporarily change umask for lower fs write */ + saved_fs = current->fs; + copied_fs = copy_fs_struct(current->fs); + if (!copied_fs) { + err = 
-ENOMEM; + goto out_unlock; + } + current->fs = copied_fs; + current->fs->umask = 0; err = vfs_mkdir(d_inode(lower_parent_dentry), lower_dentry, mode); if (err) @@ -316,42 +340,18 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode /* When creating /Android/data and /Android/obb, mark them as .nomedia */ if (make_nomedia_in_obb || ((pi->perm == PERM_ANDROID) && (!strcasecmp(dentry->d_name.name, "data")))) { - - page_buf = (char *)__get_free_page(GFP_KERNEL); - if (!page_buf) { - printk(KERN_ERR "sdcardfs: failed to allocate page buf\n"); - goto out; - } - - nomedia_dir_name = d_absolute_path(&lower_path, page_buf, PAGE_SIZE); - if (IS_ERR(nomedia_dir_name)) { - free_page((unsigned long)page_buf); - printk(KERN_ERR "sdcardfs: failed to get .nomedia dir name\n"); - goto out; - } - - fullpath_namelen = page_buf + PAGE_SIZE - nomedia_dir_name - 1; - fullpath_namelen += strlen("/.nomedia"); - nomedia_fullpath = kzalloc(fullpath_namelen + 1, GFP_KERNEL); - if (!nomedia_fullpath) { - free_page((unsigned long)page_buf); - printk(KERN_ERR "sdcardfs: failed to allocate .nomedia fullpath buf\n"); - goto out; - } - - strcpy(nomedia_fullpath, nomedia_dir_name); - free_page((unsigned long)page_buf); - strcat(nomedia_fullpath, "/.nomedia"); - touch_err = touch(nomedia_fullpath, 0664); + set_fs_pwd(current->fs, &lower_path); + touch_err = touch(".nomedia", 0664); if (touch_err) { - printk(KERN_ERR "sdcardfs: failed to touch(%s): %d\n", - nomedia_fullpath, touch_err); - kfree(nomedia_fullpath); + printk(KERN_ERR "sdcardfs: failed to create .nomedia in %s: %d\n", + lower_path.dentry->d_name.name, touch_err); goto out; } - kfree(nomedia_fullpath); } out: + current->fs = saved_fs; + free_fs_struct(copied_fs); +out_unlock: unlock_dir(lower_parent_dentry); sdcardfs_put_lower_path(dentry, &lower_path); out_revert: From cc16efd13c3015889efe0bfe4f2cb5f4020ec4ca Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 27 Apr 2016 15:31:29 -0700 Subject: [PATCH 0613/3220] ANDROID: sdcardfs: Check for other cases on path lookup This fixes a bug where the first lookup of a file or folder created under a different view would not be case insensitive. It will now search through for a case insensitive match if the initial lookup fails. 
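In outline, the fallback works as sketched here; find_ci_match is a hypothetical helper standing in for the inline d_subdirs scan in the diff below:

    err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name, 0, &lower_path);
    if (err == -ENOENT) {
            /* exact-case miss: retry with a case-insensitive sibling */
            struct dentry *match = find_ci_match(lower_dir_dentry, name);
            if (match) {
                    err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt,
                                          match->d_name.name, 0, &lower_path);
                    dput(match);
            }
    }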
Bug: 28024488 Change-Id: I4ff9ce297b9f2f9864b47540e740fd491c545229 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/lookup.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index a01b06a514fd..a127d05b5054 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -240,6 +240,28 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, /* Use vfs_path_lookup to check if the dentry exists or not */ err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name, 0, &lower_path); + /* check for other cases */ + if (err == -ENOENT) { + struct dentry *child; + struct dentry *match = NULL; + spin_lock(&lower_dir_dentry->d_lock); + list_for_each_entry(child, &lower_dir_dentry->d_subdirs, d_child) { + if (child && d_inode(child)) { + if (strcasecmp(child->d_name.name, name)==0) { + match = dget(child); + break; + } + } + } + spin_unlock(&lower_dir_dentry->d_lock); + if (match) { + err = vfs_path_lookup(lower_dir_dentry, + lower_dir_mnt, + match->d_name.name, 0, + &lower_path); + dput(match); + } + } /* no error: handle positive dentries */ if (!err) { From 96acdab2f4de226a253abbc6f7bab149bb82c4bf Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 16 Aug 2016 15:19:26 -0700 Subject: [PATCH 0614/3220] ANDROID: sdcardfs: Fix locking for permission fix up Iterating over d_subdirs requires taking d_lock. Removed several unneeded locks. Change-Id: I5b1588e54c7e6ee19b756d6705171c7f829e2650 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 6 ++---- fs/sdcardfs/inode.c | 2 -- fs/sdcardfs/lookup.c | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 41e0e11b3c35..bfe402b8cf32 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -111,15 +111,15 @@ void get_derived_permission(struct dentry *parent, struct dentry *dentry) void get_derive_permissions_recursive(struct dentry *parent) { struct dentry *dentry; + spin_lock(&parent->d_lock); list_for_each_entry(dentry, &parent->d_subdirs, d_child) { if (dentry->d_inode) { - mutex_lock(&dentry->d_inode->i_mutex); get_derived_permission(parent, dentry); fix_derived_permission(dentry->d_inode); get_derive_permissions_recursive(dentry); - mutex_unlock(&dentry->d_inode->i_mutex); } } + spin_unlock(&parent->d_lock); } /* main function for updating derived permission */ @@ -135,7 +135,6 @@ inline void update_derived_permission_lock(struct dentry *dentry) * 1. need to check whether the dentry is updated or not * 2.
remove the root dentry update */ - mutex_lock(&dentry->d_inode->i_mutex); if(IS_ROOT(dentry)) { //setup_default_pre_root_state(dentry->d_inode); } else { @@ -146,7 +145,6 @@ inline void update_derived_permission_lock(struct dentry *dentry) } } fix_derived_permission(dentry->d_inode); - mutex_unlock(&dentry->d_inode->i_mutex); } int need_graft_path(struct dentry *dentry) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 4b140ba86955..1a23c0cc8f58 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -513,11 +513,9 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, } /* At this point, not all dentry information has been moved, so * we pass along new_dentry for the name.*/ - mutex_lock(&d_inode(old_dentry)->i_mutex); get_derived_permission_new(new_dentry->d_parent, old_dentry, new_dentry); fix_derived_permission(d_inode(old_dentry)); get_derive_permissions_recursive(old_dentry); - mutex_unlock(&d_inode(old_dentry)->i_mutex); out: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); dput(lower_old_dir_dentry); diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index a127d05b5054..c74a7d1bc18e 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -387,11 +387,9 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, if (dentry->d_inode) { fsstack_copy_attr_times(dentry->d_inode, sdcardfs_lower_inode(dentry->d_inode)); - /* get drived permission */ - mutex_lock(&dentry->d_inode->i_mutex); + /* get derived permission */ get_derived_permission(parent, dentry); fix_derived_permission(dentry->d_inode); - mutex_unlock(&dentry->d_inode->i_mutex); } /* update parent directory's atime */ fsstack_copy_attr_atime(parent->d_inode, From 1fb590f2d5ada66a89b82c7657b3002892c966b6 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 10 May 2016 13:42:43 -0700 Subject: [PATCH 0615/3220] ANDROID: sdcardfs: Switch package list to RCU Switched the package id hashmap to use RCU. 
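The resulting discipline, roughly: lock-free readers, writers serialized on a mutex, and frees deferred past an RCU grace period. A simplified sketch, not the exact patch code:

    struct hashtable_entry *cur;
    appid_t ret = 0;

    /* reader: no lock taken, safe against concurrent removal */
    rcu_read_lock();
    hash_for_each_possible_rcu(package_to_appid, cur, hlist, hash)
            if (!strcasecmp(key, cur->key))
                    ret = atomic_read(&cur->value);
    rcu_read_unlock();

    /* writer (under sdcardfs_super_list_lock): unlink, wait, then free */
    hash_del_rcu(&cur->hlist);
    synchronize_rcu();      /* no reader can still hold a reference to cur */
    kfree(cur);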
Change-Id: I9fdcab279009005bf28536247d11e13babab0b93 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 3 +- fs/sdcardfs/packagelist.c | 212 +++++++++++++++++-------------------- fs/sdcardfs/sdcardfs.h | 2 +- 3 files changed, 99 insertions(+), 118 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index bfe402b8cf32..2a75ad873a7c 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -47,7 +47,6 @@ void setup_derived_state(struct inode *inode, perm_t perm, /* While renaming, there is a point where we want the path from dentry, but the name from newdentry */ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, struct dentry *newdentry) { - struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); struct sdcardfs_inode_info *info = SDCARDFS_I(dentry->d_inode); struct sdcardfs_inode_info *parent_info= SDCARDFS_I(parent->d_inode); appid_t appid; @@ -96,7 +95,7 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, st case PERM_ANDROID_DATA: case PERM_ANDROID_OBB: case PERM_ANDROID_MEDIA: - appid = get_appid(sbi->pkgl_id, newdentry->d_name.name); + appid = get_appid(newdentry->d_name.name); if (appid != 0) { info->d_uid = multiuser_get_uid(parent_info->userid, appid); } diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 9c3340528eee..f5a49c513568 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -29,26 +29,13 @@ #include -#define STRING_BUF_SIZE (512) - struct hashtable_entry { struct hlist_node hlist; - void *key; - unsigned int value; + const char *key; + atomic_t value; }; -struct sb_list { - struct super_block *sb; - struct list_head list; -}; - -struct packagelist_data { - DECLARE_HASHTABLE(package_to_appid,8); - struct mutex hashtable_lock; - -}; - -static struct packagelist_data *pkgl_data_all; +static DEFINE_HASHTABLE(package_to_appid, 8); static struct kmem_cache *hashtable_entry_cachep; @@ -64,22 +51,21 @@ static unsigned int str_hash(const char *key) { return h; } -appid_t get_appid(void *pkgl_id, const char *app_name) +appid_t get_appid(const char *app_name) { - struct packagelist_data *pkgl_dat = pkgl_data_all; struct hashtable_entry *hash_cur; unsigned int hash = str_hash(app_name); appid_t ret_id; - mutex_lock(&pkgl_dat->hashtable_lock); - hash_for_each_possible(pkgl_dat->package_to_appid, hash_cur, hlist, hash) { + rcu_read_lock(); + hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { if (!strcasecmp(app_name, hash_cur->key)) { - ret_id = (appid_t)hash_cur->value; - mutex_unlock(&pkgl_dat->hashtable_lock); + ret_id = atomic_read(&hash_cur->value); + rcu_read_unlock(); return ret_id; } } - mutex_unlock(&pkgl_dat->hashtable_lock); + rcu_read_unlock(); return 0; } @@ -120,116 +106,118 @@ int open_flags_to_access_mode(int open_flags) { } } -static int insert_str_to_int_lock(struct packagelist_data *pkgl_dat, char *key, - unsigned int value) +static struct hashtable_entry *alloc_packagelist_entry(const char *key, + appid_t value) +{ + struct hashtable_entry *ret = kmem_cache_alloc(hashtable_entry_cachep, + GFP_KERNEL); + if (!ret) + return NULL; + + ret->key = kstrdup(key, GFP_KERNEL); + if (!ret->key) { + kmem_cache_free(hashtable_entry_cachep, ret); + return NULL; + } + + atomic_set(&ret->value, value); + return ret; +} + +static int insert_packagelist_entry_locked(const char *key, appid_t value) { struct hashtable_entry *hash_cur; struct hashtable_entry *new_entry; unsigned int hash = str_hash(key); - 
hash_for_each_possible(pkgl_dat->package_to_appid, hash_cur, hlist, hash) { + hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { if (!strcasecmp(key, hash_cur->key)) { - hash_cur->value = value; + atomic_set(&hash_cur->value, value); return 0; } } - new_entry = kmem_cache_alloc(hashtable_entry_cachep, GFP_KERNEL); + new_entry = alloc_packagelist_entry(key, value); if (!new_entry) return -ENOMEM; - new_entry->key = kstrdup(key, GFP_KERNEL); - new_entry->value = value; - hash_add(pkgl_dat->package_to_appid, &new_entry->hlist, hash); + hash_add_rcu(package_to_appid, &new_entry->hlist, hash); return 0; } static void fixup_perms(struct super_block *sb) { if (sb && sb->s_magic == SDCARDFS_SUPER_MAGIC) { - mutex_lock(&sb->s_root->d_inode->i_mutex); get_derive_permissions_recursive(sb->s_root); - mutex_unlock(&sb->s_root->d_inode->i_mutex); } } -static int insert_str_to_int(struct packagelist_data *pkgl_dat, char *key, - unsigned int value) { - int ret; - struct sdcardfs_sb_info *sbinfo; - mutex_lock(&sdcardfs_super_list_lock); - mutex_lock(&pkgl_dat->hashtable_lock); - ret = insert_str_to_int_lock(pkgl_dat, key, value); - mutex_unlock(&pkgl_dat->hashtable_lock); - - list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { - if (sbinfo) { - fixup_perms(sbinfo->sb); - } - } - mutex_unlock(&sdcardfs_super_list_lock); - return ret; -} - -static void remove_str_to_int_lock(struct hashtable_entry *h_entry) { - kfree(h_entry->key); - hash_del(&h_entry->hlist); - kmem_cache_free(hashtable_entry_cachep, h_entry); -} - -static void remove_str_to_int(struct packagelist_data *pkgl_dat, const char *key) +static void fixup_all_perms(void) { struct sdcardfs_sb_info *sbinfo; + list_for_each_entry(sbinfo, &sdcardfs_super_list, list) + if (sbinfo) + fixup_perms(sbinfo->sb); +} + +static int insert_packagelist_entry(const char *key, appid_t value) +{ + int err; + + mutex_lock(&sdcardfs_super_list_lock); + err = insert_packagelist_entry_locked(key, value); + if (!err) + fixup_all_perms(); + mutex_unlock(&sdcardfs_super_list_lock); + + return err; +} + +static void free_packagelist_entry(struct hashtable_entry *entry) +{ + kfree(entry->key); + hash_del_rcu(&entry->hlist); + kmem_cache_free(hashtable_entry_cachep, entry); +} + +static void remove_packagelist_entry_locked(const char *key) +{ struct hashtable_entry *hash_cur; unsigned int hash = str_hash(key); - mutex_lock(&sdcardfs_super_list_lock); - mutex_lock(&pkgl_dat->hashtable_lock); - hash_for_each_possible(pkgl_dat->package_to_appid, hash_cur, hlist, hash) { + + hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { if (!strcasecmp(key, hash_cur->key)) { - remove_str_to_int_lock(hash_cur); - break; - } - } - mutex_unlock(&pkgl_dat->hashtable_lock); - list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { - if (sbinfo) { - fixup_perms(sbinfo->sb); + hash_del_rcu(&hash_cur->hlist); + synchronize_rcu(); + free_packagelist_entry(hash_cur); + return; } } +} + +static void remove_packagelist_entry(const char *key) +{ + mutex_lock(&sdcardfs_super_list_lock); + remove_packagelist_entry_locked(key); + fixup_all_perms(); mutex_unlock(&sdcardfs_super_list_lock); return; } -static void remove_all_hashentrys(struct packagelist_data *pkgl_dat) +static void packagelist_destroy(void) { struct hashtable_entry *hash_cur; struct hlist_node *h_t; + HLIST_HEAD(free_list); int i; - mutex_lock(&pkgl_dat->hashtable_lock); - hash_for_each_safe(pkgl_dat->package_to_appid, i, h_t, hash_cur, hlist) - remove_str_to_int_lock(hash_cur); - 
mutex_unlock(&pkgl_dat->hashtable_lock); - hash_init(pkgl_dat->package_to_appid); -} + mutex_lock(&sdcardfs_super_list_lock); + hash_for_each_rcu(package_to_appid, i, hash_cur, hlist) { + hash_del_rcu(&hash_cur->hlist); + hlist_add_head(&hash_cur->hlist, &free_list); -static struct packagelist_data * packagelist_create(void) -{ - struct packagelist_data *pkgl_dat; - - pkgl_dat = kmalloc(sizeof(*pkgl_dat), GFP_KERNEL | __GFP_ZERO); - if (!pkgl_dat) { - printk(KERN_ERR "sdcardfs: Failed to create hash\n"); - return ERR_PTR(-ENOMEM); } - - mutex_init(&pkgl_dat->hashtable_lock); - hash_init(pkgl_dat->package_to_appid); - - return pkgl_dat; -} - -static void packagelist_destroy(struct packagelist_data *pkgl_dat) -{ - remove_all_hashentrys(pkgl_dat); + synchronize_rcu(); + hlist_for_each_entry_safe(hash_cur, h_t, &free_list, hlist) + free_packagelist_entry(hash_cur); + mutex_unlock(&sdcardfs_super_list_lock); printk(KERN_INFO "sdcardfs: destroyed packagelist pkgld\n"); - kfree(pkgl_dat); } struct package_appid { @@ -245,26 +233,21 @@ static inline struct package_appid *to_package_appid(struct config_item *item) static ssize_t package_appid_attr_show(struct config_item *item, char *page) { - ssize_t count; - count = sprintf(page, "%d\n", get_appid(pkgl_data_all, item->ci_name)); - return count; + return scnprintf(page, PAGE_SIZE, "%u\n", get_appid(item->ci_name)); } static ssize_t package_appid_attr_store(struct config_item *item, const char *page, size_t count) { struct package_appid *package_appid = to_package_appid(item); - unsigned long tmp; - char *p = (char *) page; + unsigned int tmp; int ret; - tmp = simple_strtoul(p, &p, 10); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; + ret = kstrtouint(page, 10, &tmp); + if (ret) + return ret; - if (tmp > INT_MAX) - return -ERANGE; - ret = insert_str_to_int(pkgl_data_all, item->ci_name, (unsigned int)tmp); + ret = insert_packagelist_entry(item->ci_name, tmp); package_appid->add_pid = tmp; if (ret) return ret; @@ -289,7 +272,7 @@ static void package_appid_release(struct config_item *item) { printk(KERN_INFO "sdcardfs: removing %s\n", item->ci_dentry->d_name.name); /* item->ci_name is freed already, so we rely on the dentry */ - remove_str_to_int(pkgl_data_all, item->ci_dentry->d_name.name); + remove_packagelist_entry(item->ci_dentry->d_name.name); kfree(to_package_appid(item)); } @@ -333,21 +316,21 @@ static ssize_t packages_attr_show(struct config_item *item, char *page) { struct hashtable_entry *hash_cur; - struct hlist_node *h_t; int i; int count = 0, written = 0; - char errormsg[] = "\n"; + const char errormsg[] = "\n"; - mutex_lock(&pkgl_data_all->hashtable_lock); - hash_for_each_safe(pkgl_data_all->package_to_appid, i, h_t, hash_cur, hlist) { - written = scnprintf(page + count, PAGE_SIZE - sizeof(errormsg) - count, "%s %d\n", (char *)hash_cur->key, hash_cur->value); + rcu_read_lock(); + hash_for_each_rcu(package_to_appid, i, hash_cur, hlist) { + written = scnprintf(page + count, PAGE_SIZE - sizeof(errormsg) - count, "%s %d\n", + (const char *)hash_cur->key, atomic_read(&hash_cur->value)); if (count + written == PAGE_SIZE - sizeof(errormsg)) { count += scnprintf(page + count, PAGE_SIZE - count, errormsg); break; } count += written; } - mutex_unlock(&pkgl_data_all->hashtable_lock); + rcu_read_unlock(); return count; } @@ -430,7 +413,6 @@ int packagelist_init(void) return -ENOMEM; } - pkgl_data_all = packagelist_create(); configfs_sdcardfs_init(); return 0; } @@ -438,7 +420,7 @@ int packagelist_init(void) void packagelist_exit(void) { 
configfs_sdcardfs_exit(); - packagelist_destroy(pkgl_data_all); + packagelist_destroy(); if (hashtable_entry_cachep) kmem_cache_destroy(hashtable_entry_cachep); } diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index f111f898b630..75284f339ae0 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -396,7 +396,7 @@ extern struct mutex sdcardfs_super_list_lock; extern struct list_head sdcardfs_super_list; /* for packagelist.c */ -extern appid_t get_appid(void *pkgl_id, const char *app_name); +extern appid_t get_appid(const char *app_name); extern int check_caller_access_to_name(struct inode *parent_node, const char* name); extern int open_flags_to_access_mode(int open_flags); extern int packagelist_init(void); From 640e5265032d2e36c8933f09657f34946fbf8259 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 18 May 2016 16:57:10 -0700 Subject: [PATCH 0616/3220] ANDROID: sdcardfs: Added top to sdcardfs_inode_info Adding packages to the package list and moving files takes a large amount of locks, and is currently a heavy operation. This adds a 'top' field to the inode_info, which points to the inode for the top most directory whose owner you would like to match. On permission checks and get_attr, we look up the owner based on the information at top. When we change a package mapping, we need only modify the information in the corresponding top inode_info's. When renaming, we must ensure top is set correctly in all children. This happens when an app specific folder gets moved outside of the folder for that app. Change-Id: Ib749c60b568e9a45a46f8ceed985c1338246ec6c Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 73 ++++++++++++++++++++++++++++++++++---- fs/sdcardfs/inode.c | 45 +++++++++++++++++++---- fs/sdcardfs/main.c | 4 +-- fs/sdcardfs/packagelist.c | 12 +++---- fs/sdcardfs/sdcardfs.h | 40 ++++++++++++++++++--- fs/sdcardfs/super.c | 1 + 6 files changed, 149 insertions(+), 26 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 2a75ad873a7c..89daf69efbaa 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -30,11 +30,12 @@ static void inherit_derived_state(struct inode *parent, struct inode *child) ci->userid = pi->userid; ci->d_uid = pi->d_uid; ci->under_android = pi->under_android; + set_top(ci, pi->top); } /* helper function for derived state */ -void setup_derived_state(struct inode *inode, perm_t perm, - userid_t userid, uid_t uid, bool under_android) +void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, + uid_t uid, bool under_android, struct inode *top) { struct sdcardfs_inode_info *info = SDCARDFS_I(inode); @@ -42,6 +43,7 @@ void setup_derived_state(struct inode *inode, perm_t perm, info->userid = userid; info->d_uid = uid; info->under_android = under_android; + set_top(info, top); } /* While renaming, there is a point where we want the path from dentry, but the name from newdentry */ @@ -70,6 +72,7 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, st /* Legacy internal layout places users at top level */ info->perm = PERM_ROOT; info->userid = simple_strtoul(newdentry->d_name.name, NULL, 10); + set_top(info, &info->vfs_inode); break; case PERM_ROOT: /* Assume masked off by default. 
*/ @@ -77,19 +80,23 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, st /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID; info->under_android = true; + set_top(info, &info->vfs_inode); } break; case PERM_ANDROID: if (!strcasecmp(newdentry->d_name.name, "data")) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_DATA; + set_top(info, &info->vfs_inode); } else if (!strcasecmp(newdentry->d_name.name, "obb")) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_OBB; + set_top(info, &info->vfs_inode); /* Single OBB directory is always shared */ } else if (!strcasecmp(newdentry->d_name.name, "media")) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_MEDIA; + set_top(info, &info->vfs_inode); } break; case PERM_ANDROID_DATA: @@ -99,6 +106,7 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, st if (appid != 0) { info->d_uid = multiuser_get_uid(parent_info->userid, appid); } + set_top(info, &info->vfs_inode); break; } } @@ -108,14 +116,65 @@ void get_derived_permission(struct dentry *parent, struct dentry *dentry) get_derived_permission_new(parent, dentry, dentry); } -void get_derive_permissions_recursive(struct dentry *parent) { +static int descendant_may_need_fixup(perm_t perm) { + if (perm == PERM_PRE_ROOT || perm == PERM_ROOT || perm == PERM_ANDROID) + return 1; + return 0; +} + +static int needs_fixup(perm_t perm) { + if (perm == PERM_ANDROID_DATA || perm == PERM_ANDROID_OBB + || perm == PERM_ANDROID_MEDIA) + return 1; + return 0; +} + +void fixup_perms_recursive(struct dentry *dentry, const char* name, size_t len) { + struct dentry *child; + struct sdcardfs_inode_info *info; + if (!dget(dentry)) + return; + if (!dentry->d_inode) { + dput(dentry); + return; + } + info = SDCARDFS_I(d_inode(dentry)); + + if (needs_fixup(info->perm)) { + mutex_lock(&d_inode(dentry)->i_mutex); + child = lookup_one_len(name, dentry, len); + mutex_unlock(&d_inode(dentry)->i_mutex); + if (!IS_ERR(child)) { + if (child->d_inode) { + get_derived_permission(dentry, child); + fix_derived_permission(d_inode(child)); + } + dput(child); + } + } else if (descendant_may_need_fixup(info->perm)) { + mutex_lock(&d_inode(dentry)->i_mutex); + list_for_each_entry(child, &dentry->d_subdirs, d_child) { + fixup_perms_recursive(child, name, len); + } + mutex_unlock(&d_inode(dentry)->i_mutex); + } + dput(dentry); +} + +void fixup_top_recursive(struct dentry *parent) { struct dentry *dentry; + struct sdcardfs_inode_info *info; + if (!d_inode(parent)) + return; + info = SDCARDFS_I(d_inode(parent)); spin_lock(&parent->d_lock); list_for_each_entry(dentry, &parent->d_subdirs, d_child) { - if (dentry->d_inode) { - get_derived_permission(parent, dentry); - fix_derived_permission(dentry->d_inode); - get_derive_permissions_recursive(dentry); + if (d_inode(dentry)) { + if (SDCARDFS_I(d_inode(parent))->top != SDCARDFS_I(d_inode(dentry))->top) { + get_derived_permission(parent, dentry); + fix_derived_permission(d_inode(dentry)); + fixup_top_recursive(dentry); + } } } spin_unlock(&parent->d_lock); diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 1a23c0cc8f58..67bcee2c379a 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -515,7 +515,7 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, * we pass along new_dentry for the name.*/ get_derived_permission_new(new_dentry->d_parent, old_dentry, 
new_dentry); fix_derived_permission(d_inode(old_dentry)); - get_derive_permissions_recursive(old_dentry); + fixup_top_recursive(old_dentry); out: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); dput(lower_old_dir_dentry); @@ -587,6 +587,16 @@ static const char *sdcardfs_follow_link(struct dentry *dentry, void **cookie) static int sdcardfs_permission(struct inode *inode, int mask) { int err; + struct inode *top = grab_top(SDCARDFS_I(inode)); + + if (!top) + return -EINVAL; + /* Ensure owner is up to date */ + if (!uid_eq(inode->i_uid, top->i_uid)) { + SDCARDFS_I(inode)->d_uid = SDCARDFS_I(top)->d_uid; + fix_derived_permission(inode); + } + release_top(SDCARDFS_I(inode)); /* * Permission check on sdcardfs inode. @@ -725,6 +735,30 @@ out_err: return err; } +static int sdcardfs_fillattr(struct inode *inode, struct kstat *stat) +{ + struct sdcardfs_inode_info *info = SDCARDFS_I(inode); + struct inode *top = grab_top(info); + if (!top) + return -EINVAL; + + stat->dev = inode->i_sb->s_dev; + stat->ino = inode->i_ino; + stat->mode = (inode->i_mode & S_IFMT) | get_mode(SDCARDFS_I(top)); + stat->nlink = inode->i_nlink; + stat->uid = make_kuid(&init_user_ns, SDCARDFS_I(top)->d_uid); + stat->gid = make_kgid(&init_user_ns, get_gid(SDCARDFS_I(top))); + stat->rdev = inode->i_rdev; + stat->size = i_size_read(inode); + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blksize = (1 << inode->i_blkbits); + stat->blocks = inode->i_blocks; + release_top(info); + return 0; +} + static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { @@ -733,6 +767,7 @@ static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct inode *lower_inode; struct path lower_path; struct dentry *parent; + int err; parent = dget_parent(dentry); if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name)) { @@ -750,14 +785,12 @@ static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, lower_dentry = lower_path.dentry; lower_inode = sdcardfs_lower_inode(inode); - sdcardfs_copy_and_fix_attrs(inode, lower_inode); fsstack_copy_inode_size(inode, lower_inode); - - generic_fillattr(inode, stat); + err = sdcardfs_fillattr(inode, stat); sdcardfs_put_lower_path(dentry, &lower_path); - return 0; + return err; } const struct inode_operations sdcardfs_symlink_iops = { @@ -775,9 +808,7 @@ const struct inode_operations sdcardfs_symlink_iops = { const struct inode_operations sdcardfs_dir_iops = { .create = sdcardfs_create, .lookup = sdcardfs_lookup, -#if 0 .permission = sdcardfs_permission, -#endif .unlink = sdcardfs_unlink, .mkdir = sdcardfs_mkdir, .rmdir = sdcardfs_rmdir, diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index a6522286d731..6d526bf3d956 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -268,13 +268,13 @@ static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, sb_info->obbpath_s = kzalloc(PATH_MAX, GFP_KERNEL); mutex_lock(&sdcardfs_super_list_lock); if(sb_info->options.multiuser) { - setup_derived_state(sb->s_root->d_inode, PERM_PRE_ROOT, sb_info->options.fs_user_id, AID_ROOT, false); + setup_derived_state(d_inode(sb->s_root), PERM_PRE_ROOT, sb_info->options.fs_user_id, AID_ROOT, false, d_inode(sb->s_root)); snprintf(sb_info->obbpath_s, PATH_MAX, "%s/obb", dev_name); /*err = prepare_dir(sb_info->obbpath_s, sb_info->options.fs_low_uid, sb_info->options.fs_low_gid, 00755);*/ } else { - setup_derived_state(sb->s_root->d_inode, PERM_ROOT, 
sb_info->options.fs_low_uid, AID_ROOT, false); + setup_derived_state(sb->s_root->d_inode, PERM_ROOT, sb_info->options.fs_low_uid, AID_ROOT, false, sb->s_root->d_inode); snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name); } fix_derived_permission(sb->s_root->d_inode); diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index f5a49c513568..03776fa5f26c 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -143,18 +143,18 @@ static int insert_packagelist_entry_locked(const char *key, appid_t value) return 0; } -static void fixup_perms(struct super_block *sb) { +static void fixup_perms(struct super_block *sb, const char *key) { if (sb && sb->s_magic == SDCARDFS_SUPER_MAGIC) { - get_derive_permissions_recursive(sb->s_root); + fixup_perms_recursive(sb->s_root, key, strlen(key)); } } -static void fixup_all_perms(void) +static void fixup_all_perms(const char *key) { struct sdcardfs_sb_info *sbinfo; list_for_each_entry(sbinfo, &sdcardfs_super_list, list) if (sbinfo) - fixup_perms(sbinfo->sb); + fixup_perms(sbinfo->sb, key); } static int insert_packagelist_entry(const char *key, appid_t value) @@ -164,7 +164,7 @@ static int insert_packagelist_entry(const char *key, appid_t value) mutex_lock(&sdcardfs_super_list_lock); err = insert_packagelist_entry_locked(key, value); if (!err) - fixup_all_perms(); + fixup_all_perms(key); mutex_unlock(&sdcardfs_super_list_lock); return err; @@ -196,7 +196,7 @@ static void remove_packagelist_entry(const char *key) { mutex_lock(&sdcardfs_super_list_lock); remove_packagelist_entry_locked(key); - fixup_all_perms(); + fixup_all_perms(key); mutex_unlock(&sdcardfs_super_list_lock); return; } diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 75284f339ae0..cfda98d257b6 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -169,6 +169,8 @@ struct sdcardfs_inode_info { userid_t userid; uid_t d_uid; bool under_android; + /* top folder for ownership */ + struct inode *top; struct inode vfs_inode; }; @@ -321,6 +323,35 @@ static inline void sdcardfs_put_reset_##pname(const struct dentry *dent) \ SDCARDFS_DENT_FUNC(lower_path) SDCARDFS_DENT_FUNC(orig_path) +/* grab a refererence if we aren't linking to ourself */ +static inline void set_top(struct sdcardfs_inode_info *info, struct inode *top) +{ + struct inode *old_top = NULL; + BUG_ON(IS_ERR_OR_NULL(top)); + if (info->top && info->top != &info->vfs_inode) { + old_top = info->top; + } + if (top != &info->vfs_inode) + igrab(top); + info->top = top; + iput(old_top); +} + +static inline struct inode *grab_top(struct sdcardfs_inode_info *info) +{ + struct inode *top = info->top; + if (top) { + return igrab(top); + } else { + return NULL; + } +} + +static inline void release_top(struct sdcardfs_inode_info *info) +{ + iput(info->top); +} + static inline int get_gid(struct sdcardfs_inode_info *info) { struct sdcardfs_sb_info *sb_info = SDCARDFS_SB(info->vfs_inode.i_sb); if (sb_info->options.gid == AID_SDCARD_RW) { @@ -337,7 +368,7 @@ static inline int get_gid(struct sdcardfs_inode_info *info) { static inline int get_mode(struct sdcardfs_inode_info *info) { int owner_mode; int filtered_mode; - struct sdcardfs_sb_info *sb_info = SDCARDFS_SB(info->vfs_inode.i_sb); + struct sdcardfs_sb_info * sb_info = SDCARDFS_SB(info->vfs_inode.i_sb); int visible_mode = 0775 & ~sb_info->options.mask; if (info->perm == PERM_PRE_ROOT) { @@ -403,11 +434,12 @@ extern int packagelist_init(void); extern void packagelist_exit(void); /* for derived_perm.c */ -extern void 
setup_derived_state(struct inode *inode, perm_t perm, - userid_t userid, uid_t uid, bool under_android); +extern void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, + uid_t uid, bool under_android, struct inode *top); extern void get_derived_permission(struct dentry *parent, struct dentry *dentry); extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, struct dentry *newdentry); -extern void get_derive_permissions_recursive(struct dentry *parent); +extern void fixup_top_recursive(struct dentry *parent); +extern void fixup_perms_recursive(struct dentry *dentry, const char *name, size_t len); extern void update_derived_permission_lock(struct dentry *dentry); extern int need_graft_path(struct dentry *dentry); diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index 1d6490128c99..0a465395aab7 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -126,6 +126,7 @@ static void sdcardfs_evict_inode(struct inode *inode) */ lower_inode = sdcardfs_lower_inode(inode); sdcardfs_set_lower_inode(inode, NULL); + set_top(SDCARDFS_I(inode), inode); iput(lower_inode); } From cae2e7f25c48be832ef5c7206346074db67599d4 Mon Sep 17 00:00:00 2001 From: alvin_liang Date: Mon, 19 Sep 2016 16:59:12 +0800 Subject: [PATCH 0617/3220] ANDROID: sdcardfs: fix external storage exporting incorrect uid Symptom: App cannot write into per-app folder Root Cause: sdcardfs exports incorrect uid Solution: fix uid Project: All Note: Test done by RD: passed Change-Id: Iff64f6f40ba4c679f07f4426d3db6e6d0db7e3ca --- fs/sdcardfs/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 6d526bf3d956..2decea3d1e3e 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -274,7 +274,7 @@ static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, sb_info->options.fs_low_uid, sb_info->options.fs_low_gid, 00755);*/ } else { - setup_derived_state(sb->s_root->d_inode, PERM_ROOT, sb_info->options.fs_low_uid, AID_ROOT, false, sb->s_root->d_inode); + setup_derived_state(d_inode(sb->s_root), PERM_ROOT, sb_info->options.fs_user_id, AID_ROOT, false, d_inode(sb->s_root)); snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name); } fix_derived_permission(sb->s_root->d_inode); From f32ddec923dd91d3c3c3cbb5fed48b4d36aac99c Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 26 Sep 2016 14:48:22 -0700 Subject: [PATCH 0618/3220] ANDROID: sdcardfs: Move directory unlock before touch This removes a deadlock under low memory conditions. filp_open can call lookup_slow, which will attempt to lock the parent. sdcardfs_mkdir was still holding the lower parent's i_mutex while creating the .nomedia marker through filp_open, so that lookup could block forever on its own caller's lock; the directory is now unlocked before the touch.
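In sketch form, the ordering change looks like this (a condensed sketch of the mkdir path changed in the diff below, not the literal code; touch_nomedia() is a stand-in for the filp_open-based helper, and error handling is elided):

	/* before: .nomedia is created while the lower parent's i_mutex is
	 * held; if memory pressure has evicted a dentry on that path, the
	 * path walk inside filp_open() falls back to lookup_slow(), which
	 * tries to take the very i_mutex this function still holds */
	lower_parent_dentry = lock_parent(lower_dentry);
	err = vfs_mkdir(d_inode(lower_parent_dentry), lower_dentry, mode);
	touch_nomedia();
	unlock_dir(lower_parent_dentry);

	/* after: every path out of the function drops the lock first, so
	 * the lookup done on behalf of filp_open() can take it itself */
	lower_parent_dentry = lock_parent(lower_dentry);
	err = vfs_mkdir(d_inode(lower_parent_dentry), lower_dentry, mode);
	unlock_dir(lower_parent_dentry);
	touch_nomedia();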
Change-Id: I940643d0793f5051d1e79a56f4da2fa8ca3d8ff7 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/inode.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 67bcee2c379a..3c353c95ef3e 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -296,14 +296,17 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode copied_fs = copy_fs_struct(current->fs); if (!copied_fs) { err = -ENOMEM; + unlock_dir(lower_parent_dentry); goto out_unlock; } current->fs = copied_fs; current->fs->umask = 0; err = vfs_mkdir(d_inode(lower_parent_dentry), lower_dentry, mode); - if (err) + if (err) { + unlock_dir(lower_parent_dentry); goto out; + } /* if it is a local obb dentry, setup it with the base obbpath */ if(need_graft_path(dentry)) { @@ -325,14 +328,18 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode } err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path, pi->userid); - if (err) + if (err) { + unlock_dir(lower_parent_dentry); goto out; + } fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir)); fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry)); /* update number of links on parent directory */ set_nlink(dir, sdcardfs_lower_inode(dir)->i_nlink); + unlock_dir(lower_parent_dentry); + if ((!sbi->options.multiuser) && (!strcasecmp(dentry->d_name.name, "obb")) && (pi->perm == PERM_ANDROID) && (pi->userid == 0)) make_nomedia_in_obb = 1; @@ -352,7 +359,6 @@ out: current->fs = saved_fs; free_fs_struct(copied_fs); out_unlock: - unlock_dir(lower_parent_dentry); sdcardfs_put_lower_path(dentry, &lower_path); out_revert: REVERT_CRED(saved_cred); From 6b42d02561d335017cd6066f506514f32962fa2d Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 15:29:51 -0700 Subject: [PATCH 0619/3220] ANDROID: mnt: Add filesystem private data to mount points This starts to add private data associated directly to mount points. The intent is to give filesystems a sense of where they have come from, as a means of letting a filesystem take different actions based on this information. 
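As an illustrative sketch (examplefs and its option struct are invented here, not part of the patch), a filesystem opts in simply by providing alloc_mnt_data; the allocation is then owned by the vfsmount, freed in free_vfsmnt(), and kept coherent across clones and remount propagation by the clone_mnt_data/copy_mnt_data super operations added below:

	struct examplefs_mnt_opts {
		gid_t gid;
		unsigned int mask;
	};

	/* called from vfs_kern_mount() before mount_fs(); the result lands
	 * in mnt->mnt.data and is kfree()d by free_vfsmnt() */
	static void *examplefs_alloc_mnt_data(void)
	{
		return kzalloc(sizeof(struct examplefs_mnt_opts), GFP_KERNEL);
	}

	static struct file_system_type examplefs_fs_type = {
		.owner		= THIS_MODULE,
		.name		= "examplefs",
		.mount		= examplefs_mount,	/* assumed defined elsewhere */
		.alloc_mnt_data	= examplefs_alloc_mnt_data,
		.kill_sb	= kill_anon_super,
	};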
Change-Id: Ie769d7b3bb2f5972afe05c1bf16cf88c91647ab2 Signed-off-by: Daniel Rosenberg --- fs/namespace.c | 28 +++++++++++++++++++++++++++- fs/pnode.c | 14 ++++++++++++++ fs/pnode.h | 1 + include/linux/fs.h | 3 +++ include/linux/mount.h | 1 + 5 files changed, 46 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 0570729c87fd..c38f318eb9df 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -577,6 +577,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) static void free_vfsmnt(struct mount *mnt) { + kfree(mnt->mnt.data); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); @@ -942,11 +943,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void if (!mnt) return ERR_PTR(-ENOMEM); + mnt->mnt.data = NULL; + if (type->alloc_mnt_data) { + mnt->mnt.data = type->alloc_mnt_data(); + if (!mnt->mnt.data) { + mnt_free_id(mnt); + free_vfsmnt(mnt); + return ERR_PTR(-ENOMEM); + } + } if (flags & MS_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; root = mount_fs(type, flags, name, data); if (IS_ERR(root)) { + kfree(mnt->mnt.data); mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_CAST(root); @@ -974,6 +985,14 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if (!mnt) return ERR_PTR(-ENOMEM); + if (sb->s_op->clone_mnt_data) { + mnt->mnt.data = sb->s_op->clone_mnt_data(old->mnt.data); + if (!mnt->mnt.data) { + err = -ENOMEM; + goto out_free; + } + } + if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) mnt->mnt_group_id = 0; /* not a peer of original */ else @@ -1042,6 +1061,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, return mnt; out_free: + kfree(mnt->mnt.data); mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_PTR(err); @@ -2207,8 +2227,14 @@ static int do_remount(struct path *path, int flags, int mnt_flags, err = change_mount_flags(path->mnt, flags); else if (!capable(CAP_SYS_ADMIN)) err = -EPERM; - else + else { err = do_remount_sb(sb, flags, data, 0); + namespace_lock(); + lock_mount_hash(); + propagate_remount(mnt); + unlock_mount_hash(); + namespace_unlock(); + } if (!err) { lock_mount_hash(); mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; diff --git a/fs/pnode.c b/fs/pnode.c index 6367e1e435c6..4e2d78ec053a 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -450,3 +450,17 @@ int propagate_umount(struct list_head *list) __propagate_umount(mnt); return 0; } + +int propagate_remount(struct mount *mnt) { + struct mount *m; + struct super_block *sb = mnt->mnt.mnt_sb; + int ret = 0; + + if (sb->s_op->copy_mnt_data) { + for (m = first_slave(mnt); m->mnt_slave.next != &mnt->mnt_slave_list; m = next_slave(m)) { + sb->s_op->copy_mnt_data(m->mnt.data, mnt->mnt.data); + } + } + + return ret; +} diff --git a/fs/pnode.h b/fs/pnode.h index 0fcdbe7ca648..4e8e94dc9e6a 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -44,6 +44,7 @@ int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, int propagate_umount(struct list_head *); int propagate_mount_busy(struct mount *, int); void propagate_mount_unlock(struct mount *); +int propagate_remount(struct mount *); void mnt_release_group_id(struct mount *); int get_dominating_id(struct mount *mnt, const struct path *root); unsigned int mnt_get_count(struct mount *mnt); diff --git a/include/linux/fs.h b/include/linux/fs.h index 3aa514254161..3996b8ec0b84 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1697,6 +1697,8 @@ struct super_operations { int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct 
dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); + void *(*clone_mnt_data) (void *); + void (*copy_mnt_data) (void *, void *); void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct dentry *); @@ -1931,6 +1933,7 @@ struct file_system_type { #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); + void *(*alloc_mnt_data) (void); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; diff --git a/include/linux/mount.h b/include/linux/mount.h index f822c3c11377..0e9b0977237a 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -67,6 +67,7 @@ struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ struct super_block *mnt_sb; /* pointer to superblock */ int mnt_flags; + void *data; }; struct file; /* forward dec */ From 0ad2dd493c2fb614cb0b7f2355990ce035224590 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 15:58:22 -0700 Subject: [PATCH 0620/3220] ANDROID: vfs: Allow filesystems to access their private mount data Now we pass the vfsmount when mounting and remounting. This allows the filesystem to actually set up the mount specific data, although we can't quite do anything with it yet. show_options is expanded to include data that lives with the mount. To avoid changing existing filesystems, these have been added as new vfs functions. Change-Id: If80670bfad9f287abb8ac22457e1b034c9697097 Signed-off-by: Daniel Rosenberg --- fs/internal.h | 4 +++- fs/namespace.c | 4 ++-- fs/proc_namespace.c | 8 ++++++-- fs/super.c | 28 +++++++++++++++++++++++----- include/linux/fs.h | 4 ++++ 5 files changed, 38 insertions(+), 10 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index 71859c4d0b41..6387b35a1c0d 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -84,9 +84,11 @@ extern struct file *get_empty_filp(void); * super.c */ extern int do_remount_sb(struct super_block *, int, void *, int); +extern int do_remount_sb2(struct vfsmount *, struct super_block *, int, + void *, int); extern bool trylock_super(struct super_block *sb); extern struct dentry *mount_fs(struct file_system_type *, - int, const char *, void *); + int, const char *, struct vfsmount *, void *); extern struct super_block *user_get_super(dev_t); /* diff --git a/fs/namespace.c b/fs/namespace.c index c38f318eb9df..90c47548ab61 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -955,7 +955,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void if (flags & MS_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; - root = mount_fs(type, flags, name, data); + root = mount_fs(type, flags, name, &mnt->mnt, data); if (IS_ERR(root)) { kfree(mnt->mnt.data); mnt_free_id(mnt); @@ -2228,7 +2228,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags, else if (!capable(CAP_SYS_ADMIN)) err = -EPERM; else { - err = do_remount_sb(sb, flags, data, 0); + err = do_remount_sb2(path->mnt, sb, flags, data, 0); namespace_lock(); lock_mount_hash(); propagate_remount(mnt); diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 8ebd9a334085..a0770fee84d2 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -118,7 +118,9 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) if (err) goto out; show_mnt_opts(m, mnt); - if (sb->s_op->show_options) + if (sb->s_op->show_options2) + err = sb->s_op->show_options2(mnt, m, 
mnt_path.dentry); + else if (sb->s_op->show_options) err = sb->s_op->show_options(m, mnt_path.dentry); seq_puts(m, " 0 0\n"); out: @@ -178,7 +180,9 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) err = show_sb_opts(m, sb); if (err) goto out; - if (sb->s_op->show_options) + if (sb->s_op->show_options2) { + err = sb->s_op->show_options2(mnt, m, mnt->mnt_root); + } else if (sb->s_op->show_options) err = sb->s_op->show_options(m, mnt->mnt_root); seq_putc(m, '\n'); out: diff --git a/fs/super.c b/fs/super.c index 1014e7cc355f..589aff731caa 100644 --- a/fs/super.c +++ b/fs/super.c @@ -702,7 +702,8 @@ rescan: } /** - * do_remount_sb - asks filesystem to change mount options. + * do_remount_sb2 - asks filesystem to change mount options. + * @mnt: mount we are looking at * @sb: superblock in question * @flags: numeric part of options * @data: the rest of options @@ -710,7 +711,7 @@ rescan: * * Alters the mount options of a mounted file system. */ -int do_remount_sb(struct super_block *sb, int flags, void *data, int force) +int do_remount_sb2(struct vfsmount *mnt, struct super_block *sb, int flags, void *data, int force) { int retval; int remount_ro; @@ -752,7 +753,16 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) } } - if (sb->s_op->remount_fs) { + if (mnt && sb->s_op->remount_fs2) { + retval = sb->s_op->remount_fs2(mnt, sb, &flags, data); + if (retval) { + if (!force) + goto cancel_readonly; + /* If forced remount, go ahead despite any errors */ + WARN(1, "forced remount of a %s fs returned %i\n", + sb->s_type->name, retval); + } + } else if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); if (retval) { if (!force) @@ -784,6 +794,11 @@ cancel_readonly: return retval; } +int do_remount_sb(struct super_block *sb, int flags, void *data, int force) +{ + return do_remount_sb2(NULL, sb, flags, data, force); +} + static void do_emergency_remount(struct work_struct *work) { struct super_block *sb, *p = NULL; @@ -1103,7 +1118,7 @@ struct dentry *mount_single(struct file_system_type *fs_type, EXPORT_SYMBOL(mount_single); struct dentry * -mount_fs(struct file_system_type *type, int flags, const char *name, void *data) +mount_fs(struct file_system_type *type, int flags, const char *name, struct vfsmount *mnt, void *data) { struct dentry *root; struct super_block *sb; @@ -1120,7 +1135,10 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) goto out_free_secdata; } - root = type->mount(type, flags, name, data); + if (type->mount2) + root = type->mount2(mnt, type, flags, name, data); + else + root = type->mount(type, flags, name, data); if (IS_ERR(root)) { error = PTR_ERR(root); goto out_free_secdata; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3996b8ec0b84..31382e732e9e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1697,11 +1697,13 @@ struct super_operations { int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); + int (*remount_fs2) (struct vfsmount *, struct super_block *, int *, char *); void *(*clone_mnt_data) (void *); void (*copy_mnt_data) (void *, void *); void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct dentry *); + int (*show_options2)(struct vfsmount *,struct seq_file *, struct dentry *); int (*show_devname)(struct seq_file *, struct dentry *); int (*show_path)(struct seq_file *, struct dentry *); int (*show_stats)(struct 
seq_file *, struct dentry *); @@ -1933,6 +1935,8 @@ struct file_system_type { #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); + struct dentry *(*mount2) (struct vfsmount *, struct file_system_type *, int, + const char *, void *); void *(*alloc_mnt_data) (void); void (*kill_sb) (struct super_block *); struct module *owner; From 21fc44e40ae3c574c288159846ab5a4762bd0e3f Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 16:27:45 -0700 Subject: [PATCH 0621/3220] ANDROID: vfs: Add permission2 for filesystems with per mount permissions This allows filesystems to use their mount private data to influence the permissions they return in permission2. It has been separated into a new call to avoid disrupting current permission users. Change-Id: I9d416e3b8b6eca84ef3e336bd2af89ddd51df6ca Signed-off-by: Daniel Rosenberg --- fs/exec.c | 2 +- fs/namei.c | 175 ++++++++++++++++++++--------- fs/notify/fanotify/fanotify_user.c | 2 +- fs/notify/inotify/inotify_user.c | 2 +- fs/open.c | 16 ++- fs/utimes.c | 2 +- include/linux/fs.h | 11 ++ include/linux/namei.h | 1 + ipc/mqueue.c | 10 +- security/inode.c | 2 +- 10 files changed, 158 insertions(+), 65 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index b06623a9347f..50f062d64d3a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1123,7 +1123,7 @@ EXPORT_SYMBOL(flush_old_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { - if (inode_permission(file_inode(file), MAY_READ) < 0) + if (inode_permission2(file->f_path.mnt, file_inode(file), MAY_READ) < 0) bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; } EXPORT_SYMBOL(would_dump); diff --git a/fs/namei.c b/fs/namei.c index 0c3974cd3ecd..82c7ec6532d5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -373,9 +373,11 @@ EXPORT_SYMBOL(generic_permission); * flag in inode->i_opflags, that says "this has not special * permission function, use the fast case". */ -static inline int do_inode_permission(struct inode *inode, int mask) +static inline int do_inode_permission(struct vfsmount *mnt, struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { + if (likely(mnt && inode->i_op->permission2)) + return inode->i_op->permission2(mnt, inode, mask); if (likely(inode->i_op->permission)) return inode->i_op->permission(inode, mask); @@ -399,7 +401,7 @@ static inline int do_inode_permission(struct inode *inode, int mask) * This does not check for a read-only file system. You probably want * inode_permission(). */ -int __inode_permission(struct inode *inode, int mask) +int __inode_permission2(struct vfsmount *mnt, struct inode *inode, int mask) { int retval; @@ -411,7 +413,7 @@ int __inode_permission(struct inode *inode, int mask) return -EACCES; } - retval = do_inode_permission(inode, mask); + retval = do_inode_permission(mnt, inode, mask); if (retval) return retval; @@ -419,7 +421,14 @@ int __inode_permission(struct inode *inode, int mask) if (retval) return retval; - return security_inode_permission(inode, mask); + retval = security_inode_permission(inode, mask); + return retval; +} +EXPORT_SYMBOL(__inode_permission2); + +int __inode_permission(struct inode *inode, int mask) +{ + return __inode_permission2(NULL, inode, mask); } EXPORT_SYMBOL(__inode_permission); @@ -455,14 +464,20 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
*/ -int inode_permission(struct inode *inode, int mask) +int inode_permission2(struct vfsmount *mnt, struct inode *inode, int mask) { int retval; retval = sb_permission(inode->i_sb, inode, mask); if (retval) return retval; - return __inode_permission(inode, mask); + return __inode_permission2(mnt, inode, mask); +} +EXPORT_SYMBOL(inode_permission2); + +int inode_permission(struct inode *inode, int mask) +{ + return inode_permission2(NULL, inode, mask); } EXPORT_SYMBOL(inode_permission); @@ -1643,13 +1658,13 @@ static int lookup_slow(struct nameidata *nd, struct path *path) static inline int may_lookup(struct nameidata *nd) { if (nd->flags & LOOKUP_RCU) { - int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK); + int err = inode_permission2(nd->path.mnt, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); if (err != -ECHILD) return err; if (unlazy_walk(nd, NULL, 0)) return -ECHILD; } - return inode_permission(nd->inode, MAY_EXEC); + return inode_permission2(nd->path.mnt, nd->inode, MAY_EXEC); } static inline int handle_dots(struct nameidata *nd, int type) @@ -1998,11 +2013,12 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->depth = 0; if (flags & LOOKUP_ROOT) { struct dentry *root = nd->root.dentry; + struct vfsmount *mnt = nd->root.mnt; struct inode *inode = root->d_inode; if (*s) { if (!d_can_lookup(root)) return ERR_PTR(-ENOTDIR); - retval = inode_permission(inode, MAY_EXEC); + retval = inode_permission2(mnt, inode, MAY_EXEC); if (retval) return ERR_PTR(retval); } @@ -2273,13 +2289,14 @@ EXPORT_SYMBOL(vfs_path_lookup); /** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup + * @mnt: mount we are looking up on * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. */ -struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) +struct dentry *lookup_one_len2(const char *name, struct vfsmount *mnt, struct dentry *base, int len) { struct qstr this; unsigned int c; @@ -2313,12 +2330,18 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) return ERR_PTR(err); } - err = inode_permission(base->d_inode, MAY_EXEC); + err = inode_permission2(mnt, base->d_inode, MAY_EXEC); if (err) return ERR_PTR(err); return __lookup_hash(&this, base, 0); } +EXPORT_SYMBOL(lookup_one_len2); + +struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) +{ + return lookup_one_len2(name, NULL, base, len); +} EXPORT_SYMBOL(lookup_one_len); int user_path_at_empty(int dfd, const char __user *name, unsigned flags, @@ -2545,7 +2568,7 @@ EXPORT_SYMBOL(__check_sticky); * 10. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). 
*/ -static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) +static int may_delete(struct vfsmount *mnt, struct inode *dir, struct dentry *victim, bool isdir) { struct inode *inode = d_backing_inode(victim); int error; @@ -2557,7 +2580,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) BUG_ON(victim->d_parent->d_inode != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + error = inode_permission2(mnt, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) @@ -2588,14 +2611,14 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) * 3. We should have write and exec permissions on dir * 4. We can't do it if dir is immutable (done in permission()) */ -static inline int may_create(struct inode *dir, struct dentry *child) +static inline int may_create(struct vfsmount *mnt, struct inode *dir, struct dentry *child) { audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return inode_permission(dir, MAY_WRITE | MAY_EXEC); + return inode_permission2(mnt, dir, MAY_WRITE | MAY_EXEC); } /* @@ -2642,10 +2665,10 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) } EXPORT_SYMBOL(unlock_rename); -int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool want_excl) +int vfs_create2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, + umode_t mode, bool want_excl) { - int error = may_create(dir, dentry); + int error = may_create(mnt, dir, dentry); if (error) return error; @@ -2661,11 +2684,19 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_create2); + +int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool want_excl) +{ + return vfs_create2(NULL, dir, dentry, mode, want_excl); +} EXPORT_SYMBOL(vfs_create); static int may_open(struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; + struct vfsmount *mnt = path->mnt; struct inode *inode = dentry->d_inode; int error; @@ -2694,7 +2725,7 @@ static int may_open(struct path *path, int acc_mode, int flag) break; } - error = inode_permission(inode, acc_mode); + error = inode_permission2(mnt, inode, acc_mode); if (error) return error; @@ -2750,7 +2781,7 @@ static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode) if (error) return error; - error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); + error = inode_permission2(dir->mnt, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -2948,6 +2979,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, bool got_write, int *opened) { struct dentry *dir = nd->path.dentry; + struct vfsmount *mnt = nd->path.mnt; struct inode *dir_inode = dir->d_inode; struct dentry *dentry; int error; @@ -2995,7 +3027,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, error = security_path_mknod(&nd->path, dentry, mode, 0); if (error) goto out_dput; - error = vfs_create(dir->d_inode, dentry, mode, + error = vfs_create2(mnt, dir->d_inode, dentry, mode, nd->flags & LOOKUP_EXCL); if (error) goto out_dput; @@ -3254,7 +3286,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, goto out; dir = path.dentry->d_inode; /* we want directory to be writable */ - error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + error = 
inode_permission2(path.mnt, dir, MAY_WRITE | MAY_EXEC); if (error) goto out2; if (!dir->i_op->tmpfile) { @@ -3488,9 +3520,9 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname, } EXPORT_SYMBOL(user_path_create); -int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +int vfs_mknod2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { - int error = may_create(dir, dentry); + int error = may_create(mnt, dir, dentry); if (error) return error; @@ -3514,6 +3546,12 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mknod2); + +int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +{ + return vfs_mknod2(NULL, dir, dentry, mode, dev); +} EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) @@ -3556,10 +3594,10 @@ retry: goto out; switch (mode & S_IFMT) { case 0: case S_IFREG: - error = vfs_create(path.dentry->d_inode,dentry,mode,true); + error = vfs_create2(path.mnt, path.dentry->d_inode,dentry,mode,true); break; case S_IFCHR: case S_IFBLK: - error = vfs_mknod(path.dentry->d_inode,dentry,mode, + error = vfs_mknod2(path.mnt, path.dentry->d_inode,dentry,mode, new_decode_dev(dev)); break; case S_IFIFO: case S_IFSOCK: @@ -3580,9 +3618,9 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d return sys_mknodat(AT_FDCWD, filename, mode, dev); } -int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +int vfs_mkdir2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, umode_t mode) { - int error = may_create(dir, dentry); + int error = may_create(mnt, dir, dentry); unsigned max_links = dir->i_sb->s_max_links; if (error) @@ -3604,6 +3642,12 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) fsnotify_mkdir(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mkdir2); + +int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + return vfs_mkdir2(NULL, dir, dentry, mode); +} EXPORT_SYMBOL(vfs_mkdir); SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) @@ -3622,7 +3666,7 @@ retry: mode &= ~current_umask(); error = security_path_mkdir(&path, dentry, mode); if (!error) - error = vfs_mkdir(path.dentry->d_inode, dentry, mode); + error = vfs_mkdir2(path.mnt, path.dentry->d_inode, dentry, mode); done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -3661,9 +3705,9 @@ void dentry_unhash(struct dentry *dentry) } EXPORT_SYMBOL(dentry_unhash); -int vfs_rmdir(struct inode *dir, struct dentry *dentry) +int vfs_rmdir2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry) { - int error = may_delete(dir, dentry, 1); + int error = may_delete(mnt, dir, dentry, 1); if (error) return error; @@ -3698,6 +3742,10 @@ out: d_delete(dentry); return error; } +int vfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + return vfs_rmdir2(NULL, dir, dentry); +} EXPORT_SYMBOL(vfs_rmdir); static long do_rmdir(int dfd, const char __user *pathname) @@ -3743,7 +3791,7 @@ retry: error = security_path_rmdir(&path, dentry); if (error) goto exit3; - error = vfs_rmdir(path.dentry->d_inode, dentry); + error = vfs_rmdir2(path.mnt, path.dentry->d_inode, dentry); exit3: dput(dentry); exit2: @@ -3782,10 +3830,10 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) * be appropriate for callers that expect the underlying filesystem not * to be NFS 
exported. */ -int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) +int vfs_unlink2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) { struct inode *target = dentry->d_inode; - int error = may_delete(dir, dentry, 0); + int error = may_delete(mnt, dir, dentry, 0); if (error) return error; @@ -3820,6 +3868,12 @@ out: return error; } +EXPORT_SYMBOL(vfs_unlink2); + +int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) +{ + return vfs_unlink2(NULL, dir, dentry, delegated_inode); +} EXPORT_SYMBOL(vfs_unlink); /* @@ -3867,7 +3921,7 @@ retry_deleg: error = security_path_unlink(&path, dentry); if (error) goto exit2; - error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode); + error = vfs_unlink2(path.mnt, path.dentry->d_inode, dentry, &delegated_inode); exit2: dput(dentry); } @@ -3917,9 +3971,9 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname) return do_unlinkat(AT_FDCWD, pathname); } -int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) +int vfs_symlink2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, const char *oldname) { - int error = may_create(dir, dentry); + int error = may_create(mnt, dir, dentry); if (error) return error; @@ -3936,6 +3990,12 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_symlink2); + +int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) +{ + return vfs_symlink2(NULL, dir, dentry, oldname); +} EXPORT_SYMBOL(vfs_symlink); SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, @@ -3958,7 +4018,7 @@ retry: error = security_path_symlink(&path, dentry, from->name); if (!error) - error = vfs_symlink(path.dentry->d_inode, dentry, from->name); + error = vfs_symlink2(path.mnt, path.dentry->d_inode, dentry, from->name); done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -3993,7 +4053,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. 
*/ -int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) +int vfs_link2(struct vfsmount *mnt, struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; @@ -4002,7 +4062,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de if (!inode) return -ENOENT; - error = may_create(dir, new_dentry); + error = may_create(mnt, dir, new_dentry); if (error) return error; @@ -4045,6 +4105,12 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de fsnotify_link(dir, inode, new_dentry); return error; } +EXPORT_SYMBOL(vfs_link2); + +int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) +{ + return vfs_link2(NULL, old_dentry, dir, new_dentry, delegated_inode); +} EXPORT_SYMBOL(vfs_link); /* @@ -4100,7 +4166,7 @@ retry: error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode); + error = vfs_link2(old_path.mnt, old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); if (delegated_inode) { @@ -4175,7 +4241,8 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * ->i_mutex on parents, which works but leads to some truly excessive * locking]. */ -int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, +int vfs_rename2(struct vfsmount *mnt, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, struct inode **delegated_inode, unsigned int flags) { @@ -4190,19 +4257,19 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (source == target) return 0; - error = may_delete(old_dir, old_dentry, is_dir); + error = may_delete(mnt, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { - error = may_create(new_dir, new_dentry); + error = may_create(mnt, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) - error = may_delete(new_dir, new_dentry, is_dir); + error = may_delete(mnt, new_dir, new_dentry, is_dir); else - error = may_delete(new_dir, new_dentry, new_is_dir); + error = may_delete(mnt, new_dir, new_dentry, new_is_dir); } if (error) return error; @@ -4219,12 +4286,12 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, */ if (new_dir != old_dir) { if (is_dir) { - error = inode_permission(source, MAY_WRITE); + error = inode_permission2(mnt, source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { - error = inode_permission(target, MAY_WRITE); + error = inode_permission2(mnt, target, MAY_WRITE); if (error) return error; } @@ -4307,6 +4374,14 @@ out: return error; } +EXPORT_SYMBOL(vfs_rename2); + +int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + struct inode **delegated_inode, unsigned int flags) +{ + return vfs_rename2(NULL, old_dir, old_dentry, new_dir, new_dentry, delegated_inode, flags); +} EXPORT_SYMBOL(vfs_rename); SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, @@ -4420,7 +4495,7 @@ retry_deleg: &new_path, new_dentry, flags); if (error) goto exit5; - error = vfs_rename(old_path.dentry->d_inode, 
old_dentry, + error = vfs_rename2(old_path.mnt, old_path.dentry->d_inode, old_dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode, flags); exit5: @@ -4465,7 +4540,7 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna int vfs_whiteout(struct inode *dir, struct dentry *dentry) { - int error = may_create(dir, dentry); + int error = may_create(NULL, dir, dentry); if (error) return error; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8e8e6bcd1d43..72559b94f159 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -475,7 +475,7 @@ static int fanotify_find_path(int dfd, const char __user *filename, } /* you can only watch an inode if you have read permissions on it */ - ret = inode_permission(path->dentry->d_inode, MAY_READ); + ret = inode_permission2(path->mnt, path->dentry->d_inode, MAY_READ); if (ret) path_put(path); out: diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index e2893f17dde2..4c5b43d15e6e 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -337,7 +337,7 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns if (error) return error; /* you can only watch an inode if you have read permissions on it */ - error = inode_permission(path->dentry->d_inode, MAY_READ); + error = inode_permission2(path->mnt, path->dentry->d_inode, MAY_READ); if (error) path_put(path); return error; diff --git a/fs/open.c b/fs/open.c index b6f1e96a7c0b..1786467725b9 100644 --- a/fs/open.c +++ b/fs/open.c @@ -68,9 +68,11 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, long vfs_truncate(struct path *path, loff_t length) { struct inode *inode; + struct vfsmount *mnt; long error; inode = path->dentry->d_inode; + mnt = path->mnt; /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ if (S_ISDIR(inode->i_mode)) @@ -82,7 +84,7 @@ long vfs_truncate(struct path *path, loff_t length) if (error) goto out; - error = inode_permission(inode, MAY_WRITE); + error = inode_permission2(mnt, inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; @@ -340,6 +342,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) struct cred *override_cred; struct path path; struct inode *inode; + struct vfsmount *mnt; int res; unsigned int lookup_flags = LOOKUP_FOLLOW; @@ -370,6 +373,7 @@ retry: goto out; inode = d_backing_inode(path.dentry); + mnt = path.mnt; if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { /* @@ -381,7 +385,7 @@ retry: goto out_path_release; } - res = inode_permission(inode, mode | MAY_ACCESS); + res = inode_permission2(mnt, inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; @@ -425,7 +429,7 @@ retry: if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + error = inode_permission2(path.mnt, path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; @@ -445,6 +449,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd) { struct fd f = fdget_raw(fd); struct inode *inode; + struct vfsmount *mnt; int error = -EBADF; error = -EBADF; @@ -452,12 +457,13 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd) goto out; inode = file_inode(f.file); + mnt = f.file->f_path.mnt; error = -ENOTDIR; if (!S_ISDIR(inode->i_mode)) goto out_putf; - error = inode_permission(inode, MAY_EXEC | 
MAY_CHDIR); + error = inode_permission2(mnt, inode, MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &f.file->f_path); out_putf: @@ -476,7 +482,7 @@ retry: if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + error = inode_permission2(path.mnt, path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; diff --git a/fs/utimes.c b/fs/utimes.c index aa138d64560a..c3c663dbf083 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -97,7 +97,7 @@ static int utimes_common(struct path *path, struct timespec *times) goto mnt_drop_write_and_out; if (!inode_owner_or_capable(inode)) { - error = inode_permission(inode, MAY_WRITE); + error = inode_permission2(path->mnt, inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 31382e732e9e..b5a863580120 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1503,13 +1503,21 @@ extern bool inode_owner_or_capable(const struct inode *inode); * VFS helper functions.. */ extern int vfs_create(struct inode *, struct dentry *, umode_t, bool); +extern int vfs_create2(struct vfsmount *, struct inode *, struct dentry *, umode_t, bool); extern int vfs_mkdir(struct inode *, struct dentry *, umode_t); +extern int vfs_mkdir2(struct vfsmount *, struct inode *, struct dentry *, umode_t); extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); +extern int vfs_mknod2(struct vfsmount *, struct inode *, struct dentry *, umode_t, dev_t); extern int vfs_symlink(struct inode *, struct dentry *, const char *); +extern int vfs_symlink2(struct vfsmount *, struct inode *, struct dentry *, const char *); extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **); +extern int vfs_link2(struct vfsmount *, struct dentry *, struct inode *, struct dentry *, struct inode **); extern int vfs_rmdir(struct inode *, struct dentry *); +extern int vfs_rmdir2(struct vfsmount *, struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); +extern int vfs_unlink2(struct vfsmount *, struct inode *, struct dentry *, struct inode **); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); +extern int vfs_rename2(struct vfsmount *, struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); extern int vfs_whiteout(struct inode *, struct dentry *); /* @@ -1635,6 +1643,7 @@ struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*follow_link) (struct dentry *, void **); int (*permission) (struct inode *, int); + int (*permission2) (struct vfsmount *, struct inode *, int); struct posix_acl * (*get_acl)(struct inode *, int); int (*readlink) (struct dentry *, char __user *,int); @@ -2442,7 +2451,9 @@ extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); +extern int inode_permission2(struct vfsmount *, struct inode *, int); extern int __inode_permission(struct inode *, int); +extern int __inode_permission2(struct vfsmount *, struct inode *, int); extern int generic_permission(struct inode *, int); extern int __check_sticky(struct inode *dir, struct inode *inode); diff --git a/include/linux/namei.h b/include/linux/namei.h index d53c25453aca..023359f18567 100644 --- a/include/linux/namei.h +++ 
b/include/linux/namei.h @@ -79,6 +79,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); +extern struct dentry *lookup_one_len2(const char *, struct vfsmount *mnt, struct dentry *, int); extern int follow_down_one(struct path *); extern int follow_down(struct path *); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 161a1807e6ef..25b7a678f9ef 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -747,7 +747,7 @@ static struct file *do_create(struct ipc_namespace *ipc_ns, struct inode *dir, } mode &= ~current_umask(); - ret = vfs_create(dir, path->dentry, mode, true); + ret = vfs_create2(path->mnt, dir, path->dentry, mode, true); path->dentry->d_fsdata = NULL; if (ret) return ERR_PTR(ret); @@ -763,7 +763,7 @@ static struct file *do_open(struct path *path, int oflag) if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) return ERR_PTR(-EINVAL); acc = oflag2acc[oflag & O_ACCMODE]; - if (inode_permission(d_inode(path->dentry), acc)) + if (inode_permission2(path->mnt, d_inode(path->dentry), acc)) return ERR_PTR(-EACCES); return dentry_open(path, oflag, current_cred()); } @@ -796,7 +796,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode, ro = mnt_want_write(mnt); /* we'll drop it in any case */ error = 0; mutex_lock(&d_inode(root)->i_mutex); - path.dentry = lookup_one_len(name->name, root, strlen(name->name)); + path.dentry = lookup_one_len2(name->name, mnt, root, strlen(name->name)); if (IS_ERR(path.dentry)) { error = PTR_ERR(path.dentry); goto out_putfd; @@ -867,7 +867,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) if (err) goto out_name; mutex_lock_nested(&d_inode(mnt->mnt_root)->i_mutex, I_MUTEX_PARENT); - dentry = lookup_one_len(name->name, mnt->mnt_root, + dentry = lookup_one_len2(name->name, mnt, mnt->mnt_root, strlen(name->name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -879,7 +879,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) err = -ENOENT; } else { ihold(inode); - err = vfs_unlink(d_inode(dentry->d_parent), dentry, NULL); + err = vfs_unlink2(mnt, d_inode(dentry->d_parent), dentry, NULL); } dput(dentry); diff --git a/security/inode.c b/security/inode.c index 16622aef9bde..0f1a041bf6cb 100644 --- a/security/inode.c +++ b/security/inode.c @@ -100,7 +100,7 @@ struct dentry *securityfs_create_file(const char *name, umode_t mode, dir = d_inode(parent); mutex_lock(&dir->i_mutex); - dentry = lookup_one_len(name, parent, strlen(name)); + dentry = lookup_one_len2(name, mount, parent, strlen(name)); if (IS_ERR(dentry)) goto out; From 1cbf8e31e3a7440a3ada22ba06f62d6359c27c0e Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 16:33:11 -0700 Subject: [PATCH 0622/3220] ANDROID: vfs: Add setattr2 for filesystems with per mount permissions This allows filesystems to use their mount private data to influence the permissions they use in setattr2. It has been separated into a new call to avoid disrupting current setattr users.
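As a sketch of a consumer (examplefs and its per-mount option struct are placeholders, not sdcardfs code), a filesystem that stores a permission mask with each mount could implement the new hook like this; notify_change2() below prefers ->setattr2 whenever a vfsmount is available and falls back to ->setattr otherwise:

	static int examplefs_setattr2(struct vfsmount *mnt, struct dentry *dentry,
				      struct iattr *ia)
	{
		struct examplefs_mnt_opts *opts = mnt->data;

		/* refuse mode bits masked out on this particular mount; a
		 * plain ->setattr never sees the vfsmount, so it cannot tell
		 * two mounts of the same superblock apart */
		if ((ia->ia_valid & ATTR_MODE) && (ia->ia_mode & opts->mask))
			return -EPERM;

		return simple_setattr(dentry, ia);
	}

	static const struct inode_operations examplefs_iops = {
		.setattr	= examplefs_setattr,	/* assumed fallback for mnt-less callers */
		.setattr2	= examplefs_setattr2,
	};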
Change-Id: I19959038309284448f1b7f232d579674ef546385 Signed-off-by: Daniel Rosenberg --- fs/attr.c | 12 ++++++++++-- fs/coredump.c | 2 +- fs/inode.c | 6 +++--- fs/namei.c | 2 +- fs/open.c | 21 ++++++++++++++------- fs/utimes.c | 2 +- include/linux/fs.h | 4 ++++ 7 files changed, 34 insertions(+), 15 deletions(-) diff --git a/fs/attr.c b/fs/attr.c index 6530ced19697..34926834cd49 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -187,7 +187,7 @@ EXPORT_SYMBOL(setattr_copy); * the file open for write, as there can be no conflicting delegation in * that case. */ -int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode) +int notify_change2(struct vfsmount *mnt, struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; umode_t mode = inode->i_mode; @@ -262,7 +262,9 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de if (error) return error; - if (inode->i_op->setattr) + if (mnt && inode->i_op->setattr2) + error = inode->i_op->setattr2(mnt, dentry, attr); + else if (inode->i_op->setattr) error = inode->i_op->setattr(dentry, attr); else error = simple_setattr(dentry, attr); @@ -275,4 +277,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de return error; } +EXPORT_SYMBOL(notify_change2); + +int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode) +{ + return notify_change2(NULL, dentry, attr, delegated_inode); +} EXPORT_SYMBOL(notify_change); diff --git a/fs/coredump.c b/fs/coredump.c index 1777331eee76..84d251b582e9 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -695,7 +695,7 @@ void do_coredump(const siginfo_t *siginfo) goto close_fail; if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) goto close_fail; - if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) + if (do_truncate2(cprm.file->f_path.mnt, cprm.file->f_path.dentry, 0, 0, cprm.file)) goto close_fail; } diff --git a/fs/inode.c b/fs/inode.c index 1be5f9003eb3..aa3cf5a1b819 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1715,7 +1715,7 @@ int dentry_needs_remove_privs(struct dentry *dentry) } EXPORT_SYMBOL(dentry_needs_remove_privs); -static int __remove_privs(struct dentry *dentry, int kill) +static int __remove_privs(struct vfsmount *mnt, struct dentry *dentry, int kill) { struct iattr newattrs; @@ -1724,7 +1724,7 @@ static int __remove_privs(struct dentry *dentry, int kill) * Note we call this on write, so notify_change will not * encounter any conflicting delegations: */ - return notify_change(dentry, &newattrs, NULL); + return notify_change2(mnt, dentry, &newattrs, NULL); } /* @@ -1746,7 +1746,7 @@ int file_remove_privs(struct file *file) if (kill < 0) return kill; if (kill) - error = __remove_privs(dentry, kill); + error = __remove_privs(file->f_path.mnt, dentry, kill); if (!error) inode_has_no_xattr(inode); diff --git a/fs/namei.c b/fs/namei.c index 82c7ec6532d5..0230bc59f344 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2760,7 +2760,7 @@ static int handle_truncate(struct file *filp) if (!error) error = security_path_truncate(path); if (!error) { - error = do_truncate(path->dentry, 0, + error = do_truncate2(path->mnt, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } diff --git a/fs/open.c b/fs/open.c index 1786467725b9..08406e19e2cf 100644 --- a/fs/open.c +++ b/fs/open.c @@ -34,8 +34,8 @@ #include "internal.h" -int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, - struct file *filp) +int do_truncate2(struct 
vfsmount *mnt, struct dentry *dentry, loff_t length, + unsigned int time_attrs, struct file *filp) { int ret; struct iattr newattrs; @@ -60,10 +60,15 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, mutex_lock(&dentry->d_inode->i_mutex); /* Note any delegations or leases have already been broken: */ - ret = notify_change(dentry, &newattrs, NULL); + ret = notify_change2(mnt, dentry, &newattrs, NULL); mutex_unlock(&dentry->d_inode->i_mutex); return ret; } +int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, + struct file *filp) +{ + return do_truncate2(NULL, dentry, length, time_attrs, filp); +} long vfs_truncate(struct path *path, loff_t length) { @@ -108,7 +113,7 @@ long vfs_truncate(struct path *path, loff_t length) if (!error) error = security_path_truncate(path); if (!error) - error = do_truncate(path->dentry, length, 0, NULL); + error = do_truncate2(mnt, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); @@ -157,6 +162,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { struct inode *inode; struct dentry *dentry; + struct vfsmount *mnt; struct fd f; int error; @@ -173,6 +179,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) small = 0; dentry = f.file->f_path.dentry; + mnt = f.file->f_path.mnt; inode = dentry->d_inode; error = -EINVAL; if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE)) @@ -192,7 +199,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) if (!error) error = security_path_truncate(&f.file->f_path); if (!error) - error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file); + error = do_truncate2(mnt, dentry, length, ATTR_MTIME|ATTR_CTIME, f.file); sb_end_write(inode->i_sb); out_putf: fdput(f); @@ -522,7 +529,7 @@ retry_deleg: goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(path->dentry, &newattrs, &delegated_inode); + error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode); out_unlock: mutex_unlock(&inode->i_mutex); if (delegated_inode) { @@ -602,7 +609,7 @@ retry_deleg: mutex_lock(&inode->i_mutex); error = security_path_chown(path, uid, gid); if (!error) - error = notify_change(path->dentry, &newattrs, &delegated_inode); + error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode); mutex_unlock(&inode->i_mutex); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); diff --git a/fs/utimes.c b/fs/utimes.c index c3c663dbf083..dfb457546c60 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -104,7 +104,7 @@ static int utimes_common(struct path *path, struct timespec *times) } retry_deleg: mutex_lock(&inode->i_mutex); - error = notify_change(path->dentry, &newattrs, &delegated_inode); + error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode); mutex_unlock(&inode->i_mutex); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); diff --git a/include/linux/fs.h b/include/linux/fs.h index b5a863580120..31bf8e1e58ff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1661,6 +1661,7 @@ struct inode_operations { int (*rename2) (struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*setattr) (struct dentry *, struct iattr *); + int (*setattr2) (struct vfsmount *, struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); int (*setxattr) 
(struct dentry *, const char *,const void *,size_t,int); ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); @@ -2226,6 +2227,8 @@ struct filename { extern long vfs_truncate(struct path *, loff_t); extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); +extern int do_truncate2(struct vfsmount *, struct dentry *, loff_t start, + unsigned int time_attrs, struct file *filp); extern int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern long do_sys_open(int dfd, const char __user *filename, int flags, @@ -2450,6 +2453,7 @@ extern void emergency_remount(void); extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *, struct inode **); +extern int notify_change2(struct vfsmount *, struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); extern int inode_permission2(struct vfsmount *, struct inode *, int); extern int __inode_permission(struct inode *, int); From f9cb61dcb00ca9a04b98a9f35ac09f3004c87138 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 16:48:45 -0700 Subject: [PATCH 0623/3220] ANDROID: sdcardfs: Use new permission2 functions Change-Id: Ic7e0fb8fdcebb31e657b079fe02ac834c4a50db9 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/inode.c | 25 +++++++++++++++++++------ fs/sdcardfs/sdcardfs.h | 4 ++-- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 3c353c95ef3e..dc64c9e2f5e7 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -54,6 +54,7 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, { int err; struct dentry *lower_dentry; + struct vfsmount *lower_dentry_mnt; struct dentry *lower_parent_dentry = NULL; struct path lower_path; const struct cred *saved_cred = NULL; @@ -73,6 +74,7 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; + lower_dentry_mnt = lower_path.mnt; lower_parent_dentry = lock_parent(lower_dentry); /* set last 16bytes of mode field to 0664 */ @@ -87,7 +89,7 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, } current->fs = copied_fs; current->fs->umask = 0; - err = vfs_create(d_inode(lower_parent_dentry), lower_dentry, mode, want_excl); + err = vfs_create2(lower_dentry_mnt, d_inode(lower_parent_dentry), lower_dentry, mode, want_excl); if (err) goto out; @@ -154,6 +156,7 @@ static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) { int err; struct dentry *lower_dentry; + struct vfsmount *lower_mnt; struct inode *lower_dir_inode = sdcardfs_lower_inode(dir); struct dentry *lower_dir_dentry; struct path lower_path; @@ -172,10 +175,11 @@ static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; + lower_mnt = lower_path.mnt; dget(lower_dentry); lower_dir_dentry = lock_parent(lower_dentry); - err = vfs_unlink(lower_dir_inode, lower_dentry, NULL); + err = vfs_unlink2(lower_mnt, lower_dir_inode, lower_dentry, NULL); /* * Note: unlinking on top of NFS can cause silly-renamed files.
@@ -256,6 +260,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode int err; int make_nomedia_in_obb = 0; struct dentry *lower_dentry; + struct vfsmount *lower_mnt; struct dentry *lower_parent_dentry = NULL; struct path lower_path; struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); @@ -286,6 +291,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode /* the lower_dentry is negative here */ sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; + lower_mnt = lower_path.mnt; lower_parent_dentry = lock_parent(lower_dentry); /* set last 16bytes of mode field to 0775 */ @@ -301,7 +307,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode } current->fs = copied_fs; current->fs->umask = 0; - err = vfs_mkdir(d_inode(lower_parent_dentry), lower_dentry, mode); + err = vfs_mkdir2(lower_mnt, d_inode(lower_parent_dentry), lower_dentry, mode); if (err) { unlock_dir(lower_parent_dentry); @@ -370,6 +376,7 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) { struct dentry *lower_dentry; struct dentry *lower_dir_dentry; + struct vfsmount *lower_mnt; int err; struct path lower_path; const struct cred *saved_cred = NULL; @@ -390,9 +397,10 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) sdcardfs_get_real_lower(dentry, &lower_path); lower_dentry = lower_path.dentry; + lower_mnt = lower_path.mnt; lower_dir_dentry = lock_parent(lower_dentry); - err = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry); + err = vfs_rmdir2(lower_mnt, d_inode(lower_dir_dentry), lower_dentry); if (err) goto out; @@ -456,6 +464,7 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *lower_new_dentry = NULL; struct dentry *lower_old_dir_dentry = NULL; struct dentry *lower_new_dir_dentry = NULL; + struct vfsmount *lower_mnt = NULL; struct dentry *trap = NULL; struct dentry *new_parent = NULL; struct path lower_old_path, lower_new_path; @@ -477,6 +486,7 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, sdcardfs_get_lower_path(new_dentry, &lower_new_path); lower_old_dentry = lower_old_path.dentry; lower_new_dentry = lower_new_path.dentry; + lower_mnt = lower_old_path.mnt; lower_old_dir_dentry = dget_parent(lower_old_dentry); lower_new_dir_dentry = dget_parent(lower_new_dentry); @@ -492,7 +502,8 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } - err = vfs_rename(d_inode(lower_old_dir_dentry), lower_old_dentry, + err = vfs_rename2(lower_mnt, + d_inode(lower_old_dir_dentry), lower_old_dentry, d_inode(lower_new_dir_dentry), lower_new_dentry, NULL, 0); if (err) @@ -642,6 +653,7 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) { int err; struct dentry *lower_dentry; + struct vfsmount *lower_mnt; struct inode *inode; struct inode *lower_inode; struct path lower_path; @@ -675,6 +687,7 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; + lower_mnt = lower_path.mnt; lower_inode = sdcardfs_lower_inode(inode); /* prepare our own lower struct iattr (with the lower file) */ @@ -718,7 +731,7 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) * tries to open(), unlink(), then ftruncate() a file. 
*/ mutex_lock(&d_inode(lower_dentry)->i_mutex); - err = notify_change(lower_dentry, &lower_ia, /* note: lower_ia */ + err = notify_change2(lower_mnt, lower_dentry, &lower_ia, /* note: lower_ia */ NULL); mutex_unlock(&d_inode(lower_dentry)->i_mutex); if (current->mm) diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index cfda98d257b6..5132f1dc5a4d 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -476,7 +476,7 @@ static inline int prepare_dir(const char *path_s, uid_t uid, gid_t gid, mode_t m goto out_unlock; } - err = vfs_mkdir(d_inode(parent.dentry), dent, mode); + err = vfs_mkdir2(parent.mnt, d_inode(parent.dentry), dent, mode); if (err) { if (err == -EEXIST) err = 0; @@ -487,7 +487,7 @@ static inline int prepare_dir(const char *path_s, uid_t uid, gid_t gid, mode_t m attrs.ia_gid = make_kgid(&init_user_ns, gid); attrs.ia_valid = ATTR_UID | ATTR_GID; mutex_lock(&d_inode(dent)->i_mutex); - notify_change(dent, &attrs, NULL); + notify_change2(parent.mnt, dent, &attrs, NULL); mutex_unlock(&d_inode(dent)->i_mutex); out_dput: From 6b6e896b0beed019c1fe4ad1b49c327d905956b8 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 17:36:05 -0700 Subject: [PATCH 0624/3220] ANDROID: sdcardfs: Add gid and mask to private mount data Adds support for mount2, remount2, and the functions to allocate/clone/copy the private data The next patch will switch over to actually using it. Change-Id: I8a43da26021d33401f655f0b2784ead161c575e3 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/main.c | 103 ++++++++++++++++++++++++++++++++++++----- fs/sdcardfs/sdcardfs.h | 8 ++++ fs/sdcardfs/super.c | 64 ++++++++++++++++++++++--- 3 files changed, 157 insertions(+), 18 deletions(-) diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 2decea3d1e3e..5400e7e63d27 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -49,7 +49,8 @@ static const match_table_t sdcardfs_tokens = { }; static int parse_options(struct super_block *sb, char *options, int silent, - int *debug, struct sdcardfs_mount_options *opts) + int *debug, struct sdcardfs_vfsmount_options *vfsopts, + struct sdcardfs_mount_options *opts) { char *p; substring_t args[MAX_OPT_ARGS]; @@ -58,9 +59,11 @@ static int parse_options(struct super_block *sb, char *options, int silent, /* by default, we use AID_MEDIA_RW as uid, gid */ opts->fs_low_uid = AID_MEDIA_RW; opts->fs_low_gid = AID_MEDIA_RW; + vfsopts->mask = 0; opts->mask = 0; opts->multiuser = false; opts->fs_user_id = 0; + vfsopts->gid = 0; opts->gid = 0; /* by default, 0MB is reserved */ opts->reserved_mb = 0; @@ -95,6 +98,7 @@ static int parse_options(struct super_block *sb, char *options, int silent, if (match_int(&args[0], &option)) return 0; opts->gid = option; + vfsopts->gid = option; break; case Opt_userid: if (match_int(&args[0], &option)) @@ -105,6 +109,7 @@ static int parse_options(struct super_block *sb, char *options, int silent, if (match_int(&args[0], &option)) return 0; opts->mask = option; + vfsopts->mask = option; break; case Opt_multiuser: opts->multiuser = true; @@ -135,6 +140,65 @@ static int parse_options(struct super_block *sb, char *options, int silent, return 0; } +int parse_options_remount(struct super_block *sb, char *options, int silent, + struct sdcardfs_vfsmount_options *vfsopts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + int debug; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, sdcardfs_tokens, args); + + switch 
(token) { + case Opt_debug: + debug = 1; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + vfsopts->gid = option; + + break; + case Opt_mask: + if (match_int(&args[0], &option)) + return 0; + vfsopts->mask = option; + break; + case Opt_multiuser: + case Opt_userid: + case Opt_fsuid: + case Opt_fsgid: + case Opt_reserved_mb: + printk( KERN_WARNING "Option \"%s\" can't be changed during remount\n", p); + break; + /* unknown option */ + default: + if (!silent) { + printk( KERN_ERR "Unrecognized mount option \"%s\" " + "or missing value", p); + } + return -EINVAL; + } + } + + if (debug) { + printk( KERN_INFO "sdcardfs : options - debug:%d\n", debug); + printk( KERN_INFO "sdcardfs : options - gid:%d\n", vfsopts->gid); + printk( KERN_INFO "sdcardfs : options - mask:%d\n", vfsopts->mask); + } + + return 0; +} + #if 0 /* * our custom d_alloc_root work-alike @@ -172,14 +236,15 @@ EXPORT_SYMBOL_GPL(sdcardfs_super_list); * There is no need to lock the sdcardfs_super_info's rwsem as there is no * way anyone can have a reference to the superblock at this point in time. */ -static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, - void *raw_data, int silent) +static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, + const char *dev_name, void *raw_data, int silent) { int err = 0; int debug; struct super_block *lower_sb; struct path lower_path; struct sdcardfs_sb_info *sb_info; + struct sdcardfs_vfsmount_options *mnt_opt = mnt->data; struct inode *inode; printk(KERN_INFO "sdcardfs version 2.0\n"); @@ -212,7 +277,7 @@ static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, sb_info = sb->s_fs_info; /* parse options */ - err = parse_options(sb, raw_data, silent, &debug, &sb_info->options); + err = parse_options(sb, raw_data, silent, &debug, mnt_opt, &sb_info->options); if (err) { printk(KERN_ERR "sdcardfs: invalid options\n"); goto out_freesbi; @@ -306,9 +371,9 @@ out: } /* A feature which supports mount_nodev() with options */ -static struct dentry *mount_nodev_with_options(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, const char *, void *, int)) +static struct dentry *mount_nodev_with_options(struct vfsmount *mnt, + struct file_system_type *fs_type, int flags, const char *dev_name, void *data, + int (*fill_super)(struct vfsmount *, struct super_block *, const char *, void *, int)) { int error; @@ -319,7 +384,7 @@ static struct dentry *mount_nodev_with_options(struct file_system_type *fs_type, s->s_flags = flags; - error = fill_super(s, dev_name, data, flags & MS_SILENT ? 1 : 0); + error = fill_super(mnt, s, dev_name, data, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); @@ -328,15 +393,27 @@ static struct dentry *mount_nodev_with_options(struct file_system_type *fs_type, return dget(s->s_root); } -struct dentry *sdcardfs_mount(struct file_system_type *fs_type, int flags, +static struct dentry *sdcardfs_mount(struct vfsmount *mnt, + struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data) { /* * dev_name is a lower_path_name, * raw_data is a option string. 
*/ - return mount_nodev_with_options(fs_type, flags, dev_name, - raw_data, sdcardfs_read_super); + return mount_nodev_with_options(mnt, fs_type, flags, dev_name, + raw_data, sdcardfs_read_super); +} + +static struct dentry *sdcardfs_mount_wrn(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + WARN(1, "sdcardfs does not support mount. Use mount2.\n"); + return ERR_PTR(-EINVAL); +} + +void *sdcardfs_alloc_mnt_data(void) { + return kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL); } void sdcardfs_kill_sb(struct super_block *sb) { @@ -353,7 +430,9 @@ void sdcardfs_kill_sb(struct super_block *sb) { static struct file_system_type sdcardfs_fs_type = { .owner = THIS_MODULE, .name = SDCARDFS_NAME, - .mount = sdcardfs_mount, + .mount = sdcardfs_mount_wrn, + .mount2 = sdcardfs_mount, + .alloc_mnt_data = sdcardfs_alloc_mnt_data, .kill_sb = sdcardfs_kill_sb, .fs_flags = 0, }; diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 5132f1dc5a4d..22ef29857022 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -193,6 +193,14 @@ struct sdcardfs_mount_options { unsigned int reserved_mb; }; +struct sdcardfs_vfsmount_options { + gid_t gid; + mode_t mask; +}; + +extern int parse_options_remount(struct super_block *sb, char *options, int silent, + struct sdcardfs_vfsmount_options *vfsopts); + /* sdcardfs super-block data in memory */ struct sdcardfs_sb_info { struct super_block *sb; diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index 0a465395aab7..edda32b68dc0 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -108,6 +108,50 @@ static int sdcardfs_remount_fs(struct super_block *sb, int *flags, char *options return err; } +/* + * @mnt: mount point we are remounting + * @sb: superblock we are remounting + * @flags: numeric mount options + * @options: mount options string + */ +static int sdcardfs_remount_fs2(struct vfsmount *mnt, struct super_block *sb, + int *flags, char *options) +{ + int err = 0; + + /* + * The VFS will take care of "ro" and "rw" flags among others. We + * can safely accept a few flags (RDONLY, MANDLOCK), and honor + * SILENT, but anything else left over is an error. + */ + if ((*flags & ~(MS_RDONLY | MS_MANDLOCK | MS_SILENT | MS_REMOUNT)) != 0) { + printk(KERN_ERR + "sdcardfs: remount flags 0x%x unsupported\n", *flags); + err = -EINVAL; + } + printk(KERN_INFO "Remount options were %s for vfsmnt %p.\n", options, mnt); + err = parse_options_remount(sb, options, *flags & ~MS_SILENT, mnt->data); + + + return err; +} + +static void* sdcardfs_clone_mnt_data(void *data) { + struct sdcardfs_vfsmount_options* opt = kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL); + struct sdcardfs_vfsmount_options* old = data; + if(!opt) return NULL; + opt->gid = old->gid; + opt->mask = old->mask; + return opt; +} + +static void sdcardfs_copy_mnt_data(void *data, void *newdata) { + struct sdcardfs_vfsmount_options* old = data; + struct sdcardfs_vfsmount_options* new = newdata; + old->gid = new->gid; + old->mask = new->mask; +} + /* * Called by iput() when the inode reference count reached zero * and the inode is not hashed anywhere. 
Used to clear anything @@ -191,19 +235,24 @@ static void sdcardfs_umount_begin(struct super_block *sb) lower_sb->s_op->umount_begin(lower_sb); } -static int sdcardfs_show_options(struct seq_file *m, struct dentry *root) +static int sdcardfs_show_options(struct vfsmount *mnt, struct seq_file *m, struct dentry *root) { struct sdcardfs_sb_info *sbi = SDCARDFS_SB(root->d_sb); struct sdcardfs_mount_options *opts = &sbi->options; + struct sdcardfs_vfsmount_options *vfsopts = mnt->data; if (opts->fs_low_uid != 0) - seq_printf(m, ",uid=%u", opts->fs_low_uid); + seq_printf(m, ",fsuid=%u", opts->fs_low_uid); if (opts->fs_low_gid != 0) - seq_printf(m, ",gid=%u", opts->fs_low_gid); - + seq_printf(m, ",fsgid=%u", opts->fs_low_gid); + if (vfsopts->gid != 0) + seq_printf(m, ",gid=%u", vfsopts->gid); if (opts->multiuser) seq_printf(m, ",multiuser"); - + if (vfsopts->mask) + seq_printf(m, ",mask=%u", vfsopts->mask); + if (opts->fs_user_id) + seq_printf(m, ",userid=%u", opts->fs_user_id); if (opts->reserved_mb != 0) seq_printf(m, ",reserved=%uMB", opts->reserved_mb); @@ -214,9 +263,12 @@ const struct super_operations sdcardfs_sops = { .put_super = sdcardfs_put_super, .statfs = sdcardfs_statfs, .remount_fs = sdcardfs_remount_fs, + .remount_fs2 = sdcardfs_remount_fs2, + .clone_mnt_data = sdcardfs_clone_mnt_data, + .copy_mnt_data = sdcardfs_copy_mnt_data, .evict_inode = sdcardfs_evict_inode, .umount_begin = sdcardfs_umount_begin, - .show_options = sdcardfs_show_options, + .show_options2 = sdcardfs_show_options, .alloc_inode = sdcardfs_alloc_inode, .destroy_inode = sdcardfs_destroy_inode, .drop_inode = generic_delete_inode, From 4d70f73115248761f35aaa9725a22e4b957d2938 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 20:27:20 -0700 Subject: [PATCH 0625/3220] ANDROID: sdcardfs: Use per mount permissions This switches sdcardfs over to using permission2. Instead of mounting several sdcardfs instances onto the same underlaying directory, you bind mount a single mount several times, and remount with the options you want. These are stored in the private mount data, allowing you to maintain the same tree, but have different permissions for different mount points. Warning functions have been added for permission, as it should never be called, and the correct behavior is unclear. Change-Id: I841b1d70ec60cf2b866fa48edeb74a0b0f8334f5 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 20 ++++-- fs/sdcardfs/inode.c | 127 ++++++++++++++++++++++++++++++------- fs/sdcardfs/lookup.c | 4 +- fs/sdcardfs/main.c | 8 +-- fs/sdcardfs/sdcardfs.h | 44 ++++++++----- 5 files changed, 150 insertions(+), 53 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 89daf69efbaa..066edbbb6ad6 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -141,13 +141,23 @@ void fixup_perms_recursive(struct dentry *dentry, const char* name, size_t len) info = SDCARDFS_I(d_inode(dentry)); if (needs_fixup(info->perm)) { + /* We need permission to fix up these values. + * Since permissions are based of of the mount, and + * we are accessing without the mount point, we create + * a fake mount with the permissions we will be using. 
+ */ + struct vfsmount fakemnt; + struct sdcardfs_vfsmount_options opts; + fakemnt.data = &opts; + opts.gid = AID_SDCARD_RW; + opts.mask = 0; mutex_lock(&d_inode(dentry)->i_mutex); - child = lookup_one_len(name, dentry, len); + child = lookup_one_len2(name, &fakemnt, dentry, len); mutex_unlock(&d_inode(dentry)->i_mutex); if (!IS_ERR(child)) { - if (child->d_inode) { + if (d_inode(child)) { get_derived_permission(dentry, child); - fix_derived_permission(d_inode(child)); + fixup_tmp_permissions(d_inode(child)); } dput(child); } @@ -172,7 +182,7 @@ void fixup_top_recursive(struct dentry *parent) { if (d_inode(dentry)) { if (SDCARDFS_I(d_inode(parent))->top != SDCARDFS_I(d_inode(dentry))->top) { get_derived_permission(parent, dentry); - fix_derived_permission(d_inode(dentry)); + fixup_tmp_permissions(d_inode(dentry)); fixup_top_recursive(dentry); } } @@ -202,7 +212,7 @@ inline void update_derived_permission_lock(struct dentry *dentry) dput(parent); } } - fix_derived_permission(dentry->d_inode); + fixup_tmp_permissions(d_inode(dentry)); } int need_graft_path(struct dentry *dentry) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index dc64c9e2f5e7..76a6e8ad0736 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -531,7 +531,7 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* At this point, not all dentry information has been moved, so * we pass along new_dentry for the name.*/ get_derived_permission_new(new_dentry->d_parent, old_dentry, new_dentry); - fix_derived_permission(d_inode(old_dentry)); + fixup_tmp_permissions(d_inode(old_dentry)); fixup_top_recursive(old_dentry); out: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); @@ -601,26 +601,63 @@ static const char *sdcardfs_follow_link(struct dentry *dentry, void **cookie) } #endif -static int sdcardfs_permission(struct inode *inode, int mask) +static int sdcardfs_permission_wrn(struct inode *inode, int mask) +{ + WARN(1, "sdcardfs does not support permission. Use permission2.\n"); + return -EINVAL; +} + +void copy_attrs(struct inode *dest, const struct inode *src) +{ + dest->i_mode = src->i_mode; + dest->i_uid = src->i_uid; + dest->i_gid = src->i_gid; + dest->i_rdev = src->i_rdev; + dest->i_atime = src->i_atime; + dest->i_mtime = src->i_mtime; + dest->i_ctime = src->i_ctime; + dest->i_blkbits = src->i_blkbits; + dest->i_flags = src->i_flags; +#ifdef CONFIG_FS_POSIX_ACL + dest->i_acl = src->i_acl; +#endif +#ifdef CONFIG_SECURITY + dest->i_security = src->i_security; +#endif +} + +static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int mask) { int err; + struct inode tmp; struct inode *top = grab_top(SDCARDFS_I(inode)); - if (!top) + if (!top) { + release_top(SDCARDFS_I(inode)); + WARN(1, "Top value was null!\n"); return -EINVAL; - /* Ensure owner is up to date */ - if (!uid_eq(inode->i_uid, top->i_uid)) { - SDCARDFS_I(inode)->d_uid = SDCARDFS_I(top)->d_uid; - fix_derived_permission(inode); } - release_top(SDCARDFS_I(inode)); /* * Permission check on sdcardfs inode. * Calling process should have AID_SDCARD_RW permission + * Since generic_permission only needs i_mode, i_uid, + * i_gid, and i_sb, we can create a fake inode to pass + * this information down in. + * + * The underlying code may attempt to take locks in some + * cases for features we're not using, but if that changes, + * locks must be dealt with to avoid undefined behavior. 
*/ - err = generic_permission(inode, mask); - + copy_attrs(&tmp, inode); + tmp.i_uid = make_kuid(&init_user_ns, SDCARDFS_I(top)->d_uid); + tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, SDCARDFS_I(top))); + tmp.i_mode = (inode->i_mode & S_IFMT) | get_mode(mnt, SDCARDFS_I(top)); + release_top(SDCARDFS_I(inode)); + tmp.i_sb = inode->i_sb; + if (IS_POSIXACL(inode)) + printk(KERN_WARNING "%s: This may be undefined behavior... \n", __func__); + err = generic_permission(&tmp, mask); /* XXX * Original sdcardfs code calls inode_permission(lower_inode,.. ) * for checking inode permission. But doing such things here seems @@ -649,7 +686,13 @@ static int sdcardfs_permission(struct inode *inode, int mask) } -static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) +static int sdcardfs_setattr_wrn(struct dentry *dentry, struct iattr *ia) +{ + WARN(1, "sdcardfs does not support setattr. User setattr2.\n"); + return -EINVAL; +} + +static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct iattr *ia) { int err; struct dentry *lower_dentry; @@ -659,17 +702,45 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) struct path lower_path; struct iattr lower_ia; struct dentry *parent; + struct inode tmp; + struct inode *top; + const struct cred *saved_cred = NULL; inode = d_inode(dentry); + top = grab_top(SDCARDFS_I(inode)); + + if (!top) { + release_top(SDCARDFS_I(inode)); + return -EINVAL; + } + + /* + * Permission check on sdcardfs inode. + * Calling process should have AID_SDCARD_RW permission + * Since generic_permission only needs i_mode, i_uid, + * i_gid, and i_sb, we can create a fake inode to pass + * this information down in. + * + * The underlying code may attempt to take locks in some + * cases for features we're not using, but if that changes, + * locks must be dealt with to avoid undefined behavior. + * + */ + copy_attrs(&tmp, inode); + tmp.i_uid = make_kuid(&init_user_ns, SDCARDFS_I(top)->d_uid); + tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, SDCARDFS_I(top))); + tmp.i_mode = (inode->i_mode & S_IFMT) | get_mode(mnt, SDCARDFS_I(top)); + tmp.i_size = i_size_read(inode); + release_top(SDCARDFS_I(inode)); + tmp.i_sb = inode->i_sb; /* * Check if user has permission to change inode. We don't check if * this user can change the lower inode: that should happen when * calling notify_change on the lower inode. */ - err = inode_change_ok(inode, ia); + err = inode_change_ok(&tmp, ia); - /* no vfs_XXX operations required, cred overriding will be skipped. 
wj*/ if (!err) { /* check the Android group ID */ parent = dget_parent(dentry); @@ -685,6 +756,9 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) if (err) goto out_err; + /* save current_cred and override it */ + OVERRIDE_CRED(SDCARDFS_SB(dentry->d_sb), saved_cred); + sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; lower_mnt = lower_path.mnt; @@ -708,7 +782,7 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) if (current->mm) down_write(¤t->mm->mmap_sem); if (ia->ia_valid & ATTR_SIZE) { - err = inode_newsize_ok(inode, ia->ia_size); + err = inode_newsize_ok(&tmp, ia->ia_size); if (err) { if (current->mm) up_write(¤t->mm->mmap_sem); @@ -750,11 +824,12 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) out: sdcardfs_put_lower_path(dentry, &lower_path); + REVERT_CRED(saved_cred); out_err: return err; } -static int sdcardfs_fillattr(struct inode *inode, struct kstat *stat) +static int sdcardfs_fillattr(struct vfsmount *mnt, struct inode *inode, struct kstat *stat) { struct sdcardfs_inode_info *info = SDCARDFS_I(inode); struct inode *top = grab_top(info); @@ -763,10 +838,10 @@ static int sdcardfs_fillattr(struct inode *inode, struct kstat *stat) stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; - stat->mode = (inode->i_mode & S_IFMT) | get_mode(SDCARDFS_I(top)); + stat->mode = (inode->i_mode & S_IFMT) | get_mode(mnt, SDCARDFS_I(top)); stat->nlink = inode->i_nlink; stat->uid = make_kuid(&init_user_ns, SDCARDFS_I(top)->d_uid); - stat->gid = make_kgid(&init_user_ns, get_gid(SDCARDFS_I(top))); + stat->gid = make_kgid(&init_user_ns, get_gid(mnt, SDCARDFS_I(top))); stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode->i_atime; @@ -807,14 +882,14 @@ static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, sdcardfs_copy_and_fix_attrs(inode, lower_inode); fsstack_copy_inode_size(inode, lower_inode); - err = sdcardfs_fillattr(inode, stat); + err = sdcardfs_fillattr(mnt, inode, stat); sdcardfs_put_lower_path(dentry, &lower_path); return err; } const struct inode_operations sdcardfs_symlink_iops = { - .permission = sdcardfs_permission, - .setattr = sdcardfs_setattr, + .permission2 = sdcardfs_permission, + .setattr2 = sdcardfs_setattr, /* XXX Following operations are implemented, * but FUSE(sdcard) or FAT does not support them * These methods are *NOT* perfectly tested. 
@@ -827,12 +902,14 @@ const struct inode_operations sdcardfs_symlink_iops = { const struct inode_operations sdcardfs_dir_iops = { .create = sdcardfs_create, .lookup = sdcardfs_lookup, - .permission = sdcardfs_permission, + .permission = sdcardfs_permission_wrn, + .permission2 = sdcardfs_permission, .unlink = sdcardfs_unlink, .mkdir = sdcardfs_mkdir, .rmdir = sdcardfs_rmdir, .rename = sdcardfs_rename, - .setattr = sdcardfs_setattr, + .setattr = sdcardfs_setattr_wrn, + .setattr2 = sdcardfs_setattr, .getattr = sdcardfs_getattr, /* XXX Following operations are implemented, * but FUSE(sdcard) or FAT does not support them @@ -844,7 +921,9 @@ const struct inode_operations sdcardfs_dir_iops = { }; const struct inode_operations sdcardfs_main_iops = { - .permission = sdcardfs_permission, - .setattr = sdcardfs_setattr, + .permission = sdcardfs_permission_wrn, + .permission2 = sdcardfs_permission, + .setattr = sdcardfs_setattr_wrn, + .setattr2 = sdcardfs_setattr, .getattr = sdcardfs_getattr, }; diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index c74a7d1bc18e..00a711ec2733 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -244,6 +244,7 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, if (err == -ENOENT) { struct dentry *child; struct dentry *match = NULL; + mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); spin_lock(&lower_dir_dentry->d_lock); list_for_each_entry(child, &lower_dir_dentry->d_subdirs, d_child) { if (child && d_inode(child)) { @@ -254,6 +255,7 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, } } spin_unlock(&lower_dir_dentry->d_lock); + mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); if (match) { err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, @@ -389,7 +391,7 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, sdcardfs_lower_inode(dentry->d_inode)); /* get derived permission */ get_derived_permission(parent, dentry); - fix_derived_permission(dentry->d_inode); + fixup_tmp_permissions(d_inode(dentry)); } /* update parent directory's atime */ fsstack_copy_attr_atime(parent->d_inode, diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 5400e7e63d27..eec10ccacd99 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -28,7 +28,6 @@ enum { Opt_fsgid, Opt_gid, Opt_debug, - Opt_lower_fs, Opt_mask, Opt_multiuser, // May need? 
Opt_userid, @@ -60,11 +59,9 @@ static int parse_options(struct super_block *sb, char *options, int silent, opts->fs_low_uid = AID_MEDIA_RW; opts->fs_low_gid = AID_MEDIA_RW; vfsopts->mask = 0; - opts->mask = 0; opts->multiuser = false; opts->fs_user_id = 0; vfsopts->gid = 0; - opts->gid = 0; /* by default, 0MB is reserved */ opts->reserved_mb = 0; @@ -97,7 +94,6 @@ static int parse_options(struct super_block *sb, char *options, int silent, case Opt_gid: if (match_int(&args[0], &option)) return 0; - opts->gid = option; vfsopts->gid = option; break; case Opt_userid: @@ -108,7 +104,6 @@ static int parse_options(struct super_block *sb, char *options, int silent, case Opt_mask: if (match_int(&args[0], &option)) return 0; - opts->mask = option; vfsopts->mask = option; break; case Opt_multiuser: @@ -258,6 +253,7 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, printk(KERN_INFO "sdcardfs: dev_name -> %s\n", dev_name); printk(KERN_INFO "sdcardfs: options -> %s\n", (char *)raw_data); + printk(KERN_INFO "sdcardfs: mnt -> %p\n", mnt); /* parse lower path */ err = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, @@ -342,7 +338,7 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, setup_derived_state(d_inode(sb->s_root), PERM_ROOT, sb_info->options.fs_user_id, AID_ROOT, false, d_inode(sb->s_root)); snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name); } - fix_derived_permission(sb->s_root->d_inode); + fixup_tmp_permissions(d_inode(sb->s_root)); sb_info->sb = sb; list_add(&sb_info->list, &sdcardfs_super_list); mutex_unlock(&sdcardfs_super_list_lock); diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 22ef29857022..b03130329014 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -68,14 +68,20 @@ #define AID_PACKAGE_INFO 1027 -#define fix_derived_permission(x) \ + +/* + * Permissions are handled by our permission function. + * We don't want anyone who happens to look at our inode value to prematurely + * block access, so store more permissive values. These are probably never + * used. + */ +#define fixup_tmp_permissions(x) \ do { \ (x)->i_uid = make_kuid(&init_user_ns, SDCARDFS_I(x)->d_uid); \ - (x)->i_gid = make_kgid(&init_user_ns, get_gid(SDCARDFS_I(x))); \ - (x)->i_mode = ((x)->i_mode & S_IFMT) | get_mode(SDCARDFS_I(x));\ + (x)->i_gid = make_kgid(&init_user_ns, AID_SDCARD_RW); \ + (x)->i_mode = ((x)->i_mode & S_IFMT) | 0775;\ } while (0) - /* OVERRIDE_CRED() and REVERT_CRED() * OVERRID_CRED() * backup original task->cred @@ -187,8 +193,6 @@ struct sdcardfs_mount_options { uid_t fs_low_uid; gid_t fs_low_gid; userid_t fs_user_id; - gid_t gid; - mode_t mask; bool multiuser; unsigned int reserved_mb; }; @@ -360,9 +364,10 @@ static inline void release_top(struct sdcardfs_inode_info *info) iput(info->top); } -static inline int get_gid(struct sdcardfs_inode_info *info) { - struct sdcardfs_sb_info *sb_info = SDCARDFS_SB(info->vfs_inode.i_sb); - if (sb_info->options.gid == AID_SDCARD_RW) { +static inline int get_gid(struct vfsmount *mnt, struct sdcardfs_inode_info *info) { + struct sdcardfs_vfsmount_options *opts = mnt->data; + + if (opts->gid == AID_SDCARD_RW) { /* As an optimization, certain trusted system components only run * as owner but operate across all users. Since we're now handing * out the sdcard_rw GID only to trusted apps, we're okay relaxing @@ -370,14 +375,15 @@ static inline int get_gid(struct sdcardfs_inode_info *info) { * assigned to app directories are still multiuser aware. 
 	 */
 		return AID_SDCARD_RW;
 	} else {
-		return multiuser_get_uid(info->userid, sb_info->options.gid);
+		return multiuser_get_uid(info->userid, opts->gid);
 	}
 }
 
-static inline int get_mode(struct sdcardfs_inode_info *info) {
+static inline int get_mode(struct vfsmount *mnt, struct sdcardfs_inode_info *info) {
 	int owner_mode;
 	int filtered_mode;
-	struct sdcardfs_sb_info * sb_info = SDCARDFS_SB(info->vfs_inode.i_sb);
-	int visible_mode = 0775 & ~sb_info->options.mask;
+	struct sdcardfs_vfsmount_options *opts = mnt->data;
+	int visible_mode = 0775 & ~opts->mask;
+
 	if (info->perm == PERM_PRE_ROOT) {
 		/* Top of multi-user view should always be visible to ensure
@@ -387,7 +393,7 @@ static inline int get_mode(struct sdcardfs_inode_info *info) {
 		/* Block "other" access to Android directories, since only apps
 		 * belonging to a specific user should be in there; we still
 		 * leave +x open for the default view. */
-		if (sb_info->options.gid == AID_SDCARD_RW) {
+		if (opts->gid == AID_SDCARD_RW) {
 			visible_mode = visible_mode & ~0006;
 		} else {
 			visible_mode = visible_mode & ~0007;
@@ -553,12 +559,16 @@ static inline int check_min_free_space(struct dentry *dentry, size_t size, int d
 	return 1;
 }
 
-/* Copies attrs and maintains sdcardfs managed attrs */
+/*
+ * Copies attrs and maintains sdcardfs managed attrs
+ * Since our permission check handles all special permissions, set those to be open
+ */
 static inline void sdcardfs_copy_and_fix_attrs(struct inode *dest, const struct inode *src)
 {
-	dest->i_mode = (src->i_mode & S_IFMT) | get_mode(SDCARDFS_I(dest));
+	dest->i_mode = (src->i_mode & S_IFMT) | S_IRWXU | S_IRWXG |
+			S_IROTH | S_IXOTH; /* 0775 */
 	dest->i_uid = make_kuid(&init_user_ns, SDCARDFS_I(dest)->d_uid);
-	dest->i_gid = make_kgid(&init_user_ns, get_gid(SDCARDFS_I(dest)));
+	dest->i_gid = make_kgid(&init_user_ns, AID_SDCARD_RW);
 	dest->i_rdev = src->i_rdev;
 	dest->i_atime = src->i_atime;
 	dest->i_mtime = src->i_mtime;

From 9eaefe628d1b144ed0893418639df1ef6cc1aac6 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Tue, 15 Nov 2016 13:35:18 -0800
Subject: [PATCH 0626/3220] ANDROID: sdcardfs: Change magic value

Sdcardfs uses the same magic value as wrapfs. This should not be the
case. As it is entirely in memory, the value can be changed without any
loss of compatibility.

Change-Id: I24200b805d5e6d32702638be99e47d50d7f2f746
Signed-off-by: Daniel Rosenberg
---
 include/uapi/linux/magic.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index cfb5c406f344..e97d5b3ccfa8 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -52,7 +52,7 @@
 #define REISER2FS_SUPER_MAGIC_STRING	"ReIsEr2Fs"
 #define REISER2FS_JR_SUPER_MAGIC_STRING	"ReIsEr3Fs"
 
-#define SDCARDFS_SUPER_MAGIC	0xb550ca10
+#define SDCARDFS_SUPER_MAGIC	0x5dca2df5
 
 #define SMB_SUPER_MAGIC		0x517B
 #define CGROUP_SUPER_MAGIC	0x27e0eb

From 5cb5648935924fde6366062fc3a144ba06bfbd17 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Tue, 27 Dec 2016 12:36:29 -0800
Subject: [PATCH 0627/3220] ANDROID: sdcardfs: Fix locking issue with permission fix up

Don't use lookup_one_len so we can grab the spinlock that protects
d_subdirs.
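For reference, the fix replaces the locked lookup with a direct walk of
the parent's d_subdirs under d_lock, taking a reference on each child
before touching it. A condensed sketch of the resulting pattern (names
as in the hunk below; the recursive-descent branch is handled the same
way):

	spin_lock(&dentry->d_lock);
	list_for_each_entry(child, &dentry->d_subdirs, d_child) {
		dget(child);
		if (!strncasecmp(child->d_name.name, name, len) &&
				d_inode(child)) {
			get_derived_permission(dentry, child);
			fixup_tmp_permissions(d_inode(child));
			dput(child);
			break;
		}
		dput(child);
	}
	spin_unlock(&dentry->d_lock);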
Bug: 30954918 Change-Id: I0c6a393252db7beb467e0d563739a3a14e1b5115 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 066edbbb6ad6..c77695c8f729 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -141,32 +141,26 @@ void fixup_perms_recursive(struct dentry *dentry, const char* name, size_t len) info = SDCARDFS_I(d_inode(dentry)); if (needs_fixup(info->perm)) { - /* We need permission to fix up these values. - * Since permissions are based of of the mount, and - * we are accessing without the mount point, we create - * a fake mount with the permissions we will be using. - */ - struct vfsmount fakemnt; - struct sdcardfs_vfsmount_options opts; - fakemnt.data = &opts; - opts.gid = AID_SDCARD_RW; - opts.mask = 0; - mutex_lock(&d_inode(dentry)->i_mutex); - child = lookup_one_len2(name, &fakemnt, dentry, len); - mutex_unlock(&d_inode(dentry)->i_mutex); - if (!IS_ERR(child)) { - if (d_inode(child)) { - get_derived_permission(dentry, child); - fixup_tmp_permissions(d_inode(child)); - } - dput(child); + spin_lock(&dentry->d_lock); + list_for_each_entry(child, &dentry->d_subdirs, d_child) { + dget(child); + if (!strncasecmp(child->d_name.name, name, len)) { + if (d_inode(child)) { + get_derived_permission(dentry, child); + fixup_tmp_permissions(d_inode(child)); + dput(child); + break; + } + } + dput(child); } + spin_unlock(&dentry->d_lock); } else if (descendant_may_need_fixup(info->perm)) { - mutex_lock(&d_inode(dentry)->i_mutex); + spin_lock(&dentry->d_lock); list_for_each_entry(child, &dentry->d_subdirs, d_child) { fixup_perms_recursive(child, name, len); } - mutex_unlock(&d_inode(dentry)->i_mutex); + spin_unlock(&dentry->d_lock); } dput(dentry); } From e33aa348eec662cc9dfb078445dda1644b294a86 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 5 Jan 2017 14:37:11 -0800 Subject: [PATCH 0628/3220] ANDROID: sdcardfs: Switch ->d_inode to d_inode() Change-Id: I12375cc2d6e82fb8adf0319be971f335f8d7a312 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 16 ++++++++-------- fs/sdcardfs/file.c | 2 +- fs/sdcardfs/lookup.c | 14 +++++++------- fs/sdcardfs/main.c | 2 +- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index c77695c8f729..0c1a91f70b40 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -49,8 +49,8 @@ void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, /* While renaming, there is a point where we want the path from dentry, but the name from newdentry */ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, struct dentry *newdentry) { - struct sdcardfs_inode_info *info = SDCARDFS_I(dentry->d_inode); - struct sdcardfs_inode_info *parent_info= SDCARDFS_I(parent->d_inode); + struct sdcardfs_inode_info *info = SDCARDFS_I(d_inode(dentry)); + struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); appid_t appid; /* By default, each inode inherits from its parent. @@ -61,7 +61,7 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, st * stage of each system call by fix_derived_permission(inode). 
*/ - inherit_derived_state(parent->d_inode, dentry->d_inode); + inherit_derived_state(d_inode(parent), d_inode(dentry)); /* Derive custom permissions based on parent and current node */ switch (parent_info->perm) { @@ -134,7 +134,7 @@ void fixup_perms_recursive(struct dentry *dentry, const char* name, size_t len) struct sdcardfs_inode_info *info; if (!dget(dentry)) return; - if (!dentry->d_inode) { + if (!d_inode(dentry)) { dput(dentry); return; } @@ -189,7 +189,7 @@ inline void update_derived_permission_lock(struct dentry *dentry) { struct dentry *parent; - if(!dentry || !dentry->d_inode) { + if(!dentry || !d_inode(dentry)) { printk(KERN_ERR "sdcardfs: %s: invalid dentry\n", __func__); return; } @@ -198,7 +198,7 @@ inline void update_derived_permission_lock(struct dentry *dentry) * 2. remove the root dentry update */ if(IS_ROOT(dentry)) { - //setup_default_pre_root_state(dentry->d_inode); + //setup_default_pre_root_state(d_inode(dentry)); } else { parent = dget_parent(dentry); if(parent) { @@ -213,7 +213,7 @@ int need_graft_path(struct dentry *dentry) { int ret = 0; struct dentry *parent = dget_parent(dentry); - struct sdcardfs_inode_info *parent_info= SDCARDFS_I(parent->d_inode); + struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); if(parent_info->perm == PERM_ANDROID && @@ -272,7 +272,7 @@ int is_base_obbpath(struct dentry *dentry) { int ret = 0; struct dentry *parent = dget_parent(dentry); - struct sdcardfs_inode_info *parent_info= SDCARDFS_I(parent->d_inode); + struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); spin_lock(&SDCARDFS_D(dentry)->lock); diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index c249fa982d3c..7750a0472389 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -216,7 +216,7 @@ static int sdcardfs_open(struct inode *inode, struct file *file) goto out_err; } - if(!check_caller_access_to_name(parent->d_inode, dentry->d_name.name)) { + if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 00a711ec2733..e94a65c8bbbd 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -179,7 +179,7 @@ int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb, struct inode *lower_inode; struct super_block *lower_sb; - lower_inode = lower_path->dentry->d_inode; + lower_inode = d_inode(lower_path->dentry); lower_sb = sdcardfs_lower_super(sb); /* check that the lower file system didn't cross a mount point */ @@ -359,7 +359,7 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, parent = dget_parent(dentry); - if(!check_caller_access_to_name(parent->d_inode, dentry->d_name.name)) { + if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name)) { ret = ERR_PTR(-EACCES); printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", @@ -386,16 +386,16 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, } if (ret) dentry = ret; - if (dentry->d_inode) { - fsstack_copy_attr_times(dentry->d_inode, - sdcardfs_lower_inode(dentry->d_inode)); + if (d_inode(dentry)) { + fsstack_copy_attr_times(d_inode(dentry), + sdcardfs_lower_inode(d_inode(dentry))); /* get derived permission */ 
 		get_derived_permission(parent, dentry);
 		fixup_tmp_permissions(d_inode(dentry));
 	}
 	/* update parent directory's atime */
-	fsstack_copy_attr_atime(parent->d_inode,
-				sdcardfs_lower_inode(parent->d_inode));
+	fsstack_copy_attr_atime(d_inode(parent),
+				sdcardfs_lower_inode(d_inode(parent)));
 
 out:
 	sdcardfs_put_lower_path(parent, &lower_parent_path);
diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c
index eec10ccacd99..7a8eae29e44d 100644
--- a/fs/sdcardfs/main.c
+++ b/fs/sdcardfs/main.c
@@ -297,7 +297,7 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb,
 	sb->s_op = &sdcardfs_sops;
 
 	/* get a new inode and allocate our root dentry */
-	inode = sdcardfs_iget(sb, lower_path.dentry->d_inode, 0);
+	inode = sdcardfs_iget(sb, d_inode(lower_path.dentry), 0);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto out_sput;

From b5858221c1c4f4bdc9ef67eb75ecf22580368820 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Thu, 5 Jan 2017 14:37:11 -0800
Subject: [PATCH 0629/3220] ANDROID: mnt: remount should propagate to slaves of slaves

propagate_remount was not accounting for the slave mounts of other
slave mounts, leading to some namespaces not receiving the remount
information.

Bug: 33731928
Change-Id: Idc9e8c2ed126a4143229fc23f10a959c2d0a3854
Signed-off-by: Daniel Rosenberg
---
 fs/pnode.c | 27 +++++++++++++++++++++------
 fs/pnode.h |  2 +-
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index 4e2d78ec053a..7bb1879e8442 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -451,16 +451,31 @@ int propagate_umount(struct list_head *list)
 	return 0;
 }
 
-int propagate_remount(struct mount *mnt) {
-	struct mount *m;
+/*
+ * Iterates over all slaves, and slaves of slaves.
+ */
+static struct mount *next_descendent(struct mount *root, struct mount *cur)
+{
+	if (!IS_MNT_NEW(cur) && !list_empty(&cur->mnt_slave_list))
+		return first_slave(cur);
+	do {
+		if (cur->mnt_slave.next != &cur->mnt_master->mnt_slave_list)
+			return next_slave(cur);
+		cur = cur->mnt_master;
+	} while (cur != root);
+	return NULL;
+}
+
+void propagate_remount(struct mount *mnt)
+{
+	struct mount *m = mnt;
 	struct super_block *sb = mnt->mnt.mnt_sb;
-	int ret = 0;
 
 	if (sb->s_op->copy_mnt_data) {
-		for (m = first_slave(mnt); m->mnt_slave.next != &mnt->mnt_slave_list; m = next_slave(m)) {
+		m = next_descendent(mnt, m);
+		while (m) {
 			sb->s_op->copy_mnt_data(m->mnt.data, mnt->mnt.data);
+			m = next_descendent(mnt, m);
 		}
 	}
-
-	return ret;
 }
diff --git a/fs/pnode.h b/fs/pnode.h
index 4e8e94dc9e6a..3cb58c0cdcbc 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -44,7 +44,7 @@ int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
 int propagate_umount(struct list_head *);
 int propagate_mount_busy(struct mount *, int);
 void propagate_mount_unlock(struct mount *);
-int propagate_remount(struct mount *);
+void propagate_remount(struct mount *);
 void mnt_release_group_id(struct mount *);
 int get_dominating_id(struct mount *mnt, const struct path *root);
 unsigned int mnt_get_count(struct mount *mnt);

From d240a0a145287caab3601ac48c55d696488ebf1c Mon Sep 17 00:00:00 2001
From: Cameron Gutman
Date: Thu, 23 Jun 2016 10:24:42 -0700
Subject: [PATCH 0630/3220] BACKPORT: Input: xpad - fix oops when attaching an unknown Xbox One gamepad

Xbox One controllers have multiple interfaces which all have the same
class, subclass, and protocol. One of these interfaces has only a
single endpoint.
When Xpad attempts to bind to this interface, it causes an oops when
trying to initialize the output URB by accessing the second endpoint's
descriptor.

This situation was avoided for known Xbox One devices by checking the
XTYPE constant associated with the VID and PID tuple. However, this
breaks when new or previously unknown Xbox One controllers are attached
to the system.

This change addresses the problem by deriving the XTYPE for Xbox One
controllers based on the interface protocol before checking the
interface number.

Change-Id: If15a19cde514ffdeddb506da9c4d34479408005a
Fixes: 1a48ff81b391 ("Input: xpad - add support for Xbox One controllers")
Signed-off-by: Cameron Gutman
Cc: stable@vger.kernel.org
Signed-off-by: Dmitry Torokhov
---
 drivers/input/joystick/xpad.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c
index fd4100d56d8c..35e444b4b8b0 100644
--- a/drivers/input/joystick/xpad.c
+++ b/drivers/input/joystick/xpad.c
@@ -1206,16 +1206,6 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id
 			break;
 	}
 
-	if (xpad_device[i].xtype == XTYPE_XBOXONE &&
-	    intf->cur_altsetting->desc.bInterfaceNumber != 0) {
-		/*
-		 * The Xbox One controller lists three interfaces all with the
-		 * same interface class, subclass and protocol. Differentiate by
-		 * interface number.
-		 */
-		return -ENODEV;
-	}
-
 	xpad = kzalloc(sizeof(struct usb_xpad), GFP_KERNEL);
 	if (!xpad)
 		return -ENOMEM;
@@ -1246,6 +1236,8 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id
 		if (intf->cur_altsetting->desc.bInterfaceClass == USB_CLASS_VENDOR_SPEC) {
 			if (intf->cur_altsetting->desc.bInterfaceProtocol == 129)
 				xpad->xtype = XTYPE_XBOX360W;
+			else if (intf->cur_altsetting->desc.bInterfaceProtocol == 208)
+				xpad->xtype = XTYPE_XBOXONE;
 			else
 				xpad->xtype = XTYPE_XBOX360;
 		} else {
@@ -1260,6 +1252,17 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id
 			xpad->mapping |= MAP_STICKS_TO_NULL;
 	}
 
+	if (xpad->xtype == XTYPE_XBOXONE &&
+	    intf->cur_altsetting->desc.bInterfaceNumber != 0) {
+		/*
+		 * The Xbox One controller lists three interfaces all with the
+		 * same interface class, subclass and protocol. Differentiate by
+		 * interface number.
+		 */
+		error = -ENODEV;
+		goto err_free_in_urb;
+	}
+
 	error = xpad_init_output(intf, xpad);
 	if (error)
 		goto err_free_in_urb;

From a07b34771ace2077f218aa6eb30f4107495a6c2e Mon Sep 17 00:00:00 2001
From: Cameron Gutman
Date: Wed, 29 Jun 2016 09:51:35 -0700
Subject: [PATCH 0631/3220] BACKPORT: Input: xpad - validate USB endpoint count during probe

This prevents a malicious USB device from causing an oops.
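For context, xpad's probe path assumes the interface exposes both an in
and an out endpoint; a crafted device that reports a single endpoint
makes the second index read past the endpoint array. An illustrative
sketch of the access the new check below guards against (struct and
field names are from the USB core, shown here only to make the oops
concrete):

	struct usb_host_interface *alt = intf->cur_altsetting;

	/* only safe once alt->desc.bNumEndpoints == 2 has been checked */
	struct usb_endpoint_descriptor *ep_in  = &alt->endpoint[0].desc;
	struct usb_endpoint_descriptor *ep_out = &alt->endpoint[1].desc;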
Change-Id: I47c27541a4c2f0cec354eb83b3013bb825ed6e90 Signed-off-by: Cameron Gutman Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/xpad.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 35e444b4b8b0..2b2f9d66c2c7 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -1200,6 +1200,9 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id int ep_irq_in_idx; int i, error; + if (intf->cur_altsetting->desc.bNumEndpoints != 2) + return -ENODEV; + for (i = 0; xpad_device[i].idVendor; i++) { if ((le16_to_cpu(udev->descriptor.idVendor) == xpad_device[i].idVendor) && (le16_to_cpu(udev->descriptor.idProduct) == xpad_device[i].idProduct)) From 1cd3d347147bee1b8a3fb7624ab23eb3bdcece41 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 30 Jan 2017 12:26:08 -0800 Subject: [PATCH 0632/3220] ANDROID: fs: Export free_fs_struct and set_fs_pwd allmodconfig builds fail with: ERROR: "free_fs_struct" undefined! ERROR: "set_fs_pwd" undefined! Export the missing symbols. Change-Id: I4877ead19d7e7f0c93d4c4cad5681364284323aa Fixes: 0ec03f845799 ("ANDROID: sdcardfs: override umask on mkdir and create") Signed-off-by: Guenter Roeck --- fs/fs_struct.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 005dcb401369..940c683561dd 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -44,6 +44,7 @@ void set_fs_pwd(struct fs_struct *fs, const struct path *path) if (old_pwd.dentry) path_put(&old_pwd); } +EXPORT_SYMBOL(set_fs_pwd); static inline int replace_path(struct path *p, const struct path *old, const struct path *new) { @@ -89,6 +90,7 @@ void free_fs_struct(struct fs_struct *fs) path_put(&fs->pwd); kmem_cache_free(fs_cachep, fs); } +EXPORT_SYMBOL(free_fs_struct); void exit_fs(struct task_struct *tsk) { From 4c7fc336f6e3ec34af2c6931fce4d496f560da90 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 30 Jan 2017 12:29:00 -0800 Subject: [PATCH 0633/3220] ANDROID: fs: Export vfs_rmdir2 allmodconfig builds fail with ERROR: "vfs_rmdir2" undefined! Export the missing function. 
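The pattern matches the fs_struct exports above: any VFS helper that a
modular sdcardfs calls needs an EXPORT_SYMBOL next to its definition so
modpost can resolve the reference. A minimal sketch (body elided; the
signature follows the vfs_rmdir wrapper below):

	int vfs_rmdir2(struct vfsmount *mnt, struct inode *dir,
		       struct dentry *dentry)
	{
		/* ... */
	}
	EXPORT_SYMBOL(vfs_rmdir2);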
Change-Id: I983d327e59fd34e0484f3c54d925e97d3905c19c Fixes: f9cb61dcb00c ("ANDROID: sdcardfs: User new permission2 functions") Signed-off-by: Guenter Roeck --- fs/namei.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/namei.c b/fs/namei.c index 0230bc59f344..a896620f6d79 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3742,6 +3742,8 @@ out: d_delete(dentry); return error; } +EXPORT_SYMBOL(vfs_rmdir2); + int vfs_rmdir(struct inode *dir, struct dentry *dentry) { return vfs_rmdir2(NULL, dir, dentry); From df3087d4836dd35c138c628c50b0986169183ee9 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Wed, 1 Feb 2017 12:53:45 +0530 Subject: [PATCH 0634/3220] ANDROID: binder: fix format specifier for type binder_size_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix following warning on 32bit ARCH build: CC drivers/android/binder.o drivers/android/binder.c: In function ‘binder_transaction’: ./include/linux/kern_levels.h:4:18: warning: format ‘%lld’ expects argument of type ‘long long int’, but argument 4 has type ‘binder_size_t {aka unsigned int}’ [-Wformat=] drivers/android/binder.c:2047:3: note: in expansion of macro ‘binder_user_error’ binder_user_error("%d:%d got transaction with unaligned buffers size, %lld\n", ^ Change-Id: I943d0d4d54f7f2a019900cc18e55bed661bec5a5 Fixes: Change-Id: I02417f28cff14688f2e1d6fcb959438fd96566cc (android: binder: support for scatter-gather.") Signed-off-by: Amit Pundir --- drivers/android/binder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 951393825261..2196244c9647 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2047,7 +2047,7 @@ static void binder_transaction(struct binder_proc *proc, if (!IS_ALIGNED(extra_buffers_size, sizeof(u64))) { binder_user_error("%d:%d got transaction with unaligned buffers size, %lld\n", proc->pid, thread->pid, - extra_buffers_size); + (u64)extra_buffers_size); return_error = BR_FAILED_REPLY; goto err_bad_offset; } From 3e4f5484dcd596661af85bb28e96d946255e99f4 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 20 Jan 2017 15:19:13 -0800 Subject: [PATCH 0635/3220] ANDROID: sdcardfs: Allow non-owners to touch This modifies the permission checks in setattr to allow for non-owners to modify the timestamp of files to things other than the current time. This still requires write access, as enforced by the permission call, but relaxes the requirement that the caller must be the owner, allowing those with group permissions to change it as well. Bug: 11118565 Change-Id: Ied31f0cce2797675c7ef179eeb4e088185adcbad Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 76a6e8ad0736..6f450e523675 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -739,6 +739,11 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct * this user can change the lower inode: that should happen when * calling notify_change on the lower inode. */ + /* prepare our own lower struct iattr (with the lower file) */ + memcpy(&lower_ia, ia, sizeof(lower_ia)); + /* Allow touch updating timestamps. A previous permission check ensures + * we have write access. 
Changes to mode, owner, and group are ignored*/ + ia->ia_valid |= ATTR_FORCE; err = inode_change_ok(&tmp, ia); if (!err) { @@ -764,8 +769,6 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct lower_mnt = lower_path.mnt; lower_inode = sdcardfs_lower_inode(inode); - /* prepare our own lower struct iattr (with the lower file) */ - memcpy(&lower_ia, ia, sizeof(lower_ia)); if (ia->ia_valid & ATTR_FILE) lower_ia.ia_file = sdcardfs_lower_file(ia->ia_file); From d9300a998d196f15ad750f2ce568c37a49e3d82e Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Sat, 21 Jan 2017 00:35:26 -0800 Subject: [PATCH 0636/3220] ANDROID: sdcardfs: Refactor configfs interface This refactors the configfs code to be more easily extended. It will allow additional files to be added easily. Bug: 34542611 Bug: 34262585 Change-Id: I73c9b0ae5ca7eb27f4ebef3e6807f088b512d539 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/packagelist.c | 133 +++++++++++++++----------------------- 1 file changed, 53 insertions(+), 80 deletions(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 03776fa5f26c..0b3fb50b1fe4 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -220,26 +220,24 @@ static void packagelist_destroy(void) printk(KERN_INFO "sdcardfs: destroyed packagelist pkgld\n"); } -struct package_appid { +struct package_details { struct config_item item; - int add_pid; + const char *name; }; -static inline struct package_appid *to_package_appid(struct config_item *item) +static inline struct package_details *to_package_details(struct config_item *item) { - return item ? container_of(item, struct package_appid, item) : NULL; + return item ? container_of(item, struct package_details, item) : NULL; } -static ssize_t package_appid_attr_show(struct config_item *item, - char *page) +static ssize_t package_details_appid_show(struct config_item *item, char *page) { - return scnprintf(page, PAGE_SIZE, "%u\n", get_appid(item->ci_name)); + return scnprintf(page, PAGE_SIZE, "%u\n", get_appid(to_package_details(item)->name)); } -static ssize_t package_appid_attr_store(struct config_item *item, +static ssize_t package_details_appid_store(struct config_item *item, const char *page, size_t count) { - struct package_appid *package_appid = to_package_appid(item); unsigned int tmp; int ret; @@ -247,73 +245,60 @@ static ssize_t package_appid_attr_store(struct config_item *item, if (ret) return ret; - ret = insert_packagelist_entry(item->ci_name, tmp); - package_appid->add_pid = tmp; + ret = insert_packagelist_entry(to_package_details(item)->name, tmp); + if (ret) return ret; return count; } -static struct configfs_attribute package_appid_attr_add_pid = { - .ca_owner = THIS_MODULE, - .ca_name = "appid", - .ca_mode = S_IRUGO | S_IWUGO, - .show = package_appid_attr_show, - .store = package_appid_attr_store, -}; +static void package_details_release(struct config_item *item) +{ + struct package_details *package_details = to_package_details(item); + printk(KERN_INFO "sdcardfs: removing %s\n", package_details->name); + remove_packagelist_entry(package_details->name); + kfree(package_details->name); + kfree(package_details); +} -static struct configfs_attribute *package_appid_attrs[] = { - &package_appid_attr_add_pid, +CONFIGFS_ATTR(package_details_, appid); + +static struct configfs_attribute *package_details_attrs[] = { + &package_details_attr_appid, NULL, }; -static void package_appid_release(struct config_item *item) -{ - printk(KERN_INFO "sdcardfs: removing %s\n", 
item->ci_dentry->d_name.name); - /* item->ci_name is freed already, so we rely on the dentry */ - remove_packagelist_entry(item->ci_dentry->d_name.name); - kfree(to_package_appid(item)); -} - -static struct configfs_item_operations package_appid_item_ops = { - .release = package_appid_release, +static struct configfs_item_operations package_details_item_ops = { + .release = package_details_release, }; static struct config_item_type package_appid_type = { - .ct_item_ops = &package_appid_item_ops, - .ct_attrs = package_appid_attrs, + .ct_item_ops = &package_details_item_ops, + .ct_attrs = package_details_attrs, .ct_owner = THIS_MODULE, }; - -struct sdcardfs_packages { - struct config_group group; -}; - -static inline struct sdcardfs_packages *to_sdcardfs_packages(struct config_item *item) +static struct config_item *packages_make_item(struct config_group *group, const char *name) { - return item ? container_of(to_config_group(item), struct sdcardfs_packages, group) : NULL; -} + struct package_details *package_details; -static struct config_item *sdcardfs_packages_make_item(struct config_group *group, const char *name) -{ - struct package_appid *package_appid; - - package_appid = kzalloc(sizeof(struct package_appid), GFP_KERNEL); - if (!package_appid) + package_details = kzalloc(sizeof(struct package_details), GFP_KERNEL); + if (!package_details) return ERR_PTR(-ENOMEM); + package_details->name = kstrdup(name, GFP_KERNEL); + if (!package_details->name) { + kfree(package_details); + return ERR_PTR(-ENOMEM); + } - config_item_init_type_name(&package_appid->item, name, + config_item_init_type_name(&package_details->item, name, &package_appid_type); - package_appid->add_pid = 0; - - return &package_appid->item; + return &package_details->item; } -static ssize_t packages_attr_show(struct config_item *item, - char *page) +static ssize_t packages_list_show(struct config_item *item, char *page) { struct hashtable_entry *hash_cur; int i; @@ -335,49 +320,37 @@ static ssize_t packages_attr_show(struct config_item *item, return count; } -static struct configfs_attribute sdcardfs_packages_attr_description = { - .ca_owner = THIS_MODULE, - .ca_name = "packages_gid.list", - .ca_mode = S_IRUGO, - .show = packages_attr_show, +static struct configfs_attribute packages_attr_packages_gid_list = { + .ca_name = "packages_gid.list", + .ca_mode = S_IRUGO, + .ca_owner = THIS_MODULE, + .show = packages_list_show, }; -static struct configfs_attribute *sdcardfs_packages_attrs[] = { - &sdcardfs_packages_attr_description, +static struct configfs_attribute *packages_attrs[] = { + &packages_attr_packages_gid_list, NULL, }; -static void sdcardfs_packages_release(struct config_item *item) -{ - - printk(KERN_INFO "sdcardfs: destroyed something?\n"); - kfree(to_sdcardfs_packages(item)); -} - -static struct configfs_item_operations sdcardfs_packages_item_ops = { - .release = sdcardfs_packages_release, -}; - /* * Note that, since no extra work is required on ->drop_item(), * no ->drop_item() is provided. 
*/ -static struct configfs_group_operations sdcardfs_packages_group_ops = { - .make_item = sdcardfs_packages_make_item, +static struct configfs_group_operations packages_group_ops = { + .make_item = packages_make_item, }; -static struct config_item_type sdcardfs_packages_type = { - .ct_item_ops = &sdcardfs_packages_item_ops, - .ct_group_ops = &sdcardfs_packages_group_ops, - .ct_attrs = sdcardfs_packages_attrs, +static struct config_item_type packages_type = { + .ct_group_ops = &packages_group_ops, + .ct_attrs = packages_attrs, .ct_owner = THIS_MODULE, }; -static struct configfs_subsystem sdcardfs_packages_subsys = { +static struct configfs_subsystem sdcardfs_packages = { .su_group = { .cg_item = { .ci_namebuf = "sdcardfs", - .ci_type = &sdcardfs_packages_type, + .ci_type = &packages_type, }, }, }; @@ -385,7 +358,7 @@ static struct configfs_subsystem sdcardfs_packages_subsys = { static int configfs_sdcardfs_init(void) { int ret; - struct configfs_subsystem *subsys = &sdcardfs_packages_subsys; + struct configfs_subsystem *subsys = &sdcardfs_packages; config_group_init(&subsys->su_group); mutex_init(&subsys->su_mutex); @@ -400,7 +373,7 @@ static int configfs_sdcardfs_init(void) static void configfs_sdcardfs_exit(void) { - configfs_unregister_subsystem(&sdcardfs_packages_subsys); + configfs_unregister_subsystem(&sdcardfs_packages); } int packagelist_init(void) From 09c77d6230066003be095f1ac315622ec17ee355 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Sun, 22 Jan 2017 15:32:49 -0800 Subject: [PATCH 0637/3220] ANDROID: sdcardfs: add support for user permission isolation This allows you to hide the existence of a package from a user by adding them to an exclude list. If a user creates that package's folder and is on the exclude list, they will not see that package's id. 
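Concretely, permission derivation now consults the per-user exclude
list before handing out a package's uid; condensed from the
derived_perm.c hunk below:

	appid = get_appid(newdentry->d_name.name);
	if (appid != 0 &&
			!is_excluded(newdentry->d_name.name, parent_info->userid))
		info->d_uid = multiuser_get_uid(parent_info->userid, appid);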
Bug: 34542611 Change-Id: I9eb82e0bf2457d7eb81ee56153b9c7d2f6646323 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 32 ++-- fs/sdcardfs/packagelist.c | 297 ++++++++++++++++++++++++++++++++++--- fs/sdcardfs/sdcardfs.h | 17 ++- 3 files changed, 306 insertions(+), 40 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 0c1a91f70b40..8e3baee4a2d9 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -103,7 +103,7 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, st case PERM_ANDROID_OBB: case PERM_ANDROID_MEDIA: appid = get_appid(newdentry->d_name.name); - if (appid != 0) { + if (appid != 0 && !is_excluded(newdentry->d_name.name, parent_info->userid)) { info->d_uid = multiuser_get_uid(parent_info->userid, appid); } set_top(info, &info->vfs_inode); @@ -116,8 +116,10 @@ void get_derived_permission(struct dentry *parent, struct dentry *dentry) get_derived_permission_new(parent, dentry, dentry); } -static int descendant_may_need_fixup(perm_t perm) { - if (perm == PERM_PRE_ROOT || perm == PERM_ROOT || perm == PERM_ANDROID) +static int descendant_may_need_fixup(struct sdcardfs_inode_info *info, struct limit_search *limit) { + if (info->perm == PERM_ROOT) + return (limit->flags & BY_USERID)?info->userid == limit->userid:1; + if (info->perm == PERM_PRE_ROOT || info->perm == PERM_ANDROID) return 1; return 0; } @@ -129,7 +131,7 @@ static int needs_fixup(perm_t perm) { return 0; } -void fixup_perms_recursive(struct dentry *dentry, const char* name, size_t len) { +void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit) { struct dentry *child; struct sdcardfs_inode_info *info; if (!dget(dentry)) @@ -143,22 +145,22 @@ void fixup_perms_recursive(struct dentry *dentry, const char* name, size_t len) if (needs_fixup(info->perm)) { spin_lock(&dentry->d_lock); list_for_each_entry(child, &dentry->d_subdirs, d_child) { - dget(child); - if (!strncasecmp(child->d_name.name, name, len)) { - if (d_inode(child)) { - get_derived_permission(dentry, child); - fixup_tmp_permissions(d_inode(child)); - dput(child); - break; - } + dget(child); + if (!(limit->flags & BY_NAME) || !strncasecmp(child->d_name.name, limit->name, limit->length)) { + if (d_inode(child)) { + get_derived_permission(dentry, child); + fixup_tmp_permissions(d_inode(child)); + dput(child); + break; } - dput(child); + } + dput(child); } spin_unlock(&dentry->d_lock); - } else if (descendant_may_need_fixup(info->perm)) { + } else if (descendant_may_need_fixup(info, limit)) { spin_lock(&dentry->d_lock); list_for_each_entry(child, &dentry->d_subdirs, d_child) { - fixup_perms_recursive(child, name, len); + fixup_perms_recursive(child, limit); } spin_unlock(&dentry->d_lock); } diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 0b3fb50b1fe4..6eb73ddc2ceb 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -31,11 +31,13 @@ struct hashtable_entry { struct hlist_node hlist; + struct hlist_node dlist; /* for deletion cleanup */ const char *key; atomic_t value; }; static DEFINE_HASHTABLE(package_to_appid, 8); +static DEFINE_HASHTABLE(package_to_userid, 8); static struct kmem_cache *hashtable_entry_cachep; @@ -69,6 +71,22 @@ appid_t get_appid(const char *app_name) return 0; } +appid_t is_excluded(const char *app_name, userid_t user) +{ + struct hashtable_entry *hash_cur; + unsigned int hash = str_hash(app_name); + + rcu_read_lock(); + hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { 
+ if (atomic_read(&hash_cur->value) == user && !strcasecmp(app_name, hash_cur->key)) { + rcu_read_unlock(); + return 1; + } + } + rcu_read_unlock(); + return 0; +} + /* Kernel has already enforced everything we returned through * derive_permissions_locked(), so this is used to lock down access * even further, such as enforcing that apps hold sdcard_rw. */ @@ -124,7 +142,7 @@ static struct hashtable_entry *alloc_packagelist_entry(const char *key, return ret; } -static int insert_packagelist_entry_locked(const char *key, appid_t value) +static int insert_packagelist_appid_entry_locked(const char *key, appid_t value) { struct hashtable_entry *hash_cur; struct hashtable_entry *new_entry; @@ -143,18 +161,64 @@ static int insert_packagelist_entry_locked(const char *key, appid_t value) return 0; } -static void fixup_perms(struct super_block *sb, const char *key) { - if (sb && sb->s_magic == SDCARDFS_SUPER_MAGIC) { - fixup_perms_recursive(sb->s_root, key, strlen(key)); +static int insert_userid_exclude_entry_locked(const char *key, userid_t value) +{ + struct hashtable_entry *hash_cur; + struct hashtable_entry *new_entry; + unsigned int hash = str_hash(key); + + /* Only insert if not already present */ + hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { + if (atomic_read(&hash_cur->value) == value && !strcasecmp(key, hash_cur->key)) + return 0; + } + new_entry = alloc_packagelist_entry(key, value); + if (!new_entry) + return -ENOMEM; + hash_add_rcu(package_to_userid, &new_entry->hlist, hash); + return 0; +} + +static void fixup_all_perms_name(const char *key) +{ + struct sdcardfs_sb_info *sbinfo; + struct limit_search limit = { + .flags = BY_NAME, + .name = key, + .length = strlen(key), + }; + list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { + if (sbinfo_has_sdcard_magic(sbinfo)) + fixup_perms_recursive(sbinfo->sb->s_root, &limit); } } -static void fixup_all_perms(const char *key) +static void fixup_all_perms_name_userid(const char *key, userid_t userid) { struct sdcardfs_sb_info *sbinfo; - list_for_each_entry(sbinfo, &sdcardfs_super_list, list) - if (sbinfo) - fixup_perms(sbinfo->sb, key); + struct limit_search limit = { + .flags = BY_NAME | BY_USERID, + .name = key, + .length = strlen(key), + .userid = userid, + }; + list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { + if (sbinfo_has_sdcard_magic(sbinfo)) + fixup_perms_recursive(sbinfo->sb->s_root, &limit); + } +} + +static void fixup_all_perms_userid(userid_t userid) +{ + struct sdcardfs_sb_info *sbinfo; + struct limit_search limit = { + .flags = BY_USERID, + .userid = userid, + }; + list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { + if (sbinfo_has_sdcard_magic(sbinfo)) + fixup_perms_recursive(sbinfo->sb->s_root, &limit); + } } static int insert_packagelist_entry(const char *key, appid_t value) @@ -162,9 +226,22 @@ static int insert_packagelist_entry(const char *key, appid_t value) int err; mutex_lock(&sdcardfs_super_list_lock); - err = insert_packagelist_entry_locked(key, value); + err = insert_packagelist_appid_entry_locked(key, value); if (!err) - fixup_all_perms(key); + fixup_all_perms_name(key); + mutex_unlock(&sdcardfs_super_list_lock); + + return err; +} + +static int insert_userid_exclude_entry(const char *key, userid_t value) +{ + int err; + + mutex_lock(&sdcardfs_super_list_lock); + err = insert_userid_exclude_entry_locked(key, value); + if (!err) + fixup_all_perms_name_userid(key, value); mutex_unlock(&sdcardfs_super_list_lock); return err; @@ -173,7 +250,7 @@ static int 
insert_packagelist_entry(const char *key, appid_t value) static void free_packagelist_entry(struct hashtable_entry *entry) { kfree(entry->key); - hash_del_rcu(&entry->hlist); + hash_del_rcu(&entry->dlist); kmem_cache_free(hashtable_entry_cachep, entry); } @@ -181,22 +258,84 @@ static void remove_packagelist_entry_locked(const char *key) { struct hashtable_entry *hash_cur; unsigned int hash = str_hash(key); + struct hlist_node *h_t; + HLIST_HEAD(free_list); + hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { + if (!strcasecmp(key, hash_cur->key)) { + hash_del_rcu(&hash_cur->hlist); + hlist_add_head(&hash_cur->dlist, &free_list); + } + } hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { if (!strcasecmp(key, hash_cur->key)) { hash_del_rcu(&hash_cur->hlist); - synchronize_rcu(); - free_packagelist_entry(hash_cur); - return; + hlist_add_head(&hash_cur->dlist, &free_list); + break; } } + synchronize_rcu(); + hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) + free_packagelist_entry(hash_cur); } static void remove_packagelist_entry(const char *key) { mutex_lock(&sdcardfs_super_list_lock); remove_packagelist_entry_locked(key); - fixup_all_perms(key); + fixup_all_perms_name(key); + mutex_unlock(&sdcardfs_super_list_lock); + return; +} + +static void remove_userid_all_entry_locked(userid_t userid) +{ + struct hashtable_entry *hash_cur; + struct hlist_node *h_t; + HLIST_HEAD(free_list); + int i; + + hash_for_each_rcu(package_to_userid, i, hash_cur, hlist) { + if (atomic_read(&hash_cur->value) == userid) { + hash_del_rcu(&hash_cur->hlist); + hlist_add_head(&hash_cur->dlist, &free_list); + } + } + synchronize_rcu(); + hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) { + free_packagelist_entry(hash_cur); + } +} + +static void remove_userid_all_entry(userid_t userid) +{ + mutex_lock(&sdcardfs_super_list_lock); + remove_userid_all_entry_locked(userid); + fixup_all_perms_userid(userid); + mutex_unlock(&sdcardfs_super_list_lock); + return; +} + +static void remove_userid_exclude_entry_locked(const char *key, userid_t userid) +{ + struct hashtable_entry *hash_cur; + unsigned int hash = str_hash(key); + + hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { + if (!strcasecmp(key, hash_cur->key) && atomic_read(&hash_cur->value) == userid) { + hash_del_rcu(&hash_cur->hlist); + synchronize_rcu(); + free_packagelist_entry(hash_cur); + break; + } + } +} + +static void remove_userid_exclude_entry(const char *key, userid_t userid) +{ + mutex_lock(&sdcardfs_super_list_lock); + remove_userid_exclude_entry_locked(key, userid); + fixup_all_perms_name_userid(key, userid); mutex_unlock(&sdcardfs_super_list_lock); return; } @@ -210,16 +349,44 @@ static void packagelist_destroy(void) mutex_lock(&sdcardfs_super_list_lock); hash_for_each_rcu(package_to_appid, i, hash_cur, hlist) { hash_del_rcu(&hash_cur->hlist); - hlist_add_head(&hash_cur->hlist, &free_list); - + hlist_add_head(&hash_cur->dlist, &free_list); + } + hash_for_each_rcu(package_to_userid, i, hash_cur, hlist) { + hash_del_rcu(&hash_cur->hlist); + hlist_add_head(&hash_cur->dlist, &free_list); } synchronize_rcu(); - hlist_for_each_entry_safe(hash_cur, h_t, &free_list, hlist) + hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) free_packagelist_entry(hash_cur); mutex_unlock(&sdcardfs_super_list_lock); printk(KERN_INFO "sdcardfs: destroyed packagelist pkgld\n"); } +#define SDCARDFS_CONFIGFS_ATTR(_pfx, _name) \ +static struct configfs_attribute _pfx##attr_##_name = { \ + .ca_name = 
__stringify(_name), \ + .ca_mode = S_IRUGO | S_IWUGO, \ + .ca_owner = THIS_MODULE, \ + .show = _pfx##_name##_show, \ + .store = _pfx##_name##_store, \ +} + +#define SDCARDFS_CONFIGFS_ATTR_RO(_pfx, _name) \ +static struct configfs_attribute _pfx##attr_##_name = { \ + .ca_name = __stringify(_name), \ + .ca_mode = S_IRUGO, \ + .ca_owner = THIS_MODULE, \ + .show = _pfx##_name##_show, \ +} + +#define SDCARDFS_CONFIGFS_ATTR_WO(_pfx, _name) \ +static struct configfs_attribute _pfx##attr_##_name = { \ + .ca_name = __stringify(_name), \ + .ca_mode = S_IWUGO, \ + .ca_owner = THIS_MODULE, \ + .store = _pfx##_name##_store, \ +} + struct package_details { struct config_item item; const char *name; @@ -253,6 +420,58 @@ static ssize_t package_details_appid_store(struct config_item *item, return count; } +static ssize_t package_details_excluded_userids_show(struct config_item *item, + char *page) +{ + struct package_details *package_details = to_package_details(item); + struct hashtable_entry *hash_cur; + unsigned int hash = str_hash(package_details->name); + int count = 0; + + rcu_read_lock(); + hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { + if (!strcasecmp(package_details->name, hash_cur->key)) + count += scnprintf(page + count, PAGE_SIZE - count, + "%d ", atomic_read(&hash_cur->value)); + } + rcu_read_unlock(); + if (count) + count--; + count += scnprintf(page + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t package_details_excluded_userids_store(struct config_item *item, + const char *page, size_t count) +{ + unsigned int tmp; + int ret; + + ret = kstrtouint(page, 10, &tmp); + if (ret) + return ret; + + ret = insert_userid_exclude_entry(to_package_details(item)->name, tmp); + + if (ret) + return ret; + + return count; +} + +static ssize_t package_details_clear_userid_store(struct config_item *item, + const char *page, size_t count) +{ + unsigned int tmp; + int ret; + + ret = kstrtouint(page, 10, &tmp); + if (ret) + return ret; + remove_userid_exclude_entry(to_package_details(item)->name, tmp); + return count; +} + static void package_details_release(struct config_item *item) { struct package_details *package_details = to_package_details(item); @@ -262,10 +481,14 @@ static void package_details_release(struct config_item *item) kfree(package_details); } -CONFIGFS_ATTR(package_details_, appid); +SDCARDFS_CONFIGFS_ATTR(package_details_, appid); +SDCARDFS_CONFIGFS_ATTR(package_details_, excluded_userids); +SDCARDFS_CONFIGFS_ATTR_WO(package_details_, clear_userid); static struct configfs_attribute *package_details_attrs[] = { &package_details_attr_appid, + &package_details_attr_excluded_userids, + &package_details_attr_clear_userid, NULL, }; @@ -293,23 +516,33 @@ static struct config_item *packages_make_item(struct config_group *group, const } config_item_init_type_name(&package_details->item, name, - &package_appid_type); + &package_appid_type); return &package_details->item; } static ssize_t packages_list_show(struct config_item *item, char *page) { - struct hashtable_entry *hash_cur; + struct hashtable_entry *hash_cur_app; + struct hashtable_entry *hash_cur_user; int i; int count = 0, written = 0; const char errormsg[] = "\n"; + unsigned int hash; rcu_read_lock(); - hash_for_each_rcu(package_to_appid, i, hash_cur, hlist) { + hash_for_each_rcu(package_to_appid, i, hash_cur_app, hlist) { written = scnprintf(page + count, PAGE_SIZE - sizeof(errormsg) - count, "%s %d\n", - (const char *)hash_cur->key, atomic_read(&hash_cur->value)); - if (count + written == 
PAGE_SIZE - sizeof(errormsg)) { + hash_cur_app->key, atomic_read(&hash_cur_app->value)); + hash = str_hash(hash_cur_app->key); + hash_for_each_possible_rcu(package_to_userid, hash_cur_user, hlist, hash) { + if (!strcasecmp(hash_cur_app->key, hash_cur_user->key)) { + written += scnprintf(page + count + written - 1, + PAGE_SIZE - sizeof(errormsg) - count - written + 1, + " %d\n", atomic_read(&hash_cur_user->value)) - 1; + } + } + if (count + written == PAGE_SIZE - sizeof(errormsg) - 1) { count += scnprintf(page + count, PAGE_SIZE - count, errormsg); break; } @@ -320,6 +553,19 @@ static ssize_t packages_list_show(struct config_item *item, char *page) return count; } +static ssize_t packages_remove_userid_store(struct config_item *item, + const char *page, size_t count) +{ + unsigned int tmp; + int ret; + + ret = kstrtouint(page, 10, &tmp); + if (ret) + return ret; + remove_userid_all_entry(tmp); + return count; +} + static struct configfs_attribute packages_attr_packages_gid_list = { .ca_name = "packages_gid.list", .ca_mode = S_IRUGO, @@ -327,8 +573,11 @@ static struct configfs_attribute packages_attr_packages_gid_list = { .show = packages_list_show, }; +SDCARDFS_CONFIGFS_ATTR_WO(packages_, remove_userid); + static struct configfs_attribute *packages_attrs[] = { &packages_attr_packages_gid_list, + &packages_attr_remove_userid, NULL, }; diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index b03130329014..3434849cee63 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -335,6 +335,11 @@ static inline void sdcardfs_put_reset_##pname(const struct dentry *dent) \ SDCARDFS_DENT_FUNC(lower_path) SDCARDFS_DENT_FUNC(orig_path) +static inline bool sbinfo_has_sdcard_magic(struct sdcardfs_sb_info *sbinfo) +{ + return sbinfo && sbinfo->sb && sbinfo->sb->s_magic == SDCARDFS_SUPER_MAGIC; +} + /* grab a refererence if we aren't linking to ourself */ static inline void set_top(struct sdcardfs_inode_info *info, struct inode *top) { @@ -442,18 +447,28 @@ extern struct list_head sdcardfs_super_list; /* for packagelist.c */ extern appid_t get_appid(const char *app_name); +extern appid_t is_excluded(const char *app_name, userid_t userid); extern int check_caller_access_to_name(struct inode *parent_node, const char* name); extern int open_flags_to_access_mode(int open_flags); extern int packagelist_init(void); extern void packagelist_exit(void); /* for derived_perm.c */ +#define BY_NAME (1 << 0) +#define BY_USERID (1 << 1) +struct limit_search { + unsigned int flags; + const char *name; + size_t length; + userid_t userid; +}; + extern void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, uid_t uid, bool under_android, struct inode *top); extern void get_derived_permission(struct dentry *parent, struct dentry *dentry); extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, struct dentry *newdentry); extern void fixup_top_recursive(struct dentry *parent); -extern void fixup_perms_recursive(struct dentry *dentry, const char *name, size_t len); +extern void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit); extern void update_derived_permission_lock(struct dentry *dentry); extern int need_graft_path(struct dentry *dentry); From d46684aa425d950ed86d4e3c10367b2a5c0b355d Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 26 Jan 2017 20:10:34 -0800 Subject: [PATCH 0638/3220] ANDROID: sdcardfs: Remove redundant operation We call get_derived_permission_new unconditionally, so we don't need to call 
update_derived_permission_lock, which does the same thing.

Change-Id: I0748100828c6af806da807241a33bf42be614935
Signed-off-by: Daniel Rosenberg
---
 fs/sdcardfs/inode.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c
index 6f450e523675..a2cad0f76f14 100644
--- a/fs/sdcardfs/inode.c
+++ b/fs/sdcardfs/inode.c
@@ -466,7 +466,6 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct dentry *lower_new_dir_dentry = NULL;
 	struct vfsmount *lower_mnt = NULL;
 	struct dentry *trap = NULL;
-	struct dentry *new_parent = NULL;
 	struct path lower_old_path, lower_new_path;
 	const struct cred *saved_cred = NULL;

@@ -516,17 +515,6 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (new_dir != old_dir) {
 		sdcardfs_copy_and_fix_attrs(old_dir, d_inode(lower_old_dir_dentry));
 		fsstack_copy_inode_size(old_dir, d_inode(lower_old_dir_dentry));
-
-		/* update the derived permission of the old_dentry
-		 * with its new parent
-		 */
-		new_parent = dget_parent(new_dentry);
-		if(new_parent) {
-			if(d_inode(old_dentry)) {
-				update_derived_permission_lock(old_dentry);
-			}
-			dput(new_parent);
-		}
 	}
 	/* At this point, not all dentry information has been moved, so
 	 * we pass along new_dentry for the name.*/

From 461da83e67affea70e2f70a0231606ad3f29f5b7 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Wed, 25 Jan 2017 13:48:45 -0800
Subject: [PATCH 0639/3220] ANDROID: sdcardfs: Add GID Derivation to sdcardfs

This changes sdcardfs to modify the user and group in the underlying
filesystem depending on its usage. Ownership is set by Android user,
and package, as well as if the file is under obb or cache. Other files
can be labeled by extension. Those values are set via the configfs
interface.
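For concreteness, a worked example using the constants this patch adds
to multiuser.h (the user and appid values are illustrative): for user
10 and a package mapped to appid 10034,

    uid       = 10 * 100000 + 10034                   = 1010034
    cache gid = 10 * 100000 + 20000 + (10034 - 10000) = 1020034
    ext gid   = 10 * 100000 + 30000 + (10034 - 10000) = 1030034

per multiuser_get_uid(), multiuser_get_cache_gid() and
multiuser_get_ext_gid() below.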
To add an entry, mkdir -p [configfs root]/sdcardfs/extensions/[gid]/[ext] Bug: 34262585 Change-Id: I4e030ce84f094a678376349b1a96923e5076a0f4 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 167 +++++++++++++++++++++++++-- fs/sdcardfs/file.c | 2 +- fs/sdcardfs/inode.c | 34 +++--- fs/sdcardfs/lookup.c | 3 +- fs/sdcardfs/multiuser.h | 28 +++-- fs/sdcardfs/packagelist.c | 228 ++++++++++++++++++++++++++++++++++--- fs/sdcardfs/sdcardfs.h | 25 +++- 7 files changed, 432 insertions(+), 55 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 8e3baee4a2d9..d2bff5ecdad0 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -30,6 +30,8 @@ static void inherit_derived_state(struct inode *parent, struct inode *child) ci->userid = pi->userid; ci->d_uid = pi->d_uid; ci->under_android = pi->under_android; + ci->under_cache = pi->under_cache; + ci->under_obb = pi->under_obb; set_top(ci, pi->top); } @@ -43,11 +45,13 @@ void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, info->userid = userid; info->d_uid = uid; info->under_android = under_android; + info->under_cache = false; + info->under_obb = false; set_top(info, top); } /* While renaming, there is a point where we want the path from dentry, but the name from newdentry */ -void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, struct dentry *newdentry) +void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const char *name) { struct sdcardfs_inode_info *info = SDCARDFS_I(d_inode(dentry)); struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); @@ -57,26 +61,30 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, st * the properties are maintained on its private fields * because the inode attributes will be modified with that of * its lower inode. - * The derived state will be updated on the last - * stage of each system call by fix_derived_permission(inode). + * These values are used by our custom permission call instead + * of using the inode permissions. */ inherit_derived_state(d_inode(parent), d_inode(dentry)); + /* Files don't get special labels */ + if (!S_ISDIR(d_inode(dentry)->i_mode)) + return; /* Derive custom permissions based on parent and current node */ switch (parent_info->perm) { case PERM_INHERIT: + case PERM_ANDROID_PACKAGE_CACHE: /* Already inherited above */ break; case PERM_PRE_ROOT: /* Legacy internal layout places users at top level */ info->perm = PERM_ROOT; - info->userid = simple_strtoul(newdentry->d_name.name, NULL, 10); + info->userid = simple_strtoul(name, NULL, 10); set_top(info, &info->vfs_inode); break; case PERM_ROOT: /* Assume masked off by default. 
*/ - if (!strcasecmp(newdentry->d_name.name, "Android")) { + if (!strcasecmp(name, "Android")) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID; info->under_android = true; @@ -84,36 +92,152 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, st } break; case PERM_ANDROID: - if (!strcasecmp(newdentry->d_name.name, "data")) { + if (!strcasecmp(name, "data")) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_DATA; set_top(info, &info->vfs_inode); - } else if (!strcasecmp(newdentry->d_name.name, "obb")) { + } else if (!strcasecmp(name, "obb")) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_OBB; + info->under_obb = true; set_top(info, &info->vfs_inode); /* Single OBB directory is always shared */ - } else if (!strcasecmp(newdentry->d_name.name, "media")) { + } else if (!strcasecmp(name, "media")) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_MEDIA; set_top(info, &info->vfs_inode); } break; - case PERM_ANDROID_DATA: case PERM_ANDROID_OBB: + case PERM_ANDROID_DATA: case PERM_ANDROID_MEDIA: - appid = get_appid(newdentry->d_name.name); - if (appid != 0 && !is_excluded(newdentry->d_name.name, parent_info->userid)) { + info->perm = PERM_ANDROID_PACKAGE; + appid = get_appid(name); + if (appid != 0 && !is_excluded(name, parent_info->userid)) { info->d_uid = multiuser_get_uid(parent_info->userid, appid); } set_top(info, &info->vfs_inode); break; + case PERM_ANDROID_PACKAGE: + if (!strcasecmp(name, "cache")) { + info->perm = PERM_ANDROID_PACKAGE_CACHE; + info->under_cache = true; + } + break; } } void get_derived_permission(struct dentry *parent, struct dentry *dentry) { - get_derived_permission_new(parent, dentry, dentry); + get_derived_permission_new(parent, dentry, dentry->d_name.name); +} + +static appid_t get_type(const char *name) { + const char *ext = strrchr(name, '.'); + appid_t id; + + if (ext && ext[0]) { + ext = &ext[1]; + id = get_ext_gid(ext); + return id?:AID_MEDIA_RW; + } + return AID_MEDIA_RW; +} + +void fixup_lower_ownership(struct dentry* dentry, const char *name) { + struct path path; + struct inode *inode; + struct inode *delegated_inode = NULL; + int error; + struct sdcardfs_inode_info *info; + struct sdcardfs_inode_info *info_top; + perm_t perm; + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + uid_t uid = sbi->options.fs_low_uid; + gid_t gid = sbi->options.fs_low_gid; + struct iattr newattrs; + + info = SDCARDFS_I(d_inode(dentry)); + perm = info->perm; + if (info->under_obb) { + perm = PERM_ANDROID_OBB; + } else if (info->under_cache) { + perm = PERM_ANDROID_PACKAGE_CACHE; + } else if (perm == PERM_INHERIT) { + info_top = SDCARDFS_I(grab_top(info)); + perm = info_top->perm; + release_top(info); + } + + switch (perm) { + case PERM_ROOT: + case PERM_ANDROID: + case PERM_ANDROID_DATA: + case PERM_ANDROID_MEDIA: + case PERM_ANDROID_PACKAGE: + case PERM_ANDROID_PACKAGE_CACHE: + uid = multiuser_get_uid(info->userid, uid); + break; + case PERM_ANDROID_OBB: + uid = AID_MEDIA_OBB; + break; + case PERM_PRE_ROOT: + default: + break; + } + switch (perm) { + case PERM_ROOT: + case PERM_ANDROID: + case PERM_ANDROID_DATA: + case PERM_ANDROID_MEDIA: + if (S_ISDIR(d_inode(dentry)->i_mode)) + gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); + else + gid = multiuser_get_uid(info->userid, get_type(name)); + break; + case PERM_ANDROID_OBB: + gid = AID_MEDIA_OBB; + break; + case PERM_ANDROID_PACKAGE: + 
if (info->d_uid != 0) + gid = multiuser_get_ext_gid(info->userid, info->d_uid); + else + gid = multiuser_get_uid(info->userid, uid); + break; + case PERM_ANDROID_PACKAGE_CACHE: + if (info->d_uid != 0) + gid = multiuser_get_cache_gid(info->userid, info->d_uid); + else + gid = multiuser_get_uid(info->userid, uid); + break; + case PERM_PRE_ROOT: + default: + break; + } + + sdcardfs_get_lower_path(dentry, &path); + inode = d_inode(path.dentry); + if (d_inode(path.dentry)->i_gid.val != gid || d_inode(path.dentry)->i_uid.val != uid) { +retry_deleg: + newattrs.ia_valid = ATTR_GID | ATTR_UID | ATTR_FORCE; + newattrs.ia_uid = make_kuid(current_user_ns(), uid); + newattrs.ia_gid = make_kgid(current_user_ns(), gid); + if (!S_ISDIR(inode->i_mode)) + newattrs.ia_valid |= + ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; + mutex_lock(&inode->i_mutex); + error = security_path_chown(&path, newattrs.ia_uid, newattrs.ia_gid); + if (!error) + error = notify_change2(path.mnt, path.dentry, &newattrs, &delegated_inode); + mutex_unlock(&inode->i_mutex); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + if (error) + pr_err("sdcardfs: Failed to touch up lower fs gid/uid.\n"); + } } static int descendant_may_need_fixup(struct sdcardfs_inode_info *info, struct limit_search *limit) { @@ -167,9 +291,28 @@ void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit) { dput(dentry); } +void drop_recursive(struct dentry *parent) { + struct dentry *dentry; + struct sdcardfs_inode_info *info; + if (!d_inode(parent)) + return; + info = SDCARDFS_I(d_inode(parent)); + spin_lock(&parent->d_lock); + list_for_each_entry(dentry, &parent->d_subdirs, d_child) { + if (d_inode(dentry)) { + if (SDCARDFS_I(d_inode(parent))->top != SDCARDFS_I(d_inode(dentry))->top) { + drop_recursive(dentry); + d_drop(dentry); + } + } + } + spin_unlock(&parent->d_lock); +} + void fixup_top_recursive(struct dentry *parent) { struct dentry *dentry; struct sdcardfs_inode_info *info; + if (!d_inode(parent)) return; info = SDCARDFS_I(d_inode(parent)); diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 7750a0472389..006c6ff57ad7 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -225,7 +225,7 @@ static int sdcardfs_open(struct inode *inode, struct file *file) } /* save current_cred and override it */ - OVERRIDE_CRED(sbi, saved_cred); + OVERRIDE_CRED(sbi, saved_cred, SDCARDFS_I(inode)); file->private_data = kzalloc(sizeof(struct sdcardfs_file_info), GFP_KERNEL); diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index a2cad0f76f14..cb0588691a0f 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -22,16 +22,21 @@ #include /* Do not directly use this function. Use OVERRIDE_CRED() instead. 
*/ -const struct cred * override_fsids(struct sdcardfs_sb_info* sbi) +const struct cred * override_fsids(struct sdcardfs_sb_info* sbi, struct sdcardfs_inode_info *info) { struct cred * cred; const struct cred * old_cred; + uid_t uid; cred = prepare_creds(); if (!cred) return NULL; - cred->fsuid = make_kuid(&init_user_ns, sbi->options.fs_low_uid); + if (info->under_obb) + uid = AID_MEDIA_OBB; + else + uid = multiuser_get_uid(info->userid, sbi->options.fs_low_uid); + cred->fsuid = make_kuid(&init_user_ns, uid); cred->fsgid = make_kgid(&init_user_ns, sbi->options.fs_low_gid); old_cred = override_creds(cred); @@ -70,7 +75,7 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, } /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred); + OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; @@ -98,6 +103,7 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, goto out; fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir)); fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry)); + fixup_lower_ownership(dentry, dentry->d_name.name); out: current->fs = saved_fs; @@ -171,7 +177,7 @@ static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) } /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred); + OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; @@ -279,7 +285,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode } /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred); + OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); /* check disk space */ if (!check_min_free_space(dentry, 0, 1)) { @@ -343,9 +349,8 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry)); /* update number of links on parent directory */ set_nlink(dir, sdcardfs_lower_inode(dir)->i_nlink); - + fixup_lower_ownership(dentry, dentry->d_name.name); unlock_dir(lower_parent_dentry); - if ((!sbi->options.multiuser) && (!strcasecmp(dentry->d_name.name, "obb")) && (pi->perm == PERM_ANDROID) && (pi->userid == 0)) make_nomedia_in_obb = 1; @@ -353,6 +358,8 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode /* When creating /Android/data and /Android/obb, mark them as .nomedia */ if (make_nomedia_in_obb || ((pi->perm == PERM_ANDROID) && (!strcasecmp(dentry->d_name.name, "data")))) { + REVERT_CRED(saved_cred); + OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(d_inode(dentry))); set_fs_pwd(current->fs, &lower_path); touch_err = touch(".nomedia", 0664); if (touch_err) { @@ -390,7 +397,7 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) } /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred); + OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); /* sdcardfs_get_real_lower(): in case of remove an user's obb dentry * the dentry on the original path should be deleted. 
*/ @@ -479,7 +486,7 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, } /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(old_dir->i_sb), saved_cred); + OVERRIDE_CRED(SDCARDFS_SB(old_dir->i_sb), saved_cred, SDCARDFS_I(new_dir)); sdcardfs_get_real_lower(old_dentry, &lower_old_path); sdcardfs_get_lower_path(new_dentry, &lower_new_path); @@ -516,11 +523,10 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, sdcardfs_copy_and_fix_attrs(old_dir, d_inode(lower_old_dir_dentry)); fsstack_copy_inode_size(old_dir, d_inode(lower_old_dir_dentry)); } - /* At this point, not all dentry information has been moved, so - * we pass along new_dentry for the name.*/ - get_derived_permission_new(new_dentry->d_parent, old_dentry, new_dentry); + get_derived_permission_new(new_dentry->d_parent, old_dentry, new_dentry->d_name.name); fixup_tmp_permissions(d_inode(old_dentry)); - fixup_top_recursive(old_dentry); + fixup_lower_ownership(old_dentry, new_dentry->d_name.name); + drop_recursive(old_dentry); /* Can't fixup ownership recursively :( */ out: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); dput(lower_old_dir_dentry); @@ -750,7 +756,7 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct goto out_err; /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(dentry->d_sb), saved_cred); + OVERRIDE_CRED(SDCARDFS_SB(dentry->d_sb), saved_cred, SDCARDFS_I(inode)); sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index e94a65c8bbbd..3c9454e5e1c6 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -368,7 +368,7 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, } /* save current_cred and override it */ - OVERRIDE_CRED_PTR(SDCARDFS_SB(dir->i_sb), saved_cred); + OVERRIDE_CRED_PTR(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); sdcardfs_get_lower_path(parent, &lower_parent_path); @@ -392,6 +392,7 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, /* get derived permission */ get_derived_permission(parent, dentry); fixup_tmp_permissions(d_inode(dentry)); + fixup_lower_ownership(dentry, dentry->d_name.name); } /* update parent directory's atime */ fsstack_copy_attr_atime(d_inode(parent), diff --git a/fs/sdcardfs/multiuser.h b/fs/sdcardfs/multiuser.h index 923ba101dfa9..52bc20080904 100644 --- a/fs/sdcardfs/multiuser.h +++ b/fs/sdcardfs/multiuser.h @@ -18,20 +18,32 @@ * General Public License. 
*/ -#define MULTIUSER_APP_PER_USER_RANGE 100000 +#define AID_USER_OFFSET 100000 /* offset for uid ranges for each user */ +#define AID_APP_START 10000 /* first app user */ +#define AID_APP_END 19999 /* last app user */ +#define AID_CACHE_GID_START 20000 /* start of gids for apps to mark cached data */ +#define AID_EXT_GID_START 30000 /* start of gids for apps to mark external data */ +#define AID_SHARED_GID_START 50000 /* start of gids for apps in each user to share */ typedef uid_t userid_t; typedef uid_t appid_t; -static inline userid_t multiuser_get_user_id(uid_t uid) { - return uid / MULTIUSER_APP_PER_USER_RANGE; +static inline uid_t multiuser_get_uid(userid_t user_id, appid_t app_id) { + return (user_id * AID_USER_OFFSET) + (app_id % AID_USER_OFFSET); } -static inline appid_t multiuser_get_app_id(uid_t uid) { - return uid % MULTIUSER_APP_PER_USER_RANGE; +static inline gid_t multiuser_get_cache_gid(userid_t user_id, appid_t app_id) { + if (app_id >= AID_APP_START && app_id <= AID_APP_END) { + return multiuser_get_uid(user_id, (app_id - AID_APP_START) + AID_CACHE_GID_START); + } else { + return -1; + } } -static inline uid_t multiuser_get_uid(userid_t userId, appid_t appId) { - return userId * MULTIUSER_APP_PER_USER_RANGE + (appId % MULTIUSER_APP_PER_USER_RANGE); +static inline gid_t multiuser_get_ext_gid(userid_t user_id, appid_t app_id) { + if (app_id >= AID_APP_START && app_id <= AID_APP_END) { + return multiuser_get_uid(user_id, (app_id - AID_APP_START) + AID_EXT_GID_START); + } else { + return -1; + } } - diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 6eb73ddc2ceb..cdab1967317b 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -21,6 +21,7 @@ #include "sdcardfs.h" #include #include +#include #include @@ -38,6 +39,8 @@ struct hashtable_entry { static DEFINE_HASHTABLE(package_to_appid, 8); static DEFINE_HASHTABLE(package_to_userid, 8); +static DEFINE_HASHTABLE(ext_to_groupid, 8); + static struct kmem_cache *hashtable_entry_cachep; @@ -53,15 +56,33 @@ static unsigned int str_hash(const char *key) { return h; } -appid_t get_appid(const char *app_name) +appid_t get_appid(const char *key) { struct hashtable_entry *hash_cur; - unsigned int hash = str_hash(app_name); + unsigned int hash = str_hash(key); appid_t ret_id; rcu_read_lock(); hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { - if (!strcasecmp(app_name, hash_cur->key)) { + if (!strcasecmp(key, hash_cur->key)) { + ret_id = atomic_read(&hash_cur->value); + rcu_read_unlock(); + return ret_id; + } + } + rcu_read_unlock(); + return 0; +} + +appid_t get_ext_gid(const char *key) +{ + struct hashtable_entry *hash_cur; + unsigned int hash = str_hash(key); + appid_t ret_id; + + rcu_read_lock(); + hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { + if (!strcasecmp(key, hash_cur->key)) { ret_id = atomic_read(&hash_cur->value); rcu_read_unlock(); return ret_id; @@ -124,7 +145,7 @@ int open_flags_to_access_mode(int open_flags) { } } -static struct hashtable_entry *alloc_packagelist_entry(const char *key, +static struct hashtable_entry *alloc_hashtable_entry(const char *key, appid_t value) { struct hashtable_entry *ret = kmem_cache_alloc(hashtable_entry_cachep, @@ -154,13 +175,31 @@ static int insert_packagelist_appid_entry_locked(const char *key, appid_t value) return 0; } } - new_entry = alloc_packagelist_entry(key, value); + new_entry = alloc_hashtable_entry(key, value); if (!new_entry) return -ENOMEM; hash_add_rcu(package_to_appid, &new_entry->hlist, 
hash); return 0; } +static int insert_ext_gid_entry_locked(const char *key, appid_t value) +{ + struct hashtable_entry *hash_cur; + struct hashtable_entry *new_entry; + unsigned int hash = str_hash(key); + + /* An extension can only belong to one gid */ + hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { + if (!strcasecmp(key, hash_cur->key)) + return -EINVAL; + } + new_entry = alloc_hashtable_entry(key, value); + if (!new_entry) + return -ENOMEM; + hash_add_rcu(ext_to_groupid, &new_entry->hlist, hash); + return 0; +} + static int insert_userid_exclude_entry_locked(const char *key, userid_t value) { struct hashtable_entry *hash_cur; @@ -172,7 +211,7 @@ static int insert_userid_exclude_entry_locked(const char *key, userid_t value) if (atomic_read(&hash_cur->value) == value && !strcasecmp(key, hash_cur->key)) return 0; } - new_entry = alloc_packagelist_entry(key, value); + new_entry = alloc_hashtable_entry(key, value); if (!new_entry) return -ENOMEM; hash_add_rcu(package_to_userid, &new_entry->hlist, hash); @@ -234,6 +273,17 @@ static int insert_packagelist_entry(const char *key, appid_t value) return err; } +static int insert_ext_gid_entry(const char *key, appid_t value) +{ + int err; + + mutex_lock(&sdcardfs_super_list_lock); + err = insert_ext_gid_entry_locked(key, value); + mutex_unlock(&sdcardfs_super_list_lock); + + return err; +} + static int insert_userid_exclude_entry(const char *key, userid_t value) { int err; @@ -247,7 +297,7 @@ static int insert_userid_exclude_entry(const char *key, userid_t value) return err; } -static void free_packagelist_entry(struct hashtable_entry *entry) +static void free_hashtable_entry(struct hashtable_entry *entry) { kfree(entry->key); hash_del_rcu(&entry->dlist); @@ -276,7 +326,7 @@ static void remove_packagelist_entry_locked(const char *key) } synchronize_rcu(); hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) - free_packagelist_entry(hash_cur); + free_hashtable_entry(hash_cur); } static void remove_packagelist_entry(const char *key) @@ -288,6 +338,29 @@ static void remove_packagelist_entry(const char *key) return; } +static void remove_ext_gid_entry_locked(const char *key, gid_t group) +{ + struct hashtable_entry *hash_cur; + unsigned int hash = str_hash(key); + + hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { + if (!strcasecmp(key, hash_cur->key) && atomic_read(&hash_cur->value) == group) { + hash_del_rcu(&hash_cur->hlist); + synchronize_rcu(); + free_hashtable_entry(hash_cur); + break; + } + } +} + +static void remove_ext_gid_entry(const char *key, gid_t group) +{ + mutex_lock(&sdcardfs_super_list_lock); + remove_ext_gid_entry_locked(key, group); + mutex_unlock(&sdcardfs_super_list_lock); + return; +} + static void remove_userid_all_entry_locked(userid_t userid) { struct hashtable_entry *hash_cur; @@ -303,7 +376,7 @@ static void remove_userid_all_entry_locked(userid_t userid) } synchronize_rcu(); hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) { - free_packagelist_entry(hash_cur); + free_hashtable_entry(hash_cur); } } @@ -325,7 +398,7 @@ static void remove_userid_exclude_entry_locked(const char *key, userid_t userid) if (!strcasecmp(key, hash_cur->key) && atomic_read(&hash_cur->value) == userid) { hash_del_rcu(&hash_cur->hlist); synchronize_rcu(); - free_packagelist_entry(hash_cur); + free_hashtable_entry(hash_cur); break; } } @@ -357,7 +430,7 @@ static void packagelist_destroy(void) } synchronize_rcu(); hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) - 
free_packagelist_entry(hash_cur); + free_hashtable_entry(hash_cur); mutex_unlock(&sdcardfs_super_list_lock); printk(KERN_INFO "sdcardfs: destroyed packagelist pkgld\n"); } @@ -502,6 +575,127 @@ static struct config_item_type package_appid_type = { .ct_owner = THIS_MODULE, }; +struct extensions_value { + struct config_group group; + unsigned int num; +}; + +struct extension_details { + struct config_item item; + const char* name; + unsigned int num; +}; + +static inline struct extensions_value *to_extensions_value(struct config_item *item) +{ + return item ? container_of(to_config_group(item), struct extensions_value, group) : NULL; +} + +static inline struct extension_details *to_extension_details(struct config_item *item) +{ + return item ? container_of(item, struct extension_details, item) : NULL; +} + +static void extension_details_release(struct config_item *item) +{ + struct extension_details *extension_details = to_extension_details(item); + + printk(KERN_INFO "sdcardfs: No longer mapping %s files to gid %d\n", + extension_details->name, extension_details->num); + remove_ext_gid_entry(extension_details->name, extension_details->num); + kfree(extension_details->name); + kfree(extension_details); +} + +static struct configfs_item_operations extension_details_item_ops = { + .release = extension_details_release, +}; + +static struct config_item_type extension_details_type = { + .ct_item_ops = &extension_details_item_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item *extension_details_make_item(struct config_group *group, const char *name) +{ + struct extensions_value *extensions_value = to_extensions_value(&group->cg_item); + struct extension_details *extension_details = kzalloc(sizeof(struct extension_details), GFP_KERNEL); + int ret; + if (!extension_details) + return ERR_PTR(-ENOMEM); + + extension_details->name = kstrdup(name, GFP_KERNEL); + if (!extension_details->name) { + kfree(extension_details); + return ERR_PTR(-ENOMEM); + } + extension_details->num = extensions_value->num; + ret = insert_ext_gid_entry(name, extensions_value->num); + + if (ret) { + kfree(extension_details->name); + kfree(extension_details); + return ERR_PTR(ret); + } + config_item_init_type_name(&extension_details->item, name, &extension_details_type); + + return &extension_details->item; +} + +static struct configfs_group_operations extensions_value_group_ops = { + .make_item = extension_details_make_item, +}; + +static struct config_item_type extensions_name_type = { + .ct_group_ops = &extensions_value_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *extensions_make_group(struct config_group *group, const char *name) +{ + struct extensions_value *extensions_value; + unsigned int tmp; + int ret; + + extensions_value = kzalloc(sizeof(struct extensions_value), GFP_KERNEL); + if (!extensions_value) + return ERR_PTR(-ENOMEM); + ret = kstrtouint(name, 10, &tmp); + if (ret) { + kfree(extensions_value); + return ERR_PTR(ret); + } + + extensions_value->num = tmp; + config_group_init_type_name(&extensions_value->group, name, + &extensions_name_type); + return &extensions_value->group; +} + +static void extensions_drop_group(struct config_group *group, struct config_item *item) +{ + struct extensions_value *value = to_extensions_value(item); + printk(KERN_INFO "sdcardfs: No longer mapping any files to gid %d\n", value->num); + kfree(value); +} + +static struct configfs_group_operations extensions_group_ops = { + .make_group = extensions_make_group, + .drop_item = 
extensions_drop_group, +}; + +static struct config_item_type extensions_type = { + .ct_group_ops = &extensions_group_ops, + .ct_owner = THIS_MODULE, +}; + +struct config_group extension_group = { + .cg_item = { + .ci_namebuf = "extensions", + .ci_type = &extensions_type, + }, +}; + static struct config_item *packages_make_item(struct config_group *group, const char *name) { struct package_details *package_details; @@ -595,20 +789,28 @@ static struct config_item_type packages_type = { .ct_owner = THIS_MODULE, }; +struct config_group *sd_default_groups[] = { + &extension_group, + NULL, +}; + static struct configfs_subsystem sdcardfs_packages = { .su_group = { .cg_item = { .ci_namebuf = "sdcardfs", .ci_type = &packages_type, }, + .default_groups = sd_default_groups, }, }; static int configfs_sdcardfs_init(void) { - int ret; + int ret, i; struct configfs_subsystem *subsys = &sdcardfs_packages; - + for (i = 0; sd_default_groups[i]; i++) { + config_group_init(sd_default_groups[i]); + } config_group_init(&subsys->su_group); mutex_init(&subsys->su_mutex); ret = configfs_register_subsystem(subsys); diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 3434849cee63..03da961e3b09 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -65,6 +65,9 @@ #define AID_SDCARD_PICS 1033 /* external storage photos access */ #define AID_SDCARD_AV 1034 /* external storage audio/video access */ #define AID_SDCARD_ALL 1035 /* access all users external storage */ +#define AID_MEDIA_OBB 1059 /* obb files */ + +#define AID_SDCARD_IMAGE 1057 #define AID_PACKAGE_INFO 1027 @@ -91,12 +94,12 @@ * These two macro should be used in pair, and OVERRIDE_CRED() should be * placed at the beginning of a function, right after variable declaration. */ -#define OVERRIDE_CRED(sdcardfs_sbi, saved_cred) \ - saved_cred = override_fsids(sdcardfs_sbi); \ +#define OVERRIDE_CRED(sdcardfs_sbi, saved_cred, info) \ + saved_cred = override_fsids(sdcardfs_sbi, info); \ if (!saved_cred) { return -ENOMEM; } -#define OVERRIDE_CRED_PTR(sdcardfs_sbi, saved_cred) \ - saved_cred = override_fsids(sdcardfs_sbi); \ +#define OVERRIDE_CRED_PTR(sdcardfs_sbi, saved_cred, info) \ + saved_cred = override_fsids(sdcardfs_sbi, info); \ if (!saved_cred) { return ERR_PTR(-ENOMEM); } #define REVERT_CRED(saved_cred) revert_fsids(saved_cred) @@ -127,13 +130,18 @@ typedef enum { PERM_ANDROID_OBB, /* This node is "/Android/media" */ PERM_ANDROID_MEDIA, + /* This node is "/Android/[data|media|obb]/[package]" */ + PERM_ANDROID_PACKAGE, + /* This node is "/Android/[data|media|obb]/[package]/cache" */ + PERM_ANDROID_PACKAGE_CACHE, } perm_t; struct sdcardfs_sb_info; struct sdcardfs_mount_options; +struct sdcardfs_inode_info; /* Do not directly use this function. Use OVERRIDE_CRED() instead. */ -const struct cred * override_fsids(struct sdcardfs_sb_info* sbi); +const struct cred * override_fsids(struct sdcardfs_sb_info* sbi, struct sdcardfs_inode_info *info); /* Do not directly use this function, use REVERT_CRED() instead. 
*/ void revert_fsids(const struct cred * old_cred); @@ -175,6 +183,8 @@ struct sdcardfs_inode_info { userid_t userid; uid_t d_uid; bool under_android; + bool under_cache; + bool under_obb; /* top folder for ownership */ struct inode *top; @@ -447,6 +457,7 @@ extern struct list_head sdcardfs_super_list; /* for packagelist.c */ extern appid_t get_appid(const char *app_name); +extern appid_t get_ext_gid(const char *app_name); extern appid_t is_excluded(const char *app_name, userid_t userid); extern int check_caller_access_to_name(struct inode *parent_node, const char* name); extern int open_flags_to_access_mode(int open_flags); @@ -466,11 +477,13 @@ struct limit_search { extern void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, uid_t uid, bool under_android, struct inode *top); extern void get_derived_permission(struct dentry *parent, struct dentry *dentry); -extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, struct dentry *newdentry); +extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const char *name); +extern void drop_recursive(struct dentry *parent); extern void fixup_top_recursive(struct dentry *parent); extern void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit); extern void update_derived_permission_lock(struct dentry *dentry); +void fixup_lower_ownership(struct dentry* dentry, const char *name); extern int need_graft_path(struct dentry *dentry); extern int is_base_obbpath(struct dentry *dentry); extern int is_obbpath_invalid(struct dentry *dentry); From 7191add87cd90ac136df580204b008b33a913347 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 27 Jan 2017 19:35:08 -0800 Subject: [PATCH 0640/3220] ANDROID: sdcardfs: switch to full_name_hash and qstr Use the kernel's string hash function instead of rolling our own. Additionally, save a bit of calculation by using the qstr struct in place of strings. 
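A minimal sketch of the idiom this moves to (illustrative only; it
mirrors the qstr_init() helper the patch introduces):

    struct qstr q;

    q.name = name;
    q.len = strlen(q.name);
    q.hash = full_name_hash(q.name, q.len);  /* hashed once at setup */

    /* Inserts store the whole qstr; lookups bucket on q.hash and only
     * fall back to a case-insensitive string compare within the
     * bucket. */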
Change-Id: I0bbeb5ec2a9233f40135ad632e6f22c30ffa95c1 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/packagelist.c | 215 +++++++++++++++++++++----------------- 1 file changed, 121 insertions(+), 94 deletions(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index cdab1967317b..b02feef08d51 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -22,7 +22,7 @@ #include #include #include - +#include #include #include @@ -33,7 +33,7 @@ struct hashtable_entry { struct hlist_node hlist; struct hlist_node dlist; /* for deletion cleanup */ - const char *key; + struct qstr key; atomic_t value; }; @@ -44,27 +44,53 @@ static DEFINE_HASHTABLE(ext_to_groupid, 8); static struct kmem_cache *hashtable_entry_cachep; -static unsigned int str_hash(const char *key) { - int i; - unsigned int h = strlen(key); - char *data = (char *)key; - - for (i = 0; i < strlen(key); i++) { - h = h * 31 + *data; - data++; - } - return h; +static void inline qstr_init(struct qstr *q, const char *name) { + q->name = name; + q->len = strlen(q->name); + q->hash = full_name_hash(q->name, q->len); } -appid_t get_appid(const char *key) +static inline int qstr_copy(const struct qstr *src, struct qstr *dest) { + dest->name = kstrdup(src->name, GFP_KERNEL); + dest->hash_len = src->hash_len; + return !!dest->name; +} + + +static appid_t __get_appid(const struct qstr *key) { struct hashtable_entry *hash_cur; - unsigned int hash = str_hash(key); + unsigned int hash = key->hash; appid_t ret_id; rcu_read_lock(); hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { - if (!strcasecmp(key, hash_cur->key)) { + if (!strcasecmp(key->name, hash_cur->key.name)) { + ret_id = atomic_read(&hash_cur->value); + rcu_read_unlock(); + return ret_id; + } + } + rcu_read_unlock(); + return 0; +} + +appid_t get_appid(const char *key) +{ + struct qstr q; + qstr_init(&q, key); + return __get_appid(&q); +} + +static appid_t __get_ext_gid(const struct qstr *key) +{ + struct hashtable_entry *hash_cur; + unsigned int hash = key->hash; + appid_t ret_id; + + rcu_read_lock(); + hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { + if (!strcasecmp(key->name, hash_cur->key.name)) { ret_id = atomic_read(&hash_cur->value); rcu_read_unlock(); return ret_id; @@ -75,17 +101,23 @@ appid_t get_appid(const char *key) } appid_t get_ext_gid(const char *key) +{ + struct qstr q; + qstr_init(&q, key); + return __get_ext_gid(&q); +} + +static appid_t __is_excluded(const struct qstr *app_name, userid_t user) { struct hashtable_entry *hash_cur; - unsigned int hash = str_hash(key); - appid_t ret_id; + unsigned int hash = app_name->hash; rcu_read_lock(); - hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { - if (!strcasecmp(key, hash_cur->key)) { - ret_id = atomic_read(&hash_cur->value); + hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { + if (atomic_read(&hash_cur->value) == user && + !strcasecmp(app_name->name, hash_cur->key.name)) { rcu_read_unlock(); - return ret_id; + return 1; } } rcu_read_unlock(); @@ -94,20 +126,12 @@ appid_t get_ext_gid(const char *key) appid_t is_excluded(const char *app_name, userid_t user) { - struct hashtable_entry *hash_cur; - unsigned int hash = str_hash(app_name); - - rcu_read_lock(); - hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { - if (atomic_read(&hash_cur->value) == user && !strcasecmp(app_name, hash_cur->key)) { - rcu_read_unlock(); - return 1; - } - } - rcu_read_unlock(); - return 0; + struct qstr q; + qstr_init(&q, 
app_name); + return __is_excluded(&q, user); } + /* Kernel has already enforced everything we returned through * derive_permissions_locked(), so this is used to lock down access * even further, such as enforcing that apps hold sdcard_rw. */ @@ -145,7 +169,7 @@ int open_flags_to_access_mode(int open_flags) { } } -static struct hashtable_entry *alloc_hashtable_entry(const char *key, +static struct hashtable_entry *alloc_hashtable_entry(const struct qstr *key, appid_t value) { struct hashtable_entry *ret = kmem_cache_alloc(hashtable_entry_cachep, @@ -153,8 +177,7 @@ static struct hashtable_entry *alloc_hashtable_entry(const char *key, if (!ret) return NULL; - ret->key = kstrdup(key, GFP_KERNEL); - if (!ret->key) { + if (!qstr_copy(key, &ret->key)) { kmem_cache_free(hashtable_entry_cachep, ret); return NULL; } @@ -163,14 +186,14 @@ static struct hashtable_entry *alloc_hashtable_entry(const char *key, return ret; } -static int insert_packagelist_appid_entry_locked(const char *key, appid_t value) +static int insert_packagelist_appid_entry_locked(const struct qstr *key, appid_t value) { struct hashtable_entry *hash_cur; struct hashtable_entry *new_entry; - unsigned int hash = str_hash(key); + unsigned int hash = key->hash; hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { - if (!strcasecmp(key, hash_cur->key)) { + if (!strcasecmp(key->name, hash_cur->key.name)) { atomic_set(&hash_cur->value, value); return 0; } @@ -182,15 +205,15 @@ static int insert_packagelist_appid_entry_locked(const char *key, appid_t value) return 0; } -static int insert_ext_gid_entry_locked(const char *key, appid_t value) +static int insert_ext_gid_entry_locked(const struct qstr *key, appid_t value) { struct hashtable_entry *hash_cur; struct hashtable_entry *new_entry; - unsigned int hash = str_hash(key); + unsigned int hash = key->hash; /* An extension can only belong to one gid */ hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { - if (!strcasecmp(key, hash_cur->key)) + if (!strcasecmp(key->name, hash_cur->key.name)) return -EINVAL; } new_entry = alloc_hashtable_entry(key, value); @@ -200,15 +223,16 @@ static int insert_ext_gid_entry_locked(const char *key, appid_t value) return 0; } -static int insert_userid_exclude_entry_locked(const char *key, userid_t value) +static int insert_userid_exclude_entry_locked(const struct qstr *key, userid_t value) { struct hashtable_entry *hash_cur; struct hashtable_entry *new_entry; - unsigned int hash = str_hash(key); + unsigned int hash = key->hash; /* Only insert if not already present */ hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { - if (atomic_read(&hash_cur->value) == value && !strcasecmp(key, hash_cur->key)) + if (atomic_read(&hash_cur->value) == value && + !strcasecmp(key->name, hash_cur->key.name)) return 0; } new_entry = alloc_hashtable_entry(key, value); @@ -218,13 +242,13 @@ static int insert_userid_exclude_entry_locked(const char *key, userid_t value) return 0; } -static void fixup_all_perms_name(const char *key) +static void fixup_all_perms_name(const struct qstr *key) { struct sdcardfs_sb_info *sbinfo; struct limit_search limit = { .flags = BY_NAME, - .name = key, - .length = strlen(key), + .name = key->name, + .length = key->len, }; list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { if (sbinfo_has_sdcard_magic(sbinfo)) @@ -232,13 +256,13 @@ static void fixup_all_perms_name(const char *key) } } -static void fixup_all_perms_name_userid(const char *key, userid_t userid) +static void 
fixup_all_perms_name_userid(const struct qstr *key, userid_t userid) { struct sdcardfs_sb_info *sbinfo; struct limit_search limit = { .flags = BY_NAME | BY_USERID, - .name = key, - .length = strlen(key), + .name = key->name, + .length = key->len, .userid = userid, }; list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { @@ -260,7 +284,7 @@ static void fixup_all_perms_userid(userid_t userid) } } -static int insert_packagelist_entry(const char *key, appid_t value) +static int insert_packagelist_entry(const struct qstr *key, appid_t value) { int err; @@ -273,7 +297,7 @@ static int insert_packagelist_entry(const char *key, appid_t value) return err; } -static int insert_ext_gid_entry(const char *key, appid_t value) +static int insert_ext_gid_entry(const struct qstr *key, appid_t value) { int err; @@ -284,7 +308,7 @@ static int insert_ext_gid_entry(const char *key, appid_t value) return err; } -static int insert_userid_exclude_entry(const char *key, userid_t value) +static int insert_userid_exclude_entry(const struct qstr *key, userid_t value) { int err; @@ -299,26 +323,26 @@ static int insert_userid_exclude_entry(const char *key, userid_t value) static void free_hashtable_entry(struct hashtable_entry *entry) { - kfree(entry->key); + kfree(entry->key.name); hash_del_rcu(&entry->dlist); kmem_cache_free(hashtable_entry_cachep, entry); } -static void remove_packagelist_entry_locked(const char *key) +static void remove_packagelist_entry_locked(const struct qstr *key) { struct hashtable_entry *hash_cur; - unsigned int hash = str_hash(key); + unsigned int hash = key->hash; struct hlist_node *h_t; HLIST_HEAD(free_list); hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { - if (!strcasecmp(key, hash_cur->key)) { + if (!strcasecmp(key->name, hash_cur->key.name)) { hash_del_rcu(&hash_cur->hlist); hlist_add_head(&hash_cur->dlist, &free_list); } } hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { - if (!strcasecmp(key, hash_cur->key)) { + if (!strcasecmp(key->name, hash_cur->key.name)) { hash_del_rcu(&hash_cur->hlist); hlist_add_head(&hash_cur->dlist, &free_list); break; @@ -329,7 +353,7 @@ static void remove_packagelist_entry_locked(const char *key) free_hashtable_entry(hash_cur); } -static void remove_packagelist_entry(const char *key) +static void remove_packagelist_entry(const struct qstr *key) { mutex_lock(&sdcardfs_super_list_lock); remove_packagelist_entry_locked(key); @@ -338,13 +362,13 @@ static void remove_packagelist_entry(const char *key) return; } -static void remove_ext_gid_entry_locked(const char *key, gid_t group) +static void remove_ext_gid_entry_locked(const struct qstr *key, gid_t group) { struct hashtable_entry *hash_cur; - unsigned int hash = str_hash(key); + unsigned int hash = key->hash; hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { - if (!strcasecmp(key, hash_cur->key) && atomic_read(&hash_cur->value) == group) { + if (!strcasecmp(key->name, hash_cur->key.name) && atomic_read(&hash_cur->value) == group) { hash_del_rcu(&hash_cur->hlist); synchronize_rcu(); free_hashtable_entry(hash_cur); @@ -353,7 +377,7 @@ static void remove_ext_gid_entry_locked(const char *key, gid_t group) } } -static void remove_ext_gid_entry(const char *key, gid_t group) +static void remove_ext_gid_entry(const struct qstr *key, gid_t group) { mutex_lock(&sdcardfs_super_list_lock); remove_ext_gid_entry_locked(key, group); @@ -389,13 +413,14 @@ static void remove_userid_all_entry(userid_t userid) return; } -static void 
remove_userid_exclude_entry_locked(const char *key, userid_t userid) +static void remove_userid_exclude_entry_locked(const struct qstr *key, userid_t userid) { struct hashtable_entry *hash_cur; - unsigned int hash = str_hash(key); + unsigned int hash = key->hash; hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { - if (!strcasecmp(key, hash_cur->key) && atomic_read(&hash_cur->value) == userid) { + if (!strcasecmp(key->name, hash_cur->key.name) && + atomic_read(&hash_cur->value) == userid) { hash_del_rcu(&hash_cur->hlist); synchronize_rcu(); free_hashtable_entry(hash_cur); @@ -404,7 +429,7 @@ static void remove_userid_exclude_entry_locked(const char *key, userid_t userid) } } -static void remove_userid_exclude_entry(const char *key, userid_t userid) +static void remove_userid_exclude_entry(const struct qstr *key, userid_t userid) { mutex_lock(&sdcardfs_super_list_lock); remove_userid_exclude_entry_locked(key, userid); @@ -462,7 +487,7 @@ static struct configfs_attribute _pfx##attr_##_name = { \ struct package_details { struct config_item item; - const char *name; + struct qstr name; }; static inline struct package_details *to_package_details(struct config_item *item) @@ -472,7 +497,7 @@ static inline struct package_details *to_package_details(struct config_item *ite static ssize_t package_details_appid_show(struct config_item *item, char *page) { - return scnprintf(page, PAGE_SIZE, "%u\n", get_appid(to_package_details(item)->name)); + return scnprintf(page, PAGE_SIZE, "%u\n", __get_appid(&to_package_details(item)->name)); } static ssize_t package_details_appid_store(struct config_item *item, @@ -485,7 +510,7 @@ static ssize_t package_details_appid_store(struct config_item *item, if (ret) return ret; - ret = insert_packagelist_entry(to_package_details(item)->name, tmp); + ret = insert_packagelist_entry(&to_package_details(item)->name, tmp); if (ret) return ret; @@ -498,12 +523,12 @@ static ssize_t package_details_excluded_userids_show(struct config_item *item, { struct package_details *package_details = to_package_details(item); struct hashtable_entry *hash_cur; - unsigned int hash = str_hash(package_details->name); + unsigned int hash = package_details->name.hash; int count = 0; rcu_read_lock(); hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { - if (!strcasecmp(package_details->name, hash_cur->key)) + if (!strcasecmp(package_details->name.name, hash_cur->key.name)) count += scnprintf(page + count, PAGE_SIZE - count, "%d ", atomic_read(&hash_cur->value)); } @@ -524,7 +549,7 @@ static ssize_t package_details_excluded_userids_store(struct config_item *item, if (ret) return ret; - ret = insert_userid_exclude_entry(to_package_details(item)->name, tmp); + ret = insert_userid_exclude_entry(&to_package_details(item)->name, tmp); if (ret) return ret; @@ -541,16 +566,16 @@ static ssize_t package_details_clear_userid_store(struct config_item *item, ret = kstrtouint(page, 10, &tmp); if (ret) return ret; - remove_userid_exclude_entry(to_package_details(item)->name, tmp); + remove_userid_exclude_entry(&to_package_details(item)->name, tmp); return count; } static void package_details_release(struct config_item *item) { struct package_details *package_details = to_package_details(item); - printk(KERN_INFO "sdcardfs: removing %s\n", package_details->name); - remove_packagelist_entry(package_details->name); - kfree(package_details->name); + printk(KERN_INFO "sdcardfs: removing %s\n", package_details->name.name); + remove_packagelist_entry(&package_details->name); + 
kfree(package_details->name.name); kfree(package_details); } @@ -582,7 +607,7 @@ struct extensions_value { struct extension_details { struct config_item item; - const char* name; + struct qstr name; unsigned int num; }; @@ -601,9 +626,9 @@ static void extension_details_release(struct config_item *item) struct extension_details *extension_details = to_extension_details(item); printk(KERN_INFO "sdcardfs: No longer mapping %s files to gid %d\n", - extension_details->name, extension_details->num); - remove_ext_gid_entry(extension_details->name, extension_details->num); - kfree(extension_details->name); + extension_details->name.name, extension_details->num); + remove_ext_gid_entry(&extension_details->name, extension_details->num); + kfree(extension_details->name.name); kfree(extension_details); } @@ -620,20 +645,21 @@ static struct config_item *extension_details_make_item(struct config_group *grou { struct extensions_value *extensions_value = to_extensions_value(&group->cg_item); struct extension_details *extension_details = kzalloc(sizeof(struct extension_details), GFP_KERNEL); + const char *tmp; int ret; if (!extension_details) return ERR_PTR(-ENOMEM); - extension_details->name = kstrdup(name, GFP_KERNEL); - if (!extension_details->name) { + tmp = kstrdup(name, GFP_KERNEL); + if (!tmp) { kfree(extension_details); return ERR_PTR(-ENOMEM); } - extension_details->num = extensions_value->num; - ret = insert_ext_gid_entry(name, extensions_value->num); + qstr_init(&extension_details->name, tmp); + ret = insert_ext_gid_entry(&extension_details->name, extensions_value->num); if (ret) { - kfree(extension_details->name); + kfree(extension_details->name.name); kfree(extension_details); return ERR_PTR(ret); } @@ -699,16 +725,17 @@ struct config_group extension_group = { static struct config_item *packages_make_item(struct config_group *group, const char *name) { struct package_details *package_details; + const char *tmp; package_details = kzalloc(sizeof(struct package_details), GFP_KERNEL); if (!package_details) return ERR_PTR(-ENOMEM); - package_details->name = kstrdup(name, GFP_KERNEL); - if (!package_details->name) { + tmp = kstrdup(name, GFP_KERNEL); + if (!tmp) { kfree(package_details); return ERR_PTR(-ENOMEM); } - + qstr_init(&package_details->name, tmp); config_item_init_type_name(&package_details->item, name, &package_appid_type); @@ -727,13 +754,13 @@ static ssize_t packages_list_show(struct config_item *item, char *page) rcu_read_lock(); hash_for_each_rcu(package_to_appid, i, hash_cur_app, hlist) { written = scnprintf(page + count, PAGE_SIZE - sizeof(errormsg) - count, "%s %d\n", - hash_cur_app->key, atomic_read(&hash_cur_app->value)); - hash = str_hash(hash_cur_app->key); + hash_cur_app->key.name, atomic_read(&hash_cur_app->value)); + hash = hash_cur_app->key.hash; hash_for_each_possible_rcu(package_to_userid, hash_cur_user, hlist, hash) { - if (!strcasecmp(hash_cur_app->key, hash_cur_user->key)) { + if (!strcasecmp(hash_cur_app->key.name, hash_cur_user->key.name)) { written += scnprintf(page + count + written - 1, PAGE_SIZE - sizeof(errormsg) - count - written + 1, - " %d\n", atomic_read(&hash_cur_user->value)) - 1; + " %d\n", atomic_read(&hash_cur_user->value)) - 1; } } if (count + written == PAGE_SIZE - sizeof(errormsg) - 1) { From 9ce149a4581a1516f8bead77075821e2cf05ee21 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 31 Jan 2017 20:07:51 -0800 Subject: [PATCH 0641/3220] ANDROID: sdcardfs: Switch strcasecmp for internal call This moves our uses of strcasecmp over to an internal 
call so we can easily change implementations later if we so desire. Additionally, we leverage qstr's where appropriate to save time on comparisons. Change-Id: I32fdc4fd0cd3b7b735dcfd82f60a2516fd8272a5 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 35 ++++++++++++++++++++------------- fs/sdcardfs/file.c | 2 +- fs/sdcardfs/inode.c | 24 ++++++++++++----------- fs/sdcardfs/lookup.c | 18 +++++++---------- fs/sdcardfs/packagelist.c | 40 ++++++++++++++++++++------------------ fs/sdcardfs/sdcardfs.h | 17 ++++++++++++++-- 6 files changed, 78 insertions(+), 58 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index d2bff5ecdad0..0bb442338a85 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -51,11 +51,16 @@ void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, } /* While renaming, there is a point where we want the path from dentry, but the name from newdentry */ -void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const char *name) +void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const struct qstr *name) { struct sdcardfs_inode_info *info = SDCARDFS_I(d_inode(dentry)); struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); appid_t appid; + struct qstr q_Android = QSTR_LITERAL("Android"); + struct qstr q_data = QSTR_LITERAL("data"); + struct qstr q_obb = QSTR_LITERAL("obb"); + struct qstr q_media = QSTR_LITERAL("media"); + struct qstr q_cache = QSTR_LITERAL("cache"); /* By default, each inode inherits from its parent. * the properties are maintained on its private fields @@ -79,12 +84,12 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, co case PERM_PRE_ROOT: /* Legacy internal layout places users at top level */ info->perm = PERM_ROOT; - info->userid = simple_strtoul(name, NULL, 10); + info->userid = simple_strtoul(name->name, NULL, 10); set_top(info, &info->vfs_inode); break; case PERM_ROOT: /* Assume masked off by default. 
*/ - if (!strcasecmp(name, "Android")) { + if (qstr_case_eq(name, &q_Android)) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID; info->under_android = true; @@ -92,17 +97,17 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, co } break; case PERM_ANDROID: - if (!strcasecmp(name, "data")) { + if (qstr_case_eq(name, &q_data)) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_DATA; set_top(info, &info->vfs_inode); - } else if (!strcasecmp(name, "obb")) { + } else if (qstr_case_eq(name, &q_obb)) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_OBB; info->under_obb = true; set_top(info, &info->vfs_inode); /* Single OBB directory is always shared */ - } else if (!strcasecmp(name, "media")) { + } else if (qstr_case_eq(name, &q_media)) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_MEDIA; set_top(info, &info->vfs_inode); @@ -112,14 +117,14 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, co case PERM_ANDROID_DATA: case PERM_ANDROID_MEDIA: info->perm = PERM_ANDROID_PACKAGE; - appid = get_appid(name); - if (appid != 0 && !is_excluded(name, parent_info->userid)) { + appid = get_appid(name->name); + if (appid != 0 && !is_excluded(name->name, parent_info->userid)) { info->d_uid = multiuser_get_uid(parent_info->userid, appid); } set_top(info, &info->vfs_inode); break; case PERM_ANDROID_PACKAGE: - if (!strcasecmp(name, "cache")) { + if (qstr_case_eq(name, &q_cache)) { info->perm = PERM_ANDROID_PACKAGE_CACHE; info->under_cache = true; } @@ -129,7 +134,7 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, co void get_derived_permission(struct dentry *parent, struct dentry *dentry) { - get_derived_permission_new(parent, dentry, dentry->d_name.name); + get_derived_permission_new(parent, dentry, &dentry->d_name); } static appid_t get_type(const char *name) { @@ -360,9 +365,10 @@ int need_graft_path(struct dentry *dentry) struct dentry *parent = dget_parent(dentry); struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + struct qstr obb = QSTR_LITERAL("obb"); if(parent_info->perm == PERM_ANDROID && - !strcasecmp(dentry->d_name.name, "obb")) { + qstr_case_eq(&dentry->d_name, &obb)) { /* /Android/obb is the base obbpath of DERIVED_UNIFIED */ if(!(sbi->options.multiuser == false @@ -399,7 +405,7 @@ int is_obbpath_invalid(struct dentry *dent) } else { obbpath_s = d_path(&di->lower_path, path_buf, PATH_MAX); if (d_unhashed(di->lower_path.dentry) || - strcasecmp(sbi->obbpath_s, obbpath_s)) { + !str_case_eq(sbi->obbpath_s, obbpath_s)) { ret = 1; } kfree(path_buf); @@ -419,15 +425,16 @@ int is_base_obbpath(struct dentry *dentry) struct dentry *parent = dget_parent(dentry); struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + struct qstr q_obb = QSTR_LITERAL("obb"); spin_lock(&SDCARDFS_D(dentry)->lock); if (sbi->options.multiuser) { if(parent_info->perm == PERM_PRE_ROOT && - !strcasecmp(dentry->d_name.name, "obb")) { + qstr_case_eq(&dentry->d_name, &q_obb)) { ret = 1; } } else if (parent_info->perm == PERM_ANDROID && - !strcasecmp(dentry->d_name.name, "obb")) { + qstr_case_eq(&dentry->d_name, &q_obb)) { ret = 1; } spin_unlock(&SDCARDFS_D(dentry)->lock); diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 
006c6ff57ad7..23f8cd7f8877 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -216,7 +216,7 @@ static int sdcardfs_open(struct inode *inode, struct file *file) goto out_err; } - if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name)) { + if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index cb0588691a0f..68e615045616 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -66,7 +66,7 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, struct fs_struct *saved_fs; struct fs_struct *copied_fs; - if(!check_caller_access_to_name(dir, dentry->d_name.name)) { + if(!check_caller_access_to_name(dir, &dentry->d_name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); @@ -168,7 +168,7 @@ static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) struct path lower_path; const struct cred *saved_cred = NULL; - if(!check_caller_access_to_name(dir, dentry->d_name.name)) { + if(!check_caller_access_to_name(dir, &dentry->d_name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); @@ -275,8 +275,10 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode int touch_err = 0; struct fs_struct *saved_fs; struct fs_struct *copied_fs; + struct qstr q_obb = QSTR_LITERAL("obb"); + struct qstr q_data = QSTR_LITERAL("data"); - if(!check_caller_access_to_name(dir, dentry->d_name.name)) { + if(!check_caller_access_to_name(dir, &dentry->d_name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); @@ -351,13 +353,13 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode set_nlink(dir, sdcardfs_lower_inode(dir)->i_nlink); fixup_lower_ownership(dentry, dentry->d_name.name); unlock_dir(lower_parent_dentry); - if ((!sbi->options.multiuser) && (!strcasecmp(dentry->d_name.name, "obb")) + if ((!sbi->options.multiuser) && (qstr_case_eq(&dentry->d_name, &q_obb)) && (pi->perm == PERM_ANDROID) && (pi->userid == 0)) make_nomedia_in_obb = 1; /* When creating /Android/data and /Android/obb, mark them as .nomedia */ if (make_nomedia_in_obb || - ((pi->perm == PERM_ANDROID) && (!strcasecmp(dentry->d_name.name, "data")))) { + ((pi->perm == PERM_ANDROID) && (qstr_case_eq(&dentry->d_name, &q_data)))) { REVERT_CRED(saved_cred); OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(d_inode(dentry))); set_fs_pwd(current->fs, &lower_path); @@ -388,7 +390,7 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) struct path lower_path; const struct cred *saved_cred = NULL; - if(!check_caller_access_to_name(dir, dentry->d_name.name)) { + if(!check_caller_access_to_name(dir, &dentry->d_name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); @@ -476,8 +478,8 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct path lower_old_path, lower_new_path; const struct cred *saved_cred = NULL; - if(!check_caller_access_to_name(old_dir, old_dentry->d_name.name) || - 
!check_caller_access_to_name(new_dir, new_dentry->d_name.name)) { + if(!check_caller_access_to_name(old_dir, &old_dentry->d_name) || + !check_caller_access_to_name(new_dir, &new_dentry->d_name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " new_dentry: %s, task:%s\n", __func__, new_dentry->d_name.name, current->comm); @@ -523,7 +525,7 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, sdcardfs_copy_and_fix_attrs(old_dir, d_inode(lower_old_dir_dentry)); fsstack_copy_inode_size(old_dir, d_inode(lower_old_dir_dentry)); } - get_derived_permission_new(new_dentry->d_parent, old_dentry, new_dentry->d_name.name); + get_derived_permission_new(new_dentry->d_parent, old_dentry, &new_dentry->d_name); fixup_tmp_permissions(d_inode(old_dentry)); fixup_lower_ownership(old_dentry, new_dentry->d_name.name); drop_recursive(old_dentry); /* Can't fixup ownership recursively :( */ @@ -743,7 +745,7 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct if (!err) { /* check the Android group ID */ parent = dget_parent(dentry); - if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name)) { + if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); @@ -861,7 +863,7 @@ static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, int err; parent = dget_parent(dentry); - if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name)) { + if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 3c9454e5e1c6..9135866b7766 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -219,9 +219,8 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, struct vfsmount *lower_dir_mnt; struct dentry *lower_dir_dentry = NULL; struct dentry *lower_dentry; - const char *name; + const struct qstr *name; struct path lower_path; - struct qstr this; struct sdcardfs_sb_info *sbi; sbi = SDCARDFS_SB(dentry->d_sb); @@ -231,14 +230,14 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, if (IS_ROOT(dentry)) goto out; - name = dentry->d_name.name; + name = &dentry->d_name; /* now start the actual lookup procedure */ lower_dir_dentry = lower_parent_path->dentry; lower_dir_mnt = lower_parent_path->mnt; /* Use vfs_path_lookup to check if the dentry exists or not */ - err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name, 0, + err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name->name, 0, &lower_path); /* check for other cases */ if (err == -ENOENT) { @@ -248,7 +247,7 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, spin_lock(&lower_dir_dentry->d_lock); list_for_each_entry(child, &lower_dir_dentry->d_subdirs, d_child) { if (child && d_inode(child)) { - if (strcasecmp(child->d_name.name, name)==0) { + if (qstr_case_eq(&child->d_name, name)) { match = dget(child); break; } @@ -307,14 +306,11 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, goto out; /* instatiate a new negative dentry */ - this.name = name; - this.len = strlen(name); - this.hash = full_name_hash(this.name, this.len); - lower_dentry = d_lookup(lower_dir_dentry, &this); + lower_dentry = d_lookup(lower_dir_dentry, 
name); if (lower_dentry) goto setup_lower; - lower_dentry = d_alloc(lower_dir_dentry, &this); + lower_dentry = d_alloc(lower_dir_dentry, name); if (!lower_dentry) { err = -ENOMEM; goto out; @@ -359,7 +355,7 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, parent = dget_parent(dentry); - if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name)) { + if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { ret = ERR_PTR(-EACCES); printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index b02feef08d51..d96fcde041cc 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -65,7 +65,7 @@ static appid_t __get_appid(const struct qstr *key) rcu_read_lock(); hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { - if (!strcasecmp(key->name, hash_cur->key.name)) { + if (qstr_case_eq(key, &hash_cur->key)) { ret_id = atomic_read(&hash_cur->value); rcu_read_unlock(); return ret_id; @@ -90,7 +90,7 @@ static appid_t __get_ext_gid(const struct qstr *key) rcu_read_lock(); hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { - if (!strcasecmp(key->name, hash_cur->key.name)) { + if (qstr_case_eq(key, &hash_cur->key)) { ret_id = atomic_read(&hash_cur->value); rcu_read_unlock(); return ret_id; @@ -115,7 +115,7 @@ static appid_t __is_excluded(const struct qstr *app_name, userid_t user) rcu_read_lock(); hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { if (atomic_read(&hash_cur->value) == user && - !strcasecmp(app_name->name, hash_cur->key.name)) { + qstr_case_eq(app_name, &hash_cur->key)) { rcu_read_unlock(); return 1; } @@ -124,24 +124,26 @@ static appid_t __is_excluded(const struct qstr *app_name, userid_t user) return 0; } -appid_t is_excluded(const char *app_name, userid_t user) +appid_t is_excluded(const char *key, userid_t user) { struct qstr q; - qstr_init(&q, app_name); + qstr_init(&q, key); return __is_excluded(&q, user); } - /* Kernel has already enforced everything we returned through * derive_permissions_locked(), so this is used to lock down access * even further, such as enforcing that apps hold sdcard_rw. 
*/ -int check_caller_access_to_name(struct inode *parent_node, const char* name) { +int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name) { + struct qstr q_autorun = QSTR_LITERAL("autorun.inf"); + struct qstr q__android_secure = QSTR_LITERAL(".android_secure"); + struct qstr q_android_secure = QSTR_LITERAL("android_secure"); /* Always block security-sensitive files at root */ if (parent_node && SDCARDFS_I(parent_node)->perm == PERM_ROOT) { - if (!strcasecmp(name, "autorun.inf") - || !strcasecmp(name, ".android_secure") - || !strcasecmp(name, "android_secure")) { + if (qstr_case_eq(name, &q_autorun) + || qstr_case_eq(name, &q__android_secure) + || qstr_case_eq(name, &q_android_secure)) { return 0; } } @@ -193,7 +195,7 @@ static int insert_packagelist_appid_entry_locked(const struct qstr *key, appid_t unsigned int hash = key->hash; hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { - if (!strcasecmp(key->name, hash_cur->key.name)) { + if (qstr_case_eq(key, &hash_cur->key)) { atomic_set(&hash_cur->value, value); return 0; } @@ -213,7 +215,7 @@ static int insert_ext_gid_entry_locked(const struct qstr *key, appid_t value) /* An extension can only belong to one gid */ hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { - if (!strcasecmp(key->name, hash_cur->key.name)) + if (qstr_case_eq(key, &hash_cur->key)) return -EINVAL; } new_entry = alloc_hashtable_entry(key, value); @@ -232,7 +234,7 @@ static int insert_userid_exclude_entry_locked(const struct qstr *key, userid_t v /* Only insert if not already present */ hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { if (atomic_read(&hash_cur->value) == value && - !strcasecmp(key->name, hash_cur->key.name)) + qstr_case_eq(key, &hash_cur->key)) return 0; } new_entry = alloc_hashtable_entry(key, value); @@ -336,13 +338,13 @@ static void remove_packagelist_entry_locked(const struct qstr *key) HLIST_HEAD(free_list); hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { - if (!strcasecmp(key->name, hash_cur->key.name)) { + if (qstr_case_eq(key, &hash_cur->key)) { hash_del_rcu(&hash_cur->hlist); hlist_add_head(&hash_cur->dlist, &free_list); } } hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { - if (!strcasecmp(key->name, hash_cur->key.name)) { + if (qstr_case_eq(key, &hash_cur->key)) { hash_del_rcu(&hash_cur->hlist); hlist_add_head(&hash_cur->dlist, &free_list); break; @@ -368,7 +370,7 @@ static void remove_ext_gid_entry_locked(const struct qstr *key, gid_t group) unsigned int hash = key->hash; hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { - if (!strcasecmp(key->name, hash_cur->key.name) && atomic_read(&hash_cur->value) == group) { + if (qstr_case_eq(key, &hash_cur->key) && atomic_read(&hash_cur->value) == group) { hash_del_rcu(&hash_cur->hlist); synchronize_rcu(); free_hashtable_entry(hash_cur); @@ -419,7 +421,7 @@ static void remove_userid_exclude_entry_locked(const struct qstr *key, userid_t unsigned int hash = key->hash; hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { - if (!strcasecmp(key->name, hash_cur->key.name) && + if (qstr_case_eq(key, &hash_cur->key) && atomic_read(&hash_cur->value) == userid) { hash_del_rcu(&hash_cur->hlist); synchronize_rcu(); @@ -528,7 +530,7 @@ static ssize_t package_details_excluded_userids_show(struct config_item *item, rcu_read_lock(); hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { - if (!strcasecmp(package_details->name.name, 
hash_cur->key.name)) + if (qstr_case_eq(&package_details->name, &hash_cur->key)) count += scnprintf(page + count, PAGE_SIZE - count, "%d ", atomic_read(&hash_cur->value)); } @@ -757,7 +759,7 @@ static ssize_t packages_list_show(struct config_item *item, char *page) hash_cur_app->key.name, atomic_read(&hash_cur_app->value)); hash = hash_cur_app->key.hash; hash_for_each_possible_rcu(package_to_userid, hash_cur_user, hlist, hash) { - if (!strcasecmp(hash_cur_app->key.name, hash_cur_user->key.name)) { + if (qstr_case_eq(&hash_cur_app->key, &hash_cur_user->key)) { written += scnprintf(page + count + written - 1, PAGE_SIZE - sizeof(errormsg) - count - written + 1, " %d\n", atomic_read(&hash_cur_user->value)) - 1; diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 03da961e3b09..f3cced313108 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -459,7 +459,7 @@ extern struct list_head sdcardfs_super_list; extern appid_t get_appid(const char *app_name); extern appid_t get_ext_gid(const char *app_name); extern appid_t is_excluded(const char *app_name, userid_t userid); -extern int check_caller_access_to_name(struct inode *parent_node, const char* name); +extern int check_caller_access_to_name(struct inode *parent_node, const struct qstr* name); extern int open_flags_to_access_mode(int open_flags); extern int packagelist_init(void); extern void packagelist_exit(void); @@ -477,7 +477,7 @@ struct limit_search { extern void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, uid_t uid, bool under_android, struct inode *top); extern void get_derived_permission(struct dentry *parent, struct dentry *dentry); -extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const char *name); +extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const struct qstr *name); extern void drop_recursive(struct dentry *parent); extern void fixup_top_recursive(struct dentry *parent); extern void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit); @@ -605,4 +605,17 @@ static inline void sdcardfs_copy_and_fix_attrs(struct inode *dest, const struct dest->i_flags = src->i_flags; set_nlink(dest, src->i_nlink); } + +static inline bool str_case_eq(const char *s1, const char *s2) +{ + return !strcasecmp(s1, s2); +} + +static inline bool qstr_case_eq(const struct qstr *q1, const struct qstr *q2) +{ + return q1->len == q2->len && str_case_eq(q1->name, q2->name); +} + +#define QSTR_LITERAL(string) QSTR_INIT(string, sizeof(string)-1) + #endif /* not _SDCARDFS_H_ */ From 91495bc622e9356dd63a7c77b98a21f0e2f5d2b2 Mon Sep 17 00:00:00 2001 From: Pratyush Anand Date: Mon, 14 Nov 2016 19:32:42 +0530 Subject: [PATCH 0642/3220] BACKPORT: hw_breakpoint: Allow watchpoint of length 3,5,6 and 7 (cherry picked from commit 651be3cb085341a21847e47c694c249c3e1e4e5b) We only support breakpoints/watchpoints of length 1, 2, 4 and 8. If we can support other lengths as well, then users may watch more data with fewer watchpoints (provided the hardware supports it). For example: if we have to watch only the 4th, 5th and 6th bytes from a 64 bit aligned address, we currently have to use two slots to implement it. One slot will watch a half word at offset 4 and the other a byte at offset 6. If we can have a watchpoint of length 3, then we can watch it with a single slot as well. ARM64 hardware does support such functionality, so add these new definitions in the generic layer.
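For illustration, with these definitions a single slot can watch bytes 4-6 of a 64-bit aligned variable from user space. A minimal sketch (not part of this patch; error handling omitted, and it assumes a kernel with this change plus hardware support for the new lengths):

#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static long long watched;	/* 64-bit aligned target */

int main(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_BREAKPOINT;
	attr.bp_type = HW_BREAKPOINT_W;
	attr.bp_addr = (unsigned long)&watched + 4;	/* 4th byte */
	attr.bp_len = HW_BREAKPOINT_LEN_3;		/* bytes 4, 5 and 6 */

	/* pid 0 = this task, cpu -1 = any CPU */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0) < 0;
}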
Signed-off-by: Pratyush Anand Signed-off-by: Will Deacon Signed-off-by: Pavel Labath [pavel: tools/include/uapi/linux/hw_breakpoint.h is not present in this branch] Change-Id: Ie17ed89ca526e4fddf591bb4e556fdfb55fc2eac Bug: 30919905 --- include/uapi/linux/hw_breakpoint.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/linux/hw_breakpoint.h b/include/uapi/linux/hw_breakpoint.h index b04000a2296a..2b65efd19a46 100644 --- a/include/uapi/linux/hw_breakpoint.h +++ b/include/uapi/linux/hw_breakpoint.h @@ -4,7 +4,11 @@ enum { HW_BREAKPOINT_LEN_1 = 1, HW_BREAKPOINT_LEN_2 = 2, + HW_BREAKPOINT_LEN_3 = 3, HW_BREAKPOINT_LEN_4 = 4, + HW_BREAKPOINT_LEN_5 = 5, + HW_BREAKPOINT_LEN_6 = 6, + HW_BREAKPOINT_LEN_7 = 7, HW_BREAKPOINT_LEN_8 = 8, }; From 7409857a0717fa78dc936ea08099880be893156c Mon Sep 17 00:00:00 2001 From: Pratyush Anand Date: Mon, 14 Nov 2016 19:32:43 +0530 Subject: [PATCH 0643/3220] UPSTREAM: arm64: Allow hw watchpoint at varied offset from base address ARM64 hardware supports a watchpoint at any double word aligned address. However, it can select any consecutive bytes from offset 0 to 7 from that base address. For example, if the base address is programmed as 0x420030 and the byte select is 0x1C, then accesses of 0x420032, 0x420033 and 0x420034 will generate a watchpoint exception. Currently, we do not have such modularity. We can only program byte, halfword, word and double word access exceptions from any base address. This patch adds support to overcome the above limitations. Signed-off-by: Pratyush Anand Signed-off-by: Will Deacon Signed-off-by: Pavel Labath Change-Id: I28b1ca63f63182c10c3d6b6b3bacf6c56887ddbe Bug: 30919905 --- arch/arm64/include/asm/hw_breakpoint.h | 2 +- arch/arm64/kernel/hw_breakpoint.c | 47 +++++++++++++------------- arch/arm64/kernel/ptrace.c | 7 ++-- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h index 9732908bfc8a..8acfd989a4e4 100644 --- a/arch/arm64/include/asm/hw_breakpoint.h +++ b/arch/arm64/include/asm/hw_breakpoint.h @@ -110,7 +110,7 @@ struct perf_event; struct pmu; extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, - int *gen_len, int *gen_type); + int *gen_len, int *gen_type, int *offset); extern int arch_check_bp_in_kernelspace(struct perf_event *bp); extern int arch_validate_hwbkpt_settings(struct perf_event *bp); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index b45c95d34b83..21cb18ba9ff8 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -345,7 +345,7 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp) * to generic breakpoint descriptions.
*/ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, - int *gen_len, int *gen_type) + int *gen_len, int *gen_type, int *offset) { /* Type */ switch (ctrl.type) { @@ -365,8 +365,12 @@ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, return -EINVAL; } + if (!ctrl.len) + return -EINVAL; + *offset = __ffs(ctrl.len); + /* Len */ - switch (ctrl.len) { + switch (ctrl.len >> *offset) { case ARM_BREAKPOINT_LEN_1: *gen_len = HW_BREAKPOINT_LEN_1; break; @@ -513,18 +517,17 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) default: return -EINVAL; } - - info->address &= ~alignment_mask; - info->ctrl.len <<= offset; } else { if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) alignment_mask = 0x3; else alignment_mask = 0x7; - if (info->address & alignment_mask) - return -EINVAL; + offset = info->address & alignment_mask; } + info->address &= ~alignment_mask; + info->ctrl.len <<= offset; + /* * Disallow per-task kernel breakpoints since these would * complicate the stepping code. @@ -659,8 +662,8 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, struct pt_regs *regs) { int i, step = 0, *kernel_step, access; - u32 ctrl_reg; - u64 val, alignment_mask; + u32 ctrl_reg, lens, lene; + u64 val; struct perf_event *wp, **slots; struct debug_info *debug_info; struct arch_hw_breakpoint *info; @@ -678,25 +681,21 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, goto unlock; info = counter_arch_bp(wp); - /* AArch32 watchpoints are either 4 or 8 bytes aligned. */ - if (is_compat_task()) { - if (info->ctrl.len == ARM_BREAKPOINT_LEN_8) - alignment_mask = 0x7; - else - alignment_mask = 0x3; - } else { - alignment_mask = 0x7; - } - /* Check if the watchpoint value matches. */ + /* Check if the watchpoint value and byte select match. */ val = read_wb_reg(AARCH64_DBG_REG_WVR, i); - if (val != (addr & ~alignment_mask)) - goto unlock; - - /* Possible match, check the byte address select to confirm. */ ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i); decode_ctrl_reg(ctrl_reg, &ctrl); - if (!((1 << (addr & alignment_mask)) & ctrl.len)) + lens = ffs(ctrl.len) - 1; + lene = fls(ctrl.len) - 1; + /* + * FIXME: reported address can be anywhere between "the + * lowest address accessed by the memory access that + * triggered the watchpoint" and "the highest watchpointed + * address accessed by the memory access". So, it may not + * lie in the interval of watchpoint address range. + */ + if (addr < val + lens || addr > val + lene) goto unlock; /* diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 1971f491bb90..4db41b860c18 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -220,13 +220,13 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, struct arch_hw_breakpoint_ctrl ctrl, struct perf_event_attr *attr) { - int err, len, type, disabled = !ctrl.enabled; + int err, len, type, offset, disabled = !ctrl.enabled; attr->disabled = disabled; if (disabled) return 0; - err = arch_bp_generic_fields(ctrl, &len, &type); + err = arch_bp_generic_fields(ctrl, &len, &type, &offset); if (err) return err; @@ -245,6 +245,7 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, attr->bp_len = len; attr->bp_type = type; + attr->bp_addr += offset; return 0; } @@ -297,7 +298,7 @@ static int ptrace_hbp_get_addr(unsigned int note_type, if (IS_ERR(bp)) return PTR_ERR(bp); - *addr = bp ? bp->attr.bp_addr : 0; + *addr = bp ? 
counter_arch_bp(bp)->address : 0; return 0; } From b48318f371e8a8b98238deac868bc7af8ed8ba4b Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Mon, 14 Nov 2016 19:32:44 +0530 Subject: [PATCH 0644/3220] BACKPORT: arm64: hw_breakpoint: Handle inexact watchpoint addresses (cherry picked from commit fdfeff0f9e3d9be2b68fa02566017ffc581ae17b) Arm64 hardware does not always report a watchpoint hit address that matches one of the watchpoints set. It can also report an address "near" the watchpoint if a single instruction accesses both watched and unwatched addresses. There is no straightforward way, short of disassembling the offending instruction, to map that address back to the watchpoint. Previously, when the hardware reported a watchpoint hit on an address that did not match our watchpoint (this happens in the case of instructions which access large chunks of memory, such as "stp"), the process would enter a loop where we would be continually resuming it (because we did not recognise that watchpoint hit) and it would keep hitting the watchpoint again and again. The tracing process would never get notified of the watchpoint hit. This commit fixes the problem by looking at the watchpoints near the address reported by the hardware. If the address does not exactly match one of the watchpoints we have set, it attributes the hit to the nearest watchpoint we have. This heuristic is a bit dodgy, but I don't think we can do much more, given the hardware limitations. Signed-off-by: Pavel Labath [panand: reworked to rebase on his patches] Signed-off-by: Pratyush Anand [will: use __ffs instead of ffs - 1] Signed-off-by: Will Deacon Signed-off-by: Pavel Labath [pavel: trivial fixup in hw_breakpoint.c:watchpoint_handler] Change-Id: I714dfaa3947d89d89a9e9a1ea84914d44ba0faa3 Bug: 30919905 --- arch/arm64/kernel/hw_breakpoint.c | 98 ++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 28 deletions(-) diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 21cb18ba9ff8..6e02c8e4a8d0 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -658,11 +658,46 @@ unlock: return 0; } +/* + * Arm64 hardware does not always report a watchpoint hit address that matches + * one of the watchpoints set. It can also report an address "near" the + * watchpoint if a single instruction access both watched and unwatched + * addresses. There is no straight-forward way, short of disassembling the + * offending instruction, to map that address back to the watchpoint. This + * function computes the distance of the memory access from the watchpoint as a + * heuristic for the likelyhood that a given access triggered the watchpoint. + * + * See Section D2.10.5 "Determining the memory location that caused a Watchpoint + * exception" of ARMv8 Architecture Reference Manual for details. + * + * The function returns the distance of the address from the bytes watched by + * the watchpoint. In case of an exact match, it returns 0.
+ */ +static u64 get_distance_from_watchpoint(unsigned long addr, u64 val, + struct arch_hw_breakpoint_ctrl *ctrl) +{ + u64 wp_low, wp_high; + u32 lens, lene; + + lens = __ffs(ctrl->len); + lene = __fls(ctrl->len); + + wp_low = val + lens; + wp_high = val + lene; + if (addr < wp_low) + return wp_low - addr; + else if (addr > wp_high) + return addr - wp_high; + else + return 0; +} + static int watchpoint_handler(unsigned long addr, unsigned int esr, struct pt_regs *regs) { - int i, step = 0, *kernel_step, access; - u32 ctrl_reg, lens, lene; + int i, step = 0, *kernel_step, access, closest_match = 0; + u64 min_dist = -1, dist; + u32 ctrl_reg; u64 val; struct perf_event *wp, **slots; struct debug_info *debug_info; @@ -672,31 +707,15 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, slots = this_cpu_ptr(wp_on_reg); debug_info = &current->thread.debug; + /* + * Find all watchpoints that match the reported address. If no exact + * match is found. Attribute the hit to the closest watchpoint. + */ + rcu_read_lock(); for (i = 0; i < core_num_wrps; ++i) { - rcu_read_lock(); - wp = slots[i]; - if (wp == NULL) - goto unlock; + continue; /* * Check that the access type matches. @@ -705,18 +724,41 @@ access = (esr & AARCH64_ESR_ACCESS_MASK) ? HW_BREAKPOINT_W : HW_BREAKPOINT_R; if (!(access & hw_breakpoint_type(wp))) - goto unlock; + continue; + /* Check if the watchpoint value and byte select match. */ + val = read_wb_reg(AARCH64_DBG_REG_WVR, i); + ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i); + decode_ctrl_reg(ctrl_reg, &ctrl); + dist = get_distance_from_watchpoint(addr, val, &ctrl); + if (dist < min_dist) { + min_dist = dist; + closest_match = i; + } + /* Is this an exact match? */ + if (dist != 0) + continue; + + info = counter_arch_bp(wp); info->trigger = addr; perf_bp_event(wp, regs); /* Do we need to handle the stepping? */ if (!wp->overflow_handler) step = 1; - -unlock: - rcu_read_unlock(); } + if (min_dist > 0 && min_dist != -1) { + /* No exact match found. */ + wp = slots[closest_match]; + info = counter_arch_bp(wp); + info->trigger = addr; + perf_bp_event(wp, regs); + + /* Do we need to handle the stepping? */ + if (!wp->overflow_handler) + step = 1; + } + rcu_read_unlock(); if (!step) return 0; From edc166a8714b012a3dd207e437c772ae2a264eca Mon Sep 17 00:00:00 2001 From: Pratyush Anand Date: Mon, 14 Nov 2016 19:32:45 +0530 Subject: [PATCH 0645/3220] UPSTREAM: arm64: Allow hw watchpoint of length 3,5,6 and 7 (cherry picked from commit 0ddb8e0b784ba034f3096d5a54684d0d73155e2a) Since arm64 can support any offset within a double word limit, now support other lengths within that range as well.
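The underlying encoding is the byte address select field: the length mask shifted by the watchpoint's offset within the double word, which is what arch_validate_hwbkpt_settings() computes above with info->ctrl.len <<= offset. A standalone sketch of that computation (illustration only, not part of this patch):

/*
 * BAS mask for a watchpoint of 'len' bytes (1-8) starting at 'addr' and
 * staying within one double word. ARM_BREAKPOINT_LEN_3 is 0x7, so bytes
 * 4-6 give 0x7 << 4 == 0x70; the 0x1C byte select from the earlier
 * example is len 3 at offset 2.
 */
static unsigned int bas_mask(unsigned long addr, unsigned int len)
{
	unsigned int offset = addr & 0x7;	/* offset in the double word */

	return ((1U << len) - 1) << offset;
}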
Signed-off-by: Pratyush Anand Signed-off-by: Will Deacon Signed-off-by: Pavel Labath Change-Id: Ibcb263a3903572336ccbf96e0180d3990326545a Bug: 30919905 --- arch/arm64/include/asm/hw_breakpoint.h | 4 +++ arch/arm64/kernel/hw_breakpoint.c | 36 ++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h index 8acfd989a4e4..c72b8e201ab4 100644 --- a/arch/arm64/include/asm/hw_breakpoint.h +++ b/arch/arm64/include/asm/hw_breakpoint.h @@ -68,7 +68,11 @@ static inline void decode_ctrl_reg(u32 reg, /* Lengths */ #define ARM_BREAKPOINT_LEN_1 0x1 #define ARM_BREAKPOINT_LEN_2 0x3 +#define ARM_BREAKPOINT_LEN_3 0x7 #define ARM_BREAKPOINT_LEN_4 0xf +#define ARM_BREAKPOINT_LEN_5 0x1f +#define ARM_BREAKPOINT_LEN_6 0x3f +#define ARM_BREAKPOINT_LEN_7 0x7f #define ARM_BREAKPOINT_LEN_8 0xff /* Kernel stepping */ diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 6e02c8e4a8d0..2d2792a714ad 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -313,9 +313,21 @@ static int get_hbp_len(u8 hbp_len) case ARM_BREAKPOINT_LEN_2: len_in_bytes = 2; break; + case ARM_BREAKPOINT_LEN_3: + len_in_bytes = 3; + break; case ARM_BREAKPOINT_LEN_4: len_in_bytes = 4; break; + case ARM_BREAKPOINT_LEN_5: + len_in_bytes = 5; + break; + case ARM_BREAKPOINT_LEN_6: + len_in_bytes = 6; + break; + case ARM_BREAKPOINT_LEN_7: + len_in_bytes = 7; + break; case ARM_BREAKPOINT_LEN_8: len_in_bytes = 8; break; @@ -377,9 +389,21 @@ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, case ARM_BREAKPOINT_LEN_2: *gen_len = HW_BREAKPOINT_LEN_2; break; + case ARM_BREAKPOINT_LEN_3: + *gen_len = HW_BREAKPOINT_LEN_3; + break; case ARM_BREAKPOINT_LEN_4: *gen_len = HW_BREAKPOINT_LEN_4; break; + case ARM_BREAKPOINT_LEN_5: + *gen_len = HW_BREAKPOINT_LEN_5; + break; + case ARM_BREAKPOINT_LEN_6: + *gen_len = HW_BREAKPOINT_LEN_6; + break; + case ARM_BREAKPOINT_LEN_7: + *gen_len = HW_BREAKPOINT_LEN_7; + break; case ARM_BREAKPOINT_LEN_8: *gen_len = HW_BREAKPOINT_LEN_8; break; @@ -423,9 +447,21 @@ static int arch_build_bp_info(struct perf_event *bp) case HW_BREAKPOINT_LEN_2: info->ctrl.len = ARM_BREAKPOINT_LEN_2; break; + case HW_BREAKPOINT_LEN_3: + info->ctrl.len = ARM_BREAKPOINT_LEN_3; + break; case HW_BREAKPOINT_LEN_4: info->ctrl.len = ARM_BREAKPOINT_LEN_4; break; + case HW_BREAKPOINT_LEN_5: + info->ctrl.len = ARM_BREAKPOINT_LEN_5; + break; + case HW_BREAKPOINT_LEN_6: + info->ctrl.len = ARM_BREAKPOINT_LEN_6; + break; + case HW_BREAKPOINT_LEN_7: + info->ctrl.len = ARM_BREAKPOINT_LEN_7; + break; case HW_BREAKPOINT_LEN_8: info->ctrl.len = ARM_BREAKPOINT_LEN_8; break; From 0213f79a418915f2db36572ae86b748438ed635b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Aug 2016 00:34:58 -0400 Subject: [PATCH 0646/3220] UPSTREAM: udp: properly support MSG_PEEK with truncated buffers [ Upstream commit 197c949e7798fbf28cfadc69d9ca0c2abbf93191 ] Backporting upstream commit 89c22d8c3b27 ("net: Fix skb csum races when peeking") into stable kernels exposed a bug in the udp stack's MSG_PEEK support when the user provides a buffer smaller than the skb payload. In this case, skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); returns -EFAULT. This bug does not happen in upstream kernels, since Al Viro did a great job replacing this with: skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), msg); This variant is safe vs short buffers.
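In user space the failing case boils down to a peek with a buffer smaller than the queued datagram. A minimal sketch (illustration only, not part of this patch; whether the bad path is exercised depends on the skb checksum state, e.g. loopback traffic may be marked CHECKSUM_UNNECESSARY and never hit it):

#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in sa = { .sin_family = AF_INET };
	socklen_t salen = sizeof(sa);
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	char small[4];
	ssize_t n;

	sa.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	bind(fd, (struct sockaddr *)&sa, sizeof(sa));	/* ephemeral port */
	getsockname(fd, (struct sockaddr *)&sa, &salen);
	sendto(fd, "0123456789", 10, 0, (struct sockaddr *)&sa, salen);

	n = recv(fd, small, sizeof(small), MSG_PEEK);	/* buffer < payload */
	printf("peek returned %zd\n", n);	/* expect 4, not -1/EFAULT */
	close(fd);
	return 0;
}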
For the time being, instead of reverting Herbert Xu's patch and adding back the invalid skb->ip_summed changes, simply store the result of udp_lib_checksum_complete() so that we avoid computing the checksum a second time, and avoid the problematic skb_copy_and_csum_datagram_iovec() call. This patch can be applied to recent kernels, as it avoids double checksumming, and then backported to stable kernels as a bug fix. Signed-off-by: Eric Dumazet Acked-by: Herbert Xu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman (cherry picked from commit dfe2042d96065f044a794f684e9f7976a4ca6e24) Bug: 32813456 --- net/ipv4/udp.c | 6 ++++-- net/ipv6/udp.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8acf544794a1..8271c01be33e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1274,6 +1274,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int peeked, off = 0; int err; int is_udplite = IS_UDPLITE(sk); + bool checksum_valid = false; bool slow; if (flags & MSG_ERRQUEUE) @@ -1299,11 +1300,12 @@ try_again: */ if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { - if (udp_lib_checksum_complete(skb)) + checksum_valid = !udp_lib_checksum_complete(skb); + if (!checksum_valid) goto csum_copy_err; } - if (skb_csum_unnecessary(skb)) + if (checksum_valid || skb_csum_unnecessary(skb)) err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), msg, copied); else { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 0890fd6d4248..d46bb67c3001 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -402,6 +402,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int peeked, off = 0; int err; int is_udplite = IS_UDPLITE(sk); + bool checksum_valid = false; int is_udp4; bool slow; @@ -433,11 +434,12 @@ try_again: */ if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { - if (udp_lib_checksum_complete(skb)) + checksum_valid = !udp_lib_checksum_complete(skb); + if (!checksum_valid) goto csum_copy_err; } - if (skb_csum_unnecessary(skb)) + if (checksum_valid || skb_csum_unnecessary(skb)) err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), msg, copied); else { From 49b60d4aa95aa0519238a06fde5c838146742796 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 19 Sep 2016 17:39:09 +0200 Subject: [PATCH 0647/3220] BACKPORT: posix_acl: Clear SGID bit when setting file permissions (cherry pick from commit 073931017b49d9458aa351605b43a7e34598caef) When file permissions are modified via chmod(2) and the user is not in the owning group or capable of CAP_FSETID, the setgid bit is cleared in inode_change_ok(). Setting a POSIX ACL via setxattr(2) sets the file permissions as well as the new ACL, but doesn't clear the setgid bit in a similar way; this allows bypassing the check in chmod(2). Fix that. NB: We did not resolve the ACL leak in this CL; an additional upstream fix is required.
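Every filesystem converted below follows the same calling convention; as a standalone sketch of the ACL_TYPE_ACCESS branch (the examplefs name is hypothetical, illustration only):

/*
 * posix_acl_update_mode() computes the new mode bits, clears SGID when
 * the caller is not in the owning group and lacks CAP_FSETID, and sets
 * *acl to NULL when the ACL is fully representable in the mode bits.
 */
static int examplefs_set_access_acl(struct inode *inode, struct posix_acl **acl)
{
	int error;

	if (*acl) {
		error = posix_acl_update_mode(inode, &inode->i_mode, acl);
		if (error)
			return error;
		mark_inode_dirty(inode);	/* as ext2/ext4 do below */
	}
	return 0;
}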
References: CVE-2016-7097 Reviewed-by: Christoph Hellwig Reviewed-by: Jeff Layton Signed-off-by: Jan Kara Signed-off-by: Andreas Gruenbacher Bug: 32458736 Change-Id: I19591ad452cc825ac282b3cfd2daaa72aa9a1ac1 --- fs/9p/acl.c | 40 +++++++++++++++++---------------------- fs/btrfs/acl.c | 6 ++---- fs/ceph/acl.c | 6 ++---- fs/ext2/acl.c | 12 ++++-------- fs/ext4/acl.c | 12 ++++-------- fs/f2fs/acl.c | 6 ++---- fs/gfs2/acl.c | 12 +++--------- fs/hfsplus/posix_acl.c | 4 ++-- fs/jffs2/acl.c | 9 ++++----- fs/jfs/acl.c | 6 ++---- fs/ocfs2/acl.c | 10 ++++------ fs/posix_acl.c | 31 ++++++++++++++++++++++++++++++ fs/reiserfs/xattr_acl.c | 8 ++------ fs/xfs/xfs_acl.c | 13 ++++--------- include/linux/posix_acl.h | 1 + 15 files changed, 84 insertions(+), 92 deletions(-) diff --git a/fs/9p/acl.c b/fs/9p/acl.c index a7e28890f5ef..929b618da43b 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -282,32 +282,26 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, switch (handler->flags) { case ACL_TYPE_ACCESS: if (acl) { - umode_t mode = inode->i_mode; - retval = posix_acl_equiv_mode(acl, &mode); - if (retval < 0) + struct iattr iattr; + + retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); + if (retval) goto err_out; - else { - struct iattr iattr; - if (retval == 0) { - /* - * ACL can be represented - * by the mode bits. So don't - * update ACL. - */ - acl = NULL; - value = NULL; - size = 0; - } - /* Updte the mode bits */ - iattr.ia_mode = ((mode & S_IALLUGO) | - (inode->i_mode & ~S_IALLUGO)); - iattr.ia_valid = ATTR_MODE; - /* FIXME should we update ctime ? - * What is the following setxattr update the - * mode ? + if (!acl) { + /* + * ACL can be represented + * by the mode bits. So don't + * update ACL. */ - v9fs_vfs_setattr_dotl(dentry, &iattr); + value = NULL; + size = 0; } + iattr.ia_valid = ATTR_MODE; + /* FIXME should we update ctime ? + * What is the following setxattr update the + * mode ? 
+ */ + v9fs_vfs_setattr_dotl(dentry, &iattr); } break; case ACL_TYPE_DEFAULT: diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 9a0124a95851..fb3e64d37cb4 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -83,11 +83,9 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; if (acl) { - ret = posix_acl_equiv_mode(acl, &inode->i_mode); - if (ret < 0) + ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (ret) return ret; - if (ret == 0) - acl = NULL; } ret = 0; break; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 8f84646f10e9..4d8caeb94a11 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -94,11 +94,9 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; if (acl) { - ret = posix_acl_equiv_mode(acl, &new_mode); - if (ret < 0) + ret = posix_acl_update_mode(inode, &new_mode, &acl); + if (ret) goto out; - if (ret == 0) - acl = NULL; } break; case ACL_TYPE_DEFAULT: diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 27695e6f4e46..d6aeb84e90b6 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -193,15 +193,11 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - else { - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - if (error == 0) - acl = NULL; - } + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); } break; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 69b1e73026a5..c3fe1e323951 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -196,15 +196,11 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - else { - inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - if (error == 0) - acl = NULL; - } + inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); } break; diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index c8f25f7241f0..e9a8d676c6bc 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -214,12 +214,10 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; set_acl_inode(fi, inode->i_mode); - if (error == 0) - acl = NULL; } break; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 1be3b061c05c..ff0ac96a8e7b 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -79,17 +79,11 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (type == ACL_TYPE_ACCESS) { umode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - - if (error == 0) - acl = NULL; - - if (mode != inode->i_mode) { - inode->i_mode = mode; + if (mode != inode->i_mode) mark_inode_dirty(inode); - } } if (acl) { diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index df0c9af68d05..71b3087b7e32 100644 --- 
a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -68,8 +68,8 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, case ACL_TYPE_ACCESS: xattr_name = POSIX_ACL_XATTR_ACCESS; if (acl) { - err = posix_acl_equiv_mode(acl, &inode->i_mode); - if (err < 0) + err = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (err) return err; } err = 0; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 2f7a3c090489..f9f86f87d32b 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -235,9 +235,10 @@ int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; if (acl) { - umode_t mode = inode->i_mode; - rc = posix_acl_equiv_mode(acl, &mode); - if (rc < 0) + umode_t mode; + + rc = posix_acl_update_mode(inode, &mode, &acl); + if (rc) return rc; if (inode->i_mode != mode) { struct iattr attr; @@ -249,8 +250,6 @@ int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (rc < 0) return rc; } - if (rc == 0) - acl = NULL; } break; case ACL_TYPE_DEFAULT: diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 0c8ca830b113..9fad9f4fe883 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -84,13 +84,11 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, case ACL_TYPE_ACCESS: ea_name = POSIX_ACL_XATTR_ACCESS; if (acl) { - rc = posix_acl_equiv_mode(acl, &inode->i_mode); - if (rc < 0) + rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (rc) return rc; inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - if (rc == 0) - acl = NULL; } break; case ACL_TYPE_DEFAULT: diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 0cdf497c91ef..18f0c9afab66 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -241,13 +241,11 @@ int ocfs2_set_acl(handle_t *handle, case ACL_TYPE_ACCESS: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - umode_t mode = inode->i_mode; - ret = posix_acl_equiv_mode(acl, &mode); - if (ret < 0) - return ret; + umode_t mode; - if (ret == 0) - acl = NULL; + ret = posix_acl_update_mode(inode, &mode, &acl); + if (ret) + return ret; ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 4adde1e2cbec..31f6a17327a8 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -592,6 +592,37 @@ no_mem: } EXPORT_SYMBOL_GPL(posix_acl_create); +/** + * posix_acl_update_mode - update mode in set_acl + * + * Update the file mode when setting an ACL: compute the new file permission + * bits based on the ACL. In addition, if the ACL is equivalent to the new + * file mode, set *acl to NULL to indicate that no ACL should be set. + * + * As with chmod, clear the setgid bit if the caller is not in the owning group + * or capable of CAP_FSETID (see inode_change_ok). + * + * Called from set_acl inode operations. + */ +int posix_acl_update_mode(struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} +EXPORT_SYMBOL(posix_acl_update_mode); + /* * Fix up the uids and gids in posix acl extended attributes in place.
*/ diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 4b34b9dc03dd..9b1824f35501 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -246,13 +246,9 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - else { - if (error == 0) - acl = NULL; - } } break; case ACL_TYPE_DEFAULT: diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 6bb470fbb8e8..c5101a3295d8 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -288,16 +288,11 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) return error; if (type == ACL_TYPE_ACCESS) { - umode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); - - if (error <= 0) { - acl = NULL; - - if (error < 0) - return error; - } + umode_t mode; + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) + return error; error = xfs_set_mode(inode, mode); if (error) return error; diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 3e96a6a76103..d1a8ad7e5ae4 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -95,6 +95,7 @@ extern int set_posix_acl(struct inode *, int, struct posix_acl *); extern int posix_acl_chmod(struct inode *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, struct posix_acl **); +extern int posix_acl_update_mode(struct inode *, umode_t *, struct posix_acl **); extern int simple_set_acl(struct inode *, struct posix_acl *, int); extern int simple_acl_create(struct inode *, struct inode *); From dc2ad0661d6967e307bcdfa9172f681d21ea9a61 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 13 Dec 2016 10:33:34 -0800 Subject: [PATCH 0648/3220] FROMLIST: 9p: fix a potential acl leak (https://lkml.org/lkml/2016/12/13/579) posix_acl_update_mode() can clear 'acl'; if so, we leak the memory pointed to by 'acl'. Save this pointer before calling posix_acl_update_mode() and release the memory if 'acl' really gets cleared. Reported-by: Mark Salyzyn Reviewed-by: Jan Kara Reviewed-by: Greg Kurz Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Cong Wang Bug: 32458736 Change-Id: Ia78da401e6fd1bfd569653bd2cd0ebd3f9c737a0 --- fs/9p/acl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 929b618da43b..c30c6ceac2c4 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -283,6 +283,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, case ACL_TYPE_ACCESS: if (acl) { struct iattr iattr; + struct posix_acl *old_acl = acl; retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); if (retval) goto err_out; @@ -293,6 +294,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, * by the mode bits. So don't * update ACL.
*/ + posix_acl_release(old_acl); value = NULL; size = 0; } From d50112645714fc60c8f144710cfb595181494a8d Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 6 Feb 2017 14:27:24 -0800 Subject: [PATCH 0649/3220] ANDROID: android-recommended.cfg: CONFIG_CPU_SW_DOMAIN_PAN=y Bug: 31374660 Change-Id: Id2710a5fa2694da66d3f34cbcc0c2a58a006cec5 Signed-off-by: Sami Tolvanen --- android/configs/android-recommended.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index 70aaae17ad29..28610303db60 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -15,6 +15,7 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_CC_STACKPROTECTOR_STRONG=y CONFIG_COMPACTION=y +CONFIG_CPU_SW_DOMAIN_PAN=y CONFIG_DEBUG_RODATA=y CONFIG_DM_UEVENT=y CONFIG_DRAGONRISE_FF=y From 50dcddba6c11f2156b4dfefa87e86d961f46b89e Mon Sep 17 00:00:00 2001 From: Adrien Schildknecht Date: Wed, 28 Sep 2016 12:14:39 -0700 Subject: [PATCH 0650/3220] Squashfs: remove the FILE_CACHE option FILE_DIRECT works fine and offers faster results and a lower memory footprint. Removing FILE_CACHE makes our life easier because we don't have to maintain two different functions that do the same thing. Signed-off-by: Adrien Schildknecht Change-Id: I6689ba74d0042c222a806f9edc539995e8e04c6b --- fs/squashfs/Kconfig | 28 --------------------------- fs/squashfs/Makefile | 3 +-- fs/squashfs/file_cache.c | 38 ------------------------------------ fs/squashfs/page_actor.h | 42 +--------------------------------------- 4 files changed, 2 insertions(+), 109 deletions(-) delete mode 100644 fs/squashfs/file_cache.c diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index ffb093e72b6c..6dd158a216f4 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -25,34 +25,6 @@ config SQUASHFS If unsure, say N. -choice - prompt "File decompression options" - depends on SQUASHFS - help - Squashfs now supports two options for decompressing file - data. Traditionally Squashfs has decompressed into an - intermediate buffer and then memcopied it into the page cache. - Squashfs now supports the ability to decompress directly into - the page cache. - - If unsure, select "Decompress file data into an intermediate buffer" - -config SQUASHFS_FILE_CACHE - bool "Decompress file data into an intermediate buffer" - help - Decompress file data into an intermediate buffer and then - memcopy it into the page cache. - -config SQUASHFS_FILE_DIRECT - bool "Decompress files directly into the page cache" - help - Directly decompress file data into the page cache. - Doing so can significantly improve performance because - it eliminates a memcpy and it also removes the lock contention - on the single buffer.
- -endchoice - choice prompt "Decompressor parallelisation options" depends on SQUASHFS diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 246a6f329d89..fe51f1507ed1 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -5,8 +5,7 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o squashfs-y += namei.o super.o symlink.o decompressor.o -squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o -squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o +squashfs-y += file_direct.o page_actor.o squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o diff --git a/fs/squashfs/file_cache.c b/fs/squashfs/file_cache.c deleted file mode 100644 index f2310d2a2019..000000000000 --- a/fs/squashfs/file_cache.c +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2013 - * Phillip Lougher - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "squashfs_fs.h" -#include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" -#include "squashfs.h" - -/* Read separately compressed datablock and memcopy into page cache */ -int squashfs_readpage_block(struct page *page, u64 block, int bsize) -{ - struct inode *i = page->mapping->host; - struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, - block, bsize); - int res = buffer->error; - - if (res) - ERROR("Unable to read page, block %llx, size %x\n", block, - bsize); - else - squashfs_copy_cache(page, buffer, buffer->length, 0); - - squashfs_cache_put(buffer); - return res; -} diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 26dd82008b82..d2df0544e0df 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -8,46 +8,6 @@ * the COPYING file in the top-level directory. */ -#ifndef CONFIG_SQUASHFS_FILE_DIRECT -struct squashfs_page_actor { - void **page; - int pages; - int length; - int next_page; -}; - -static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page, - int pages, int length) -{ - struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); - - if (actor == NULL) - return NULL; - - actor->length = length ? : pages * PAGE_CACHE_SIZE; - actor->page = page; - actor->pages = pages; - actor->next_page = 0; - return actor; -} - -static inline void *squashfs_first_page(struct squashfs_page_actor *actor) -{ - actor->next_page = 1; - return actor->page[0]; -} - -static inline void *squashfs_next_page(struct squashfs_page_actor *actor) -{ - return actor->next_page == actor->pages ? NULL : - actor->page[actor->next_page++]; -} - -static inline void squashfs_finish_page(struct squashfs_page_actor *actor) -{ - /* empty */ -} -#else struct squashfs_page_actor { union { void **buffer; @@ -77,5 +37,5 @@ static inline void squashfs_finish_page(struct squashfs_page_actor *actor) { actor->squashfs_finish_page(actor); } -#endif + #endif From 417aca479b15292460e981f969d3bcb7fb89517c Mon Sep 17 00:00:00 2001 From: Adrien Schildknecht Date: Wed, 28 Sep 2016 13:59:18 -0700 Subject: [PATCH 0651/3220] Squashfs: refactor page_actor This patch essentially does 3 things: 1/ Always use an array of pages to store the data instead of a mix of buffers and pages.
2/ It is now possible to have 'holes' in a page actor, i.e. NULL pages in the array. When reading a block (default 128K), squashfs tries to grab all the pages covering this block. If a single page is up-to-date or locked, it falls back to using an intermediate buffer to do the read and then copies the data into the actor's pages. Allowing holes in the page actor removes the need for this intermediate buffer. 3/ Refactor the wrappers to share code that deals with page actors. Signed-off-by: Adrien Schildknecht Change-Id: I98128bed5d518cf31b67e788a85b275e9a323bec --- fs/squashfs/cache.c | 73 +++++-------- fs/squashfs/decompressor.c | 53 ++++----- fs/squashfs/file_direct.c | 4 +- fs/squashfs/lz4_wrapper.c | 32 +----- fs/squashfs/lzo_wrapper.c | 40 ++----- fs/squashfs/page_actor.c | 201 ++++++++++++++++++++++------------- fs/squashfs/page_actor.h | 64 +++++++---- fs/squashfs/squashfs_fs_sb.h | 2 +- fs/squashfs/xz_wrapper.c | 15 ++- fs/squashfs/zlib_wrapper.c | 14 ++- 10 files changed, 270 insertions(+), 228 deletions(-) diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 1cb70a0b2168..6785d086ab38 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -209,17 +209,14 @@ void squashfs_cache_put(struct squashfs_cache_entry *entry) */ void squashfs_cache_delete(struct squashfs_cache *cache) { - int i, j; + int i; if (cache == NULL) return; for (i = 0; i < cache->entries; i++) { - if (cache->entry[i].data) { - for (j = 0; j < cache->pages; j++) - kfree(cache->entry[i].data[j]); - kfree(cache->entry[i].data); - } + if (cache->entry[i].page) + free_page_array(cache->entry[i].page, cache->pages); kfree(cache->entry[i].actor); } @@ -236,7 +233,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache) struct squashfs_cache *squashfs_cache_init(char *name, int entries, int block_size) { - int i, j; + int i; struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL); if (cache == NULL) { @@ -268,22 +265,13 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries, init_waitqueue_head(&cache->entry[i].wait_queue); entry->cache = cache; entry->block = SQUASHFS_INVALID_BLK; - entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL); - if (entry->data == NULL) { + entry->page = alloc_page_array(cache->pages, GFP_KERNEL); + if (!entry->page) { ERROR("Failed to allocate %s cache entry\n", name); goto cleanup; } - - for (j = 0; j < cache->pages; j++) { - entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); - if (entry->data[j] == NULL) { - ERROR("Failed to allocate %s buffer\n", name); - goto cleanup; - } - } - - entry->actor = squashfs_page_actor_init(entry->data, - cache->pages, 0); + entry->actor = squashfs_page_actor_init(entry->page, + cache->pages, 0, NULL); if (entry->actor == NULL) { ERROR("Failed to allocate %s cache entry\n", name); goto cleanup; @@ -314,18 +302,20 @@ int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry, return min(length, entry->length - offset); while (offset < entry->length) { - void *buff = entry->data[offset / PAGE_CACHE_SIZE] - + (offset % PAGE_CACHE_SIZE); + void *buff = kmap_atomic(entry->page[offset / PAGE_CACHE_SIZE]) + + (offset % PAGE_CACHE_SIZE); int bytes = min_t(int, entry->length - offset, PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE)); if (bytes >= remaining) { memcpy(buffer, buff, remaining); + kunmap_atomic(buff); remaining = 0; break; } memcpy(buffer, buff, bytes); + kunmap_atomic(buff); buffer += bytes; remaining -= bytes; offset += bytes; @@ -416,43 +406,38 @@ struct squashfs_cache_entry
*squashfs_get_datablock(struct super_block *sb, void *squashfs_read_table(struct super_block *sb, u64 block, int length) { int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - int i, res; - void *table, *buffer, **data; + struct page **page; + void *buff; + int res; struct squashfs_page_actor *actor; - table = buffer = kmalloc(length, GFP_KERNEL); - if (table == NULL) + page = alloc_page_array(pages, GFP_KERNEL); + if (!page) return ERR_PTR(-ENOMEM); - data = kcalloc(pages, sizeof(void *), GFP_KERNEL); - if (data == NULL) { + actor = squashfs_page_actor_init(page, pages, length, NULL); + if (actor == NULL) { res = -ENOMEM; goto failed; } - actor = squashfs_page_actor_init(data, pages, length); - if (actor == NULL) { - res = -ENOMEM; - goto failed2; - } - - for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) - data[i] = buffer; - res = squashfs_read_data(sb, block, length | SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, actor); - kfree(data); - kfree(actor); - if (res < 0) - goto failed; + goto failed2; - return table; + buff = kmalloc(length, GFP_KERNEL); + if (!buff) + goto failed2; + squashfs_actor_to_buf(actor, buff, length); + squashfs_page_actor_free(actor, 0); + free_page_array(page, pages); + return buff; failed2: - kfree(data); + squashfs_page_actor_free(actor, 0); failed: - kfree(table); + free_page_array(page, pages); return ERR_PTR(res); } diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index e9034bf6e5ae..7de35bf297aa 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -24,7 +24,8 @@ #include #include #include -#include +#include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -94,40 +95,44 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id) static void *get_comp_opts(struct super_block *sb, unsigned short flags) { struct squashfs_sb_info *msblk = sb->s_fs_info; - void *buffer = NULL, *comp_opts; + void *comp_opts, *buffer = NULL; + struct page *page; struct squashfs_page_actor *actor = NULL; int length = 0; + if (!SQUASHFS_COMP_OPTS(flags)) + return squashfs_comp_opts(msblk, buffer, length); + /* * Read decompressor specific options from file system if present */ - if (SQUASHFS_COMP_OPTS(flags)) { - buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); - if (buffer == NULL) { - comp_opts = ERR_PTR(-ENOMEM); - goto out; - } - actor = squashfs_page_actor_init(&buffer, 1, 0); - if (actor == NULL) { - comp_opts = ERR_PTR(-ENOMEM); - goto out; - } + page = alloc_page(GFP_KERNEL); + if (!page) + return ERR_PTR(-ENOMEM); - length = squashfs_read_data(sb, - sizeof(struct squashfs_super_block), 0, NULL, actor); - - if (length < 0) { - comp_opts = ERR_PTR(length); - goto out; - } + actor = squashfs_page_actor_init(&page, 1, 0, NULL); + if (actor == NULL) { + comp_opts = ERR_PTR(-ENOMEM); + goto actor_error; } - comp_opts = squashfs_comp_opts(msblk, buffer, length); + length = squashfs_read_data(sb, + sizeof(struct squashfs_super_block), 0, NULL, actor); -out: - kfree(actor); - kfree(buffer); + if (length < 0) { + comp_opts = ERR_PTR(length); + goto read_error; + } + + buffer = kmap_atomic(page); + comp_opts = squashfs_comp_opts(msblk, buffer, length); + kunmap_atomic(buffer); + +read_error: + squashfs_page_actor_free(actor, 0); +actor_error: + __free_page(page); return comp_opts; } diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 43e7a7eddac0..9d033ec0f133 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -52,7 +52,7 @@ int squashfs_readpage_block(struct 
page *target_page, u64 block, int bsize) * Create a "page actor" which will kmap and kunmap the * page cache pages appropriately within the decompressor */ - actor = squashfs_page_actor_init_special(page, pages, 0); + actor = squashfs_page_actor_init(page, pages, 0, NULL); if (actor == NULL) goto out; @@ -131,7 +131,7 @@ mark_errored: } out: - kfree(actor); + squashfs_page_actor_free(actor, 0); kfree(page); return res; } diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index c31e2bc9c081..df4fa3c7ddd0 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -94,39 +94,17 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, struct buffer_head **bh, int b, int offset, int length, struct squashfs_page_actor *output) { - struct squashfs_lz4 *stream = strm; - void *buff = stream->input, *data; - int avail, i, bytes = length, res; + int res; size_t dest_len = output->length; + struct squashfs_lz4 *stream = strm; - for (i = 0; i < b; i++) { - avail = min(bytes, msblk->devblksize - offset); - memcpy(buff, bh[i]->b_data + offset, avail); - buff += avail; - bytes -= avail; - offset = 0; - put_bh(bh[i]); - } - + squashfs_bh_to_buf(bh, b, stream->input, offset, length, + msblk->devblksize); res = lz4_decompress_unknownoutputsize(stream->input, length, stream->output, &dest_len); if (res) return -EIO; - - bytes = dest_len; - data = squashfs_first_page(output); - buff = stream->output; - while (data) { - if (bytes <= PAGE_CACHE_SIZE) { - memcpy(data, buff, bytes); - break; - } - memcpy(data, buff, PAGE_CACHE_SIZE); - buff += PAGE_CACHE_SIZE; - bytes -= PAGE_CACHE_SIZE; - data = squashfs_next_page(output); - } - squashfs_finish_page(output); + squashfs_buf_to_actor(stream->output, output, dest_len); return dest_len; } diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 244b9fbfff7b..2c844d53a59e 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -79,45 +79,19 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, struct buffer_head **bh, int b, int offset, int length, struct squashfs_page_actor *output) { - struct squashfs_lzo *stream = strm; - void *buff = stream->input, *data; - int avail, i, bytes = length, res; + int res; size_t out_len = output->length; + struct squashfs_lzo *stream = strm; - for (i = 0; i < b; i++) { - avail = min(bytes, msblk->devblksize - offset); - memcpy(buff, bh[i]->b_data + offset, avail); - buff += avail; - bytes -= avail; - offset = 0; - put_bh(bh[i]); - } - + squashfs_bh_to_buf(bh, b, stream->input, offset, length, + msblk->devblksize); res = lzo1x_decompress_safe(stream->input, (size_t)length, stream->output, &out_len); if (res != LZO_E_OK) - goto failed; + return -EIO; + squashfs_buf_to_actor(stream->output, output, out_len); - res = bytes = (int)out_len; - data = squashfs_first_page(output); - buff = stream->output; - while (data) { - if (bytes <= PAGE_CACHE_SIZE) { - memcpy(data, buff, bytes); - break; - } else { - memcpy(data, buff, PAGE_CACHE_SIZE); - buff += PAGE_CACHE_SIZE; - bytes -= PAGE_CACHE_SIZE; - data = squashfs_next_page(output); - } - } - squashfs_finish_page(output); - - return res; - -failed: - return -EIO; + return out_len; } const struct squashfs_decompressor squashfs_lzo_comp_ops = { diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index 5a1c11f56441..53863508e400 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -9,79 +9,11 @@ #include #include #include +#include #include "page_actor.h" -/* - * 
This file contains implementations of page_actor for decompressing into - * an intermediate buffer, and for decompressing directly into the - * page cache. - * - * Calling code should avoid sleeping between calls to squashfs_first_page() - * and squashfs_finish_page(). - */ - -/* Implementation of page_actor for decompressing into intermediate buffer */ -static void *cache_first_page(struct squashfs_page_actor *actor) -{ - actor->next_page = 1; - return actor->buffer[0]; -} - -static void *cache_next_page(struct squashfs_page_actor *actor) -{ - if (actor->next_page == actor->pages) - return NULL; - - return actor->buffer[actor->next_page++]; -} - -static void cache_finish_page(struct squashfs_page_actor *actor) -{ - /* empty */ -} - -struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, - int pages, int length) -{ - struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); - - if (actor == NULL) - return NULL; - - actor->length = length ? : pages * PAGE_CACHE_SIZE; - actor->buffer = buffer; - actor->pages = pages; - actor->next_page = 0; - actor->squashfs_first_page = cache_first_page; - actor->squashfs_next_page = cache_next_page; - actor->squashfs_finish_page = cache_finish_page; - return actor; -} - -/* Implementation of page_actor for decompressing directly into page cache. */ -static void *direct_first_page(struct squashfs_page_actor *actor) -{ - actor->next_page = 1; - return actor->pageaddr = kmap_atomic(actor->page[0]); -} - -static void *direct_next_page(struct squashfs_page_actor *actor) -{ - if (actor->pageaddr) - kunmap_atomic(actor->pageaddr); - - return actor->pageaddr = actor->next_page == actor->pages ? NULL : - kmap_atomic(actor->page[actor->next_page++]); -} - -static void direct_finish_page(struct squashfs_page_actor *actor) -{ - if (actor->pageaddr) - kunmap_atomic(actor->pageaddr); -} - -struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, - int pages, int length) +struct squashfs_page_actor *squashfs_page_actor_init(struct page **page, + int pages, int length, void (*release_pages)(struct page **, int, int)) { struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); @@ -93,8 +25,129 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, actor->pages = pages; actor->next_page = 0; actor->pageaddr = NULL; - actor->squashfs_first_page = direct_first_page; - actor->squashfs_next_page = direct_next_page; - actor->squashfs_finish_page = direct_finish_page; + actor->release_pages = release_pages; return actor; } + +void squashfs_page_actor_free(struct squashfs_page_actor *actor, int error) +{ + if (!actor) + return; + + if (actor->release_pages) + actor->release_pages(actor->page, actor->pages, error); + kfree(actor); +} + +void squashfs_actor_to_buf(struct squashfs_page_actor *actor, void *buf, + int length) +{ + void *pageaddr; + int pos = 0, avail, i; + + for (i = 0; i < actor->pages && pos < length; ++i) { + avail = min_t(int, length - pos, PAGE_CACHE_SIZE); + if (actor->page[i]) { + pageaddr = kmap_atomic(actor->page[i]); + memcpy(buf + pos, pageaddr, avail); + kunmap_atomic(pageaddr); + } + pos += avail; + } +} + +void squashfs_buf_to_actor(void *buf, struct squashfs_page_actor *actor, + int length) +{ + void *pageaddr; + int pos = 0, avail, i; + + for (i = 0; i < actor->pages && pos < length; ++i) { + avail = min_t(int, length - pos, PAGE_CACHE_SIZE); + if (actor->page[i]) { + pageaddr = kmap_atomic(actor->page[i]); + memcpy(pageaddr, buf + pos, avail); + 
kunmap_atomic(pageaddr); + } + pos += avail; + } +} + +void squashfs_bh_to_actor(struct buffer_head **bh, int nr_buffers, + struct squashfs_page_actor *actor, int offset, int length, int blksz) +{ + void *kaddr = NULL; + int bytes = 0, pgoff = 0, b = 0, p = 0, avail, i; + + while (bytes < length) { + if (actor->page[p]) { + kaddr = kmap_atomic(actor->page[p]); + while (pgoff < PAGE_CACHE_SIZE && bytes < length) { + avail = min_t(int, blksz - offset, + PAGE_CACHE_SIZE - pgoff); + memcpy(kaddr + pgoff, bh[b]->b_data + offset, + avail); + pgoff += avail; + bytes += avail; + offset = (offset + avail) % blksz; + if (!offset) { + put_bh(bh[b]); + ++b; + } + } + kunmap_atomic(kaddr); + pgoff = 0; + } else { + for (i = 0; i < PAGE_CACHE_SIZE / blksz; ++i) { + if (bh[b]) + put_bh(bh[b]); + ++b; + } + bytes += PAGE_CACHE_SIZE; + } + ++p; + } +} + +void squashfs_bh_to_buf(struct buffer_head **bh, int nr_buffers, void *buf, + int offset, int length, int blksz) +{ + int i, avail, bytes = 0; + + for (i = 0; i < nr_buffers && bytes < length; ++i) { + avail = min_t(int, length - bytes, blksz - offset); + if (bh[i]) { + memcpy(buf + bytes, bh[i]->b_data + offset, avail); + put_bh(bh[i]); + } + bytes += avail; + offset = 0; + } +} + +void free_page_array(struct page **page, int nr_pages) +{ + int i; + + for (i = 0; i < nr_pages; ++i) + __free_page(page[i]); + kfree(page); +} + +struct page **alloc_page_array(int nr_pages, int gfp_mask) +{ + int i; + struct page **page; + + page = kcalloc(nr_pages, sizeof(struct page *), gfp_mask); + if (!page) + return NULL; + for (i = 0; i < nr_pages; ++i) { + page[i] = alloc_page(gfp_mask); + if (!page[i]) { + free_page_array(page, i); + return NULL; + } + } + return page; +} diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index d2df0544e0df..aa1ed790b5a3 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -5,37 +5,61 @@ * Phillip Lougher * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. */ struct squashfs_page_actor { - union { - void **buffer; - struct page **page; - }; + struct page **page; void *pageaddr; - void *(*squashfs_first_page)(struct squashfs_page_actor *); - void *(*squashfs_next_page)(struct squashfs_page_actor *); - void (*squashfs_finish_page)(struct squashfs_page_actor *); int pages; int length; int next_page; + void (*release_pages)(struct page **, int, int); }; -extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int); -extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **, int, int); +extern struct squashfs_page_actor *squashfs_page_actor_init(struct page **, + int, int, void (*)(struct page **, int, int)); +extern void squashfs_page_actor_free(struct squashfs_page_actor *, int); + +extern void squashfs_actor_to_buf(struct squashfs_page_actor *, void *, int); +extern void squashfs_buf_to_actor(void *, struct squashfs_page_actor *, int); +extern void squashfs_bh_to_actor(struct buffer_head **, int, + struct squashfs_page_actor *, int, int, int); +extern void squashfs_bh_to_buf(struct buffer_head **, int, void *, int, int, + int); + +/* + * Calling code should avoid sleeping between calls to squashfs_first_page() + * and squashfs_finish_page().
+ */ static inline void *squashfs_first_page(struct squashfs_page_actor *actor) { - return actor->squashfs_first_page(actor); -} -static inline void *squashfs_next_page(struct squashfs_page_actor *actor) -{ - return actor->squashfs_next_page(actor); -} -static inline void squashfs_finish_page(struct squashfs_page_actor *actor) -{ - actor->squashfs_finish_page(actor); + actor->next_page = 1; + return actor->pageaddr = actor->page[0] ? kmap_atomic(actor->page[0]) + : NULL; } +static inline void *squashfs_next_page(struct squashfs_page_actor *actor) +{ + if (!IS_ERR_OR_NULL(actor->pageaddr)) + kunmap_atomic(actor->pageaddr); + + if (actor->next_page == actor->pages) + return actor->pageaddr = ERR_PTR(-ENODATA); + + actor->pageaddr = actor->page[actor->next_page] ? + kmap_atomic(actor->page[actor->next_page]) : NULL; + ++actor->next_page; + return actor->pageaddr; +} + +static inline void squashfs_finish_page(struct squashfs_page_actor *actor) +{ + if (!IS_ERR_OR_NULL(actor->pageaddr)) + kunmap_atomic(actor->pageaddr); +} + +extern struct page **alloc_page_array(int, int); +extern void free_page_array(struct page **, int); + #endif diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 1da565cb50c3..8a6995de0277 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -49,7 +49,7 @@ struct squashfs_cache_entry { int num_waiters; wait_queue_head_t wait_queue; struct squashfs_cache *cache; - void **data; + struct page **page; struct squashfs_page_actor *actor; }; diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index c609624e4b8a..14cd373e1897 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -55,7 +55,7 @@ static void *squashfs_xz_comp_opts(struct squashfs_sb_info *msblk, struct comp_opts *opts; int err = 0, n; - opts = kmalloc(sizeof(*opts), GFP_KERNEL); + opts = kmalloc(sizeof(*opts), GFP_ATOMIC); if (opts == NULL) { err = -ENOMEM; goto out2; @@ -136,6 +136,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, enum xz_ret xz_err; int avail, total = 0, k = 0; struct squashfs_xz *stream = strm; + void *buf = NULL; xz_dec_reset(stream->state); stream->buf.in_pos = 0; @@ -156,12 +157,20 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, if (stream->buf.out_pos == stream->buf.out_size) { stream->buf.out = squashfs_next_page(output); - if (stream->buf.out != NULL) { + if (!IS_ERR(stream->buf.out)) { stream->buf.out_pos = 0; total += PAGE_CACHE_SIZE; } } + if (!stream->buf.out) { + if (!buf) { + buf = kmalloc(PAGE_CACHE_SIZE, GFP_ATOMIC); + if (!buf) + goto out; + } + stream->buf.out = buf; + } xz_err = xz_dec_run(stream->state, &stream->buf); if (stream->buf.in_pos == stream->buf.in_size && k < b) @@ -173,11 +182,13 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, if (xz_err != XZ_STREAM_END || k < b) goto out; + kfree(buf); return total + stream->buf.out_pos; out: for (; k < b; k++) put_bh(bh[k]); + kfree(buf); return -EIO; } diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index 8727caba6882..09c892b5308e 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -66,6 +66,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, struct buffer_head **bh, int b, int offset, int length, struct squashfs_page_actor *output) { + void *buf = NULL; int zlib_err, zlib_init = 0, k = 0; z_stream *stream = strm; @@ -84,10 +85,19 @@ static int zlib_uncompress(struct 
squashfs_sb_info *msblk, void *strm, if (stream->avail_out == 0) { stream->next_out = squashfs_next_page(output); - if (stream->next_out != NULL) + if (!IS_ERR(stream->next_out)) stream->avail_out = PAGE_CACHE_SIZE; } + if (!stream->next_out) { + if (!buf) { + buf = kmalloc(PAGE_CACHE_SIZE, GFP_ATOMIC); + if (!buf) + goto out; + } + stream->next_out = buf; + } + if (!zlib_init) { zlib_err = zlib_inflateInit(stream); if (zlib_err != Z_OK) { @@ -115,11 +125,13 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, if (k < b) goto out; + kfree(buf); return stream->total_out; out: for (; k < b; k++) put_bh(bh[k]); + kfree(buf); return -EIO; } From c9994560db8e33d933a3e844b75e5ba2f40764db Mon Sep 17 00:00:00 2001 From: Adrien Schildknecht Date: Mon, 7 Nov 2016 12:37:55 -0800 Subject: [PATCH 0652/3220] Squashfs: replace buffer_head with BIO ll_rw_block() has been deprecated and BIO is now the basic container for block I/O within the kernel. Switching to BIO offers two advantages: 1/ It removes the synchronous wait for up-to-date buffers: SquashFS now deals with decompressions/copies asynchronously. Implementing an asynchronous mechanism to read data is needed to efficiently implement .readpages(). 2/ Prior to this patch, merging the read requests depended entirely on the IO scheduler. SquashFS has more information than the IO scheduler about what could be merged. Moreover, merging the reads at the FS level means that we rely less on the IO scheduler. Signed-off-by: Adrien Schildknecht Change-Id: I775d2e11f017476e1899518ab52d9d0a8a0bce28 --- fs/squashfs/block.c | 527 +++++++++++++++++++++++++++----------- fs/squashfs/file_direct.c | 204 +++++---------- fs/squashfs/squashfs.h | 6 + fs/squashfs/super.c | 7 + 4 files changed, 464 insertions(+), 280 deletions(-) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 0cea9b9236d0..8a75d812ffdf 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -28,9 +28,12 @@ #include #include +#include #include #include +#include #include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -38,45 +41,355 @@ #include "decompressor.h" #include "page_actor.h" -/* - * Read the metadata block length, this is stored in the first two - * bytes of the metadata block. - */ -static struct buffer_head *get_block_length(struct super_block *sb, - u64 *cur_index, int *offset, int *length) +static struct workqueue_struct *squashfs_read_wq; + +struct squashfs_read_request { + struct super_block *sb; + u64 index; + int length; + int compressed; + int offset; + u64 read_end; + struct squashfs_page_actor *output; + enum { + SQUASHFS_COPY, + SQUASHFS_DECOMPRESS, + SQUASHFS_METADATA, + } data_processing; + bool synchronous; + + /* + * If the read is synchronous, it is possible to retrieve information + * about the request by setting these pointers.
+ */ + int *res; + int *bytes_read; + int *bytes_uncompressed; + + int nr_buffers; + struct buffer_head **bh; + struct work_struct offload; +}; + +struct squashfs_bio_request { + struct buffer_head **bh; + int nr_buffers; +}; + +static int squashfs_bio_submit(struct squashfs_read_request *req); + +int squashfs_init_read_wq(void) { - struct squashfs_sb_info *msblk = sb->s_fs_info; - struct buffer_head *bh; + squashfs_read_wq = create_workqueue("SquashFS read wq"); + return !!squashfs_read_wq; +} - bh = sb_bread(sb, *cur_index); - if (bh == NULL) - return NULL; +void squashfs_destroy_read_wq(void) +{ + flush_workqueue(squashfs_read_wq); + destroy_workqueue(squashfs_read_wq); +} - if (msblk->devblksize - *offset == 1) { - *length = (unsigned char) bh->b_data[*offset]; - put_bh(bh); - bh = sb_bread(sb, ++(*cur_index)); - if (bh == NULL) - return NULL; - *length |= (unsigned char) bh->b_data[0] << 8; - *offset = 1; - } else { - *length = (unsigned char) bh->b_data[*offset] | - (unsigned char) bh->b_data[*offset + 1] << 8; - *offset += 2; +static void free_read_request(struct squashfs_read_request *req, int error) +{ + if (!req->synchronous) + squashfs_page_actor_free(req->output, error); + if (req->res) + *(req->res) = error; + kfree(req->bh); + kfree(req); +} - if (*offset == msblk->devblksize) { - put_bh(bh); - bh = sb_bread(sb, ++(*cur_index)); - if (bh == NULL) - return NULL; - *offset = 0; +static void squashfs_process_blocks(struct squashfs_read_request *req) +{ + int error = 0; + int bytes, i, length; + struct squashfs_sb_info *msblk = req->sb->s_fs_info; + struct squashfs_page_actor *actor = req->output; + struct buffer_head **bh = req->bh; + int nr_buffers = req->nr_buffers; + + for (i = 0; i < nr_buffers; ++i) { + if (!bh[i]) + continue; + wait_on_buffer(bh[i]); + if (!buffer_uptodate(bh[i])) + error = -EIO; + } + if (error) + goto cleanup; + + if (req->data_processing == SQUASHFS_METADATA) { + /* Extract the length of the metadata block */ + if (req->offset != msblk->devblksize - 1) + length = *((u16 *)(bh[0]->b_data + req->offset)); + else { + length = bh[0]->b_data[req->offset]; + length |= bh[1]->b_data[0] << 8; + } + req->compressed = SQUASHFS_COMPRESSED(length); + req->data_processing = req->compressed ? 
SQUASHFS_DECOMPRESS + : SQUASHFS_COPY; + length = SQUASHFS_COMPRESSED_SIZE(length); + if (req->index + length + 2 > req->read_end) { + for (i = 0; i < nr_buffers; ++i) + put_bh(bh[i]); + kfree(bh); + req->length = length; + req->index += 2; + squashfs_bio_submit(req); + return; + } + req->length = length; + req->offset = (req->offset + 2) % PAGE_SIZE; + if (req->offset < 2) { + put_bh(bh[0]); + ++bh; + --nr_buffers; + } + } + if (req->bytes_read) + *(req->bytes_read) = req->length; + + if (req->data_processing == SQUASHFS_COPY) { + squashfs_bh_to_actor(bh, nr_buffers, req->output, req->offset, + req->length, msblk->devblksize); + } else if (req->data_processing == SQUASHFS_DECOMPRESS) { + req->length = squashfs_decompress(msblk, bh, nr_buffers, + req->offset, req->length, actor); + if (req->length < 0) { + error = -EIO; + goto cleanup; } } - return bh; + /* Last page may have trailing bytes not filled */ + bytes = req->length % PAGE_SIZE; + if (bytes && actor->page[actor->pages - 1]) + zero_user_segment(actor->page[actor->pages - 1], bytes, + PAGE_SIZE); + +cleanup: + if (req->bytes_uncompressed) + *(req->bytes_uncompressed) = req->length; + if (error) { + for (i = 0; i < nr_buffers; ++i) + if (bh[i]) + put_bh(bh[i]); + } + free_read_request(req, error); } +static void read_wq_handler(struct work_struct *work) +{ + squashfs_process_blocks(container_of(work, + struct squashfs_read_request, offload)); +} + +static void squashfs_bio_end_io(struct bio *bio) +{ + int i; + int error = bio->bi_error; + struct squashfs_bio_request *bio_req = bio->bi_private; + + bio_put(bio); + + for (i = 0; i < bio_req->nr_buffers; ++i) { + if (!bio_req->bh[i]) + continue; + if (!error) + set_buffer_uptodate(bio_req->bh[i]); + else + clear_buffer_uptodate(bio_req->bh[i]); + unlock_buffer(bio_req->bh[i]); + } + kfree(bio_req); +} + +static int actor_getblks(struct squashfs_read_request *req, u64 block) +{ + int i; + + req->bh = kmalloc_array(req->nr_buffers, sizeof(*(req->bh)), GFP_NOIO); + if (!req->bh) + return -ENOMEM; + + for (i = 0; i < req->nr_buffers; ++i) { + req->bh[i] = sb_getblk(req->sb, block + i); + if (!req->bh[i]) { + while (--i) { + if (req->bh[i]) + put_bh(req->bh[i]); + } + return -1; + } + } + return 0; +} + +static int squashfs_bio_submit(struct squashfs_read_request *req) +{ + struct bio *bio = NULL; + struct buffer_head *bh; + struct squashfs_bio_request *bio_req = NULL; + int b = 0, prev_block = 0; + struct squashfs_sb_info *msblk = req->sb->s_fs_info; + + u64 read_start = round_down(req->index, msblk->devblksize); + u64 read_end = round_up(req->index + req->length, msblk->devblksize); + sector_t block = read_start >> msblk->devblksize_log2; + sector_t block_end = read_end >> msblk->devblksize_log2; + int offset = read_start - round_down(req->index, PAGE_SIZE); + int nr_buffers = block_end - block; + int blksz = msblk->devblksize; + int bio_max_pages = nr_buffers > BIO_MAX_PAGES ? 
BIO_MAX_PAGES + : nr_buffers; + + /* Setup the request */ + req->read_end = read_end; + req->offset = req->index - read_start; + req->nr_buffers = nr_buffers; + if (actor_getblks(req, block) < 0) + goto getblk_failed; + + /* Create and submit the BIOs */ + for (b = 0; b < nr_buffers; ++b, offset += blksz) { + bh = req->bh[b]; + if (!bh || !trylock_buffer(bh)) + continue; + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + continue; + } + offset %= PAGE_SIZE; + + /* Append the buffer to the current BIO if it is contiguous */ + if (bio && bio_req && prev_block + 1 == b) { + if (bio_add_page(bio, bh->b_page, blksz, offset)) { + bio_req->nr_buffers += 1; + prev_block = b; + continue; + } + } + + /* Otherwise, submit the current BIO and create a new one */ + if (bio) + submit_bio(READ, bio); + bio_req = kcalloc(1, sizeof(struct squashfs_bio_request), + GFP_NOIO); + if (!bio_req) + goto req_alloc_failed; + bio_req->bh = &req->bh[b]; + bio = bio_alloc(GFP_NOIO, bio_max_pages); + if (!bio) + goto bio_alloc_failed; + bio->bi_bdev = req->sb->s_bdev; + bio->bi_iter.bi_sector = (block + b) + << (msblk->devblksize_log2 - 9); + bio->bi_private = bio_req; + bio->bi_end_io = squashfs_bio_end_io; + + bio_add_page(bio, bh->b_page, blksz, offset); + bio_req->nr_buffers += 1; + prev_block = b; + } + if (bio) + submit_bio(READ, bio); + + if (req->synchronous) + squashfs_process_blocks(req); + else { + INIT_WORK(&req->offload, read_wq_handler); + schedule_work(&req->offload); + } + return 0; + +bio_alloc_failed: + kfree(bio_req); +req_alloc_failed: + unlock_buffer(bh); + while (--nr_buffers >= b) + if (req->bh[nr_buffers]) + put_bh(req->bh[nr_buffers]); + while (--b >= 0) + if (req->bh[b]) + wait_on_buffer(req->bh[b]); +getblk_failed: + free_read_request(req, -ENOMEM); + return -ENOMEM; +} + +static int read_metadata_block(struct squashfs_read_request *req, + u64 *next_index) +{ + int ret, error, bytes_read = 0, bytes_uncompressed = 0; + struct squashfs_sb_info *msblk = req->sb->s_fs_info; + + if (req->index + 2 > msblk->bytes_used) { + free_read_request(req, -EINVAL); + return -EINVAL; + } + req->length = 2; + + /* Do not read beyond the end of the device */ + if (req->index + req->length > msblk->bytes_used) + req->length = msblk->bytes_used - req->index; + req->data_processing = SQUASHFS_METADATA; + + /* + * Reading metadata is always synchronous because we don't know the + * length in advance and the function is expected to update + * 'next_index' and return the length. + */ + req->synchronous = true; + req->res = &error; + req->bytes_read = &bytes_read; + req->bytes_uncompressed = &bytes_uncompressed; + + TRACE("Metadata block @ 0x%llx, %scompressed size %d, src size %d\n", + req->index, req->compressed ? "" : "un", bytes_read, + req->output->length); + + ret = squashfs_bio_submit(req); + if (ret) + return ret; + if (error) + return error; + if (next_index) + *next_index += 2 + bytes_read; + return bytes_uncompressed; +} + +static int read_data_block(struct squashfs_read_request *req, int length, + u64 *next_index, bool synchronous) +{ + int ret, error = 0, bytes_uncompressed = 0, bytes_read = 0; + + req->compressed = SQUASHFS_COMPRESSED_BLOCK(length); + req->length = length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); + req->data_processing = req->compressed ? 
SQUASHFS_DECOMPRESS + : SQUASHFS_COPY; + + req->synchronous = synchronous; + if (synchronous) { + req->res = &error; + req->bytes_read = &bytes_read; + req->bytes_uncompressed = &bytes_uncompressed; + } + + TRACE("Data block @ 0x%llx, %scompressed size %d, src size %d\n", + req->index, req->compressed ? "" : "un", req->length, + req->output->length); + + ret = squashfs_bio_submit(req); + if (ret) + return ret; + if (synchronous) + ret = error ? error : bytes_uncompressed; + if (next_index) + *next_index += length; + return ret; +} /* * Read and decompress a metadata block or datablock. Length is non-zero @@ -87,128 +400,50 @@ static struct buffer_head *get_block_length(struct super_block *sb, * generated a larger block - this does occasionally happen with compression * algorithms). */ -int squashfs_read_data(struct super_block *sb, u64 index, int length, - u64 *next_index, struct squashfs_page_actor *output) +static int __squashfs_read_data(struct super_block *sb, u64 index, int length, + u64 *next_index, struct squashfs_page_actor *output, bool sync) { - struct squashfs_sb_info *msblk = sb->s_fs_info; - struct buffer_head **bh; - int offset = index & ((1 << msblk->devblksize_log2) - 1); - u64 cur_index = index >> msblk->devblksize_log2; - int bytes, compressed, b = 0, k = 0, avail, i; + struct squashfs_read_request *req; - bh = kcalloc(((output->length + msblk->devblksize - 1) - >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); - if (bh == NULL) + req = kcalloc(1, sizeof(struct squashfs_read_request), GFP_KERNEL); + if (!req) { + if (!sync) + squashfs_page_actor_free(output, -ENOMEM); return -ENOMEM; - - if (length) { - /* - * Datablock. - */ - bytes = -offset; - compressed = SQUASHFS_COMPRESSED_BLOCK(length); - length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); - if (next_index) - *next_index = index + length; - - TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", - index, compressed ? "" : "un", length, output->length); - - if (length < 0 || length > output->length || - (index + length) > msblk->bytes_used) - goto read_failure; - - for (b = 0; bytes < length; b++, cur_index++) { - bh[b] = sb_getblk(sb, cur_index); - if (bh[b] == NULL) - goto block_release; - bytes += msblk->devblksize; - } - ll_rw_block(READ, b, bh); - } else { - /* - * Metadata block. - */ - if ((index + 2) > msblk->bytes_used) - goto read_failure; - - bh[0] = get_block_length(sb, &cur_index, &offset, &length); - if (bh[0] == NULL) - goto read_failure; - b = 1; - - bytes = msblk->devblksize - offset; - compressed = SQUASHFS_COMPRESSED(length); - length = SQUASHFS_COMPRESSED_SIZE(length); - if (next_index) - *next_index = index + length + 2; - - TRACE("Block @ 0x%llx, %scompressed size %d\n", index, - compressed ? 
"" : "un", length); - - if (length < 0 || length > output->length || - (index + length) > msblk->bytes_used) - goto block_release; - - for (; bytes < length; b++) { - bh[b] = sb_getblk(sb, ++cur_index); - if (bh[b] == NULL) - goto block_release; - bytes += msblk->devblksize; - } - ll_rw_block(READ, b - 1, bh + 1); } - for (i = 0; i < b; i++) { - wait_on_buffer(bh[i]); - if (!buffer_uptodate(bh[i])) - goto block_release; + req->sb = sb; + req->index = index; + req->output = output; + + if (next_index) + *next_index = index; + + if (length) + length = read_data_block(req, length, next_index, sync); + else + length = read_metadata_block(req, next_index); + + if (length < 0) { + ERROR("squashfs_read_data failed to read block 0x%llx\n", + (unsigned long long)index); + return -EIO; } - if (compressed) { - length = squashfs_decompress(msblk, bh, b, offset, length, - output); - if (length < 0) - goto read_failure; - } else { - /* - * Block is uncompressed. - */ - int in, pg_offset = 0; - void *data = squashfs_first_page(output); - - for (bytes = length; k < b; k++) { - in = min(bytes, msblk->devblksize - offset); - bytes -= in; - while (in) { - if (pg_offset == PAGE_CACHE_SIZE) { - data = squashfs_next_page(output); - pg_offset = 0; - } - avail = min_t(int, in, PAGE_CACHE_SIZE - - pg_offset); - memcpy(data + pg_offset, bh[k]->b_data + offset, - avail); - in -= avail; - pg_offset += avail; - offset += avail; - } - offset = 0; - put_bh(bh[k]); - } - squashfs_finish_page(output); - } - - kfree(bh); return length; - -block_release: - for (; k < b; k++) - put_bh(bh[k]); - -read_failure: - ERROR("squashfs_read_data failed to read block 0x%llx\n", - (unsigned long long) index); - kfree(bh); - return -EIO; +} + +int squashfs_read_data(struct super_block *sb, u64 index, int length, + u64 *next_index, struct squashfs_page_actor *output) +{ + return __squashfs_read_data(sb, index, length, next_index, output, + true); +} + +int squashfs_read_data_async(struct super_block *sb, u64 index, int length, + u64 *next_index, struct squashfs_page_actor *output) +{ + + return __squashfs_read_data(sb, index, length, next_index, output, + false); } diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 9d033ec0f133..10fe1272535f 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -20,8 +20,68 @@ #include "squashfs.h" #include "page_actor.h" -static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, - int pages, struct page **page); +static void release_actor_pages(struct page **page, int pages, int error) +{ + int i; + + for (i = 0; i < pages; i++) { + if (!page[i]) + continue; + flush_dcache_page(page[i]); + if (!error) + SetPageUptodate(page[i]); + else { + SetPageError(page[i]); + zero_user_segment(page[i], 0, PAGE_CACHE_SIZE); + } + unlock_page(page[i]); + put_page(page[i]); + } + kfree(page); +} + +/* + * Create a "page actor" which will kmap and kunmap the + * page cache pages appropriately within the decompressor + */ +static struct squashfs_page_actor *actor_from_page_cache( + struct page *target_page, int start_index, int nr_pages) +{ + int i, n; + struct page **page; + struct squashfs_page_actor *actor; + + page = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); + if (!page) + return NULL; + + /* Try to grab all the pages covered by the SquashFS block */ + for (i = 0, n = start_index; i < nr_pages; i++, n++) { + if (target_page->index == n) { + page[i] = target_page; + } else { + page[i] = grab_cache_page_nowait(target_page->mapping, + n); + if 
(page[i] == NULL) + continue; + } + + if (PageUptodate(page[i])) { + unlock_page(page[i]); + put_page(page[i]); + page[i] = NULL; + } + } + + actor = squashfs_page_actor_init(page, nr_pages, 0, + release_actor_pages); + if (!actor) { + release_actor_pages(page, nr_pages, -ENOMEM); + kfree(page); + return NULL; + } + return actor; +} /* Read separately compressed datablock directly into page cache */ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) @@ -34,143 +94,19 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; int start_index = target_page->index & ~mask; int end_index = start_index | mask; - int i, n, pages, missing_pages, bytes, res = -ENOMEM; - struct page **page; + int pages, res = -ENOMEM; struct squashfs_page_actor *actor; - void *pageaddr; if (end_index > file_end) end_index = file_end; - pages = end_index - start_index + 1; - page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL); - if (page == NULL) - return res; + actor = actor_from_page_cache(target_page, start_index, pages); + if (!actor) + return -ENOMEM; - /* - * Create a "page actor" which will kmap and kunmap the - * page cache pages appropriately within the decompressor - */ - actor = squashfs_page_actor_init(page, pages, 0, NULL); - if (actor == NULL) - goto out; - - /* Try to grab all the pages covered by the Squashfs block */ - for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) { - page[i] = (n == target_page->index) ? target_page : - grab_cache_page_nowait(target_page->mapping, n); - - if (page[i] == NULL) { - missing_pages++; - continue; - } - - if (PageUptodate(page[i])) { - unlock_page(page[i]); - page_cache_release(page[i]); - page[i] = NULL; - missing_pages++; - } - } - - if (missing_pages) { - /* - * Couldn't get one or more pages, this page has either - * been VM reclaimed, but others are still in the page cache - * and uptodate, or we're racing with another thread in - * squashfs_readpage also trying to grab them. Fall back to - * using an intermediate buffer. - */ - res = squashfs_read_cache(target_page, block, bsize, pages, - page); - if (res < 0) - goto mark_errored; - - goto out; - } - - /* Decompress directly into the page cache buffers */ - res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); - if (res < 0) - goto mark_errored; - - /* Last page may have trailing bytes not filled */ - bytes = res % PAGE_CACHE_SIZE; - if (bytes) { - pageaddr = kmap_atomic(page[pages - 1]); - memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes); - kunmap_atomic(pageaddr); - } - - /* Mark pages as uptodate, unlock and release */ - for (i = 0; i < pages; i++) { - flush_dcache_page(page[i]); - SetPageUptodate(page[i]); - unlock_page(page[i]); - if (page[i] != target_page) - page_cache_release(page[i]); - } - - kfree(actor); - kfree(page); - - return 0; - -mark_errored: - /* Decompression failed, mark pages as errored. 
Target_page is - * dealt with by the caller - */ - for (i = 0; i < pages; i++) { - if (page[i] == NULL || page[i] == target_page) - continue; - flush_dcache_page(page[i]); - SetPageError(page[i]); - unlock_page(page[i]); - page_cache_release(page[i]); - } - -out: - squashfs_page_actor_free(actor, 0); - kfree(page); - return res; -} - - -static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, - int pages, struct page **page) -{ - struct inode *i = target_page->mapping->host; - struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, - block, bsize); - int bytes = buffer->length, res = buffer->error, n, offset = 0; - void *pageaddr; - - if (res) { - ERROR("Unable to read page, block %llx, size %x\n", block, - bsize); - goto out; - } - - for (n = 0; n < pages && bytes > 0; n++, - bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { - int avail = min_t(int, bytes, PAGE_CACHE_SIZE); - - if (page[n] == NULL) - continue; - - pageaddr = kmap_atomic(page[n]); - squashfs_copy_data(pageaddr, buffer, offset, avail); - memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail); - kunmap_atomic(pageaddr); - flush_dcache_page(page[n]); - SetPageUptodate(page[n]); - unlock_page(page[n]); - if (page[n] != target_page) - page_cache_release(page[n]); - } - -out: - squashfs_cache_put(buffer); - return res; + get_page(target_page); + res = squashfs_read_data_async(inode->i_sb, block, bsize, NULL, + actor); + return res < 0 ? res : 0; } diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 887d6d270080..985317fbf6c2 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -28,8 +28,14 @@ #define WARNING(s, args...) pr_warn("SQUASHFS: "s, ## args) /* block.c */ +extern int squashfs_init_read_wq(void); +extern void squashfs_destroy_read_wq(void); extern int squashfs_read_data(struct super_block *, u64, int, u64 *, struct squashfs_page_actor *); +extern int squashfs_read_data(struct super_block *, u64, int, u64 *, + struct squashfs_page_actor *); +extern int squashfs_read_data_async(struct super_block *, u64, int, u64 *, + struct squashfs_page_actor *); /* cache.c */ extern struct squashfs_cache *squashfs_cache_init(char *, int, int); diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 5056babe00df..61cd0b39ed0e 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -444,9 +444,15 @@ static int __init init_squashfs_fs(void) if (err) return err; + if (!squashfs_init_read_wq()) { + destroy_inodecache(); + return -ENOMEM; + } + err = register_filesystem(&squashfs_fs_type); if (err) { destroy_inodecache(); + squashfs_destroy_read_wq(); return err; } @@ -460,6 +466,7 @@ static void __exit exit_squashfs_fs(void) { unregister_filesystem(&squashfs_fs_type); destroy_inodecache(); + squashfs_destroy_read_wq(); } From 5e9c466d6ec02a6d43ab7ff595c90b10bd0eada1 Mon Sep 17 00:00:00 2001 From: Adrien Schildknecht Date: Mon, 7 Nov 2016 12:41:42 -0800 Subject: [PATCH 0653/3220] Squashfs: implement .readpages() Squashfs does not implement .readpages(), so the kernel just repeatedly calls .readpage(). The readpages function tries to pack as much pages as possible in the same page actor so that only 1 read request is issued. Now that the read requests are asynchronous, the kernel can truly prefetch pages using its readahead algorithm. 
Signed-off-by: Adrien Schildknecht Change-Id: Ice70e029dc24526f61e4e5a1a902588be2212498 --- fs/squashfs/file.c | 138 ++++++++++++++++++++++++++++---------- fs/squashfs/file_direct.c | 64 ++++++++++++------ fs/squashfs/squashfs.h | 5 +- 3 files changed, 151 insertions(+), 56 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index e5c9689062ba..6f5ef8d7e55a 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -47,12 +47,16 @@ #include #include #include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +// Backported from 4.5 +#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) + /* * Locate cache slot in range [offset, index] for specified inode. If * there's more than one return the slot closest to index. @@ -438,6 +442,21 @@ static int squashfs_readpage_fragment(struct page *page) return res; } +static int squashfs_readpages_fragment(struct page *page, + struct list_head *readahead_pages, struct address_space *mapping) +{ + if (!page) { + page = lru_to_page(readahead_pages); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, page->index, + mapping_gfp_constraint(mapping, GFP_KERNEL))) { + put_page(page); + return 0; + } + } + return squashfs_readpage_fragment(page); +} + static int squashfs_readpage_sparse(struct page *page, int index, int file_end) { struct inode *inode = page->mapping->host; @@ -450,54 +469,105 @@ static int squashfs_readpage_sparse(struct page *page, int index, int file_end) return 0; } -static int squashfs_readpage(struct file *file, struct page *page) +static int squashfs_readpages_sparse(struct page *page, + struct list_head *readahead_pages, int index, int file_end, + struct address_space *mapping) { - struct inode *inode = page->mapping->host; + if (!page) { + page = lru_to_page(readahead_pages); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, page->index, + mapping_gfp_constraint(mapping, GFP_KERNEL))) { + put_page(page); + return 0; + } + } + return squashfs_readpage_sparse(page, index, file_end); +} + +static int __squashfs_readpages(struct file *file, struct page *page, + struct list_head *readahead_pages, unsigned int nr_pages, + struct address_space *mapping) +{ + struct inode *inode = mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT); int file_end = i_size_read(inode) >> msblk->block_log; int res; - void *pageaddr; + + do { + struct page *cur_page = page ? 
page + : lru_to_page(readahead_pages); + int page_index = cur_page->index; + int index = page_index >> (msblk->block_log - PAGE_CACHE_SHIFT); + + if (page_index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) + return 1; + + if (index < file_end || squashfs_i(inode)->fragment_block == + SQUASHFS_INVALID_BLK) { + u64 block = 0; + int bsize = read_blocklist(inode, index, &block); + + if (bsize < 0) + return -1; + + if (bsize == 0) { + res = squashfs_readpages_sparse(page, + readahead_pages, index, file_end, + mapping); + } else { + res = squashfs_readpages_block(page, + readahead_pages, &nr_pages, mapping, + page_index, block, bsize); + } + } else { + res = squashfs_readpages_fragment(page, + readahead_pages, mapping); + } + if (res) + return 0; + page = NULL; + } while (readahead_pages && !list_empty(readahead_pages)); + + return 0; +} + +static int squashfs_readpage(struct file *file, struct page *page) +{ + int ret; TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", - page->index, squashfs_i(inode)->start); + page->index, squashfs_i(page->mapping->host)->start); - if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT)) - goto out; + get_page(page); - if (index < file_end || squashfs_i(inode)->fragment_block == - SQUASHFS_INVALID_BLK) { - u64 block = 0; - int bsize = read_blocklist(inode, index, &block); - if (bsize < 0) - goto error_out; - - if (bsize == 0) - res = squashfs_readpage_sparse(page, index, file_end); + ret = __squashfs_readpages(file, page, NULL, 1, page->mapping); + if (ret) { + flush_dcache_page(page); + if (ret < 0) + SetPageError(page); else - res = squashfs_readpage_block(page, block, bsize); - } else - res = squashfs_readpage_fragment(page); + SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + unlock_page(page); + put_page(page); + } - if (!res) - return 0; - -error_out: - SetPageError(page); -out: - pageaddr = kmap_atomic(page); - memset(pageaddr, 0, PAGE_CACHE_SIZE); - kunmap_atomic(pageaddr); - flush_dcache_page(page); - if (!PageError(page)) - SetPageUptodate(page); - unlock_page(page); + return 0; +} +static int squashfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned int nr_pages) +{ + TRACE("Entered squashfs_readpages, %u pages, first page index %lx\n", + nr_pages, lru_to_page(pages)->index); + __squashfs_readpages(file, NULL, pages, nr_pages, mapping); return 0; } const struct address_space_operations squashfs_aops = { - .readpage = squashfs_readpage + .readpage = squashfs_readpage, + .readpages = squashfs_readpages, }; diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 10fe1272535f..3fb4ce210edb 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -20,6 +21,9 @@ #include "squashfs.h" #include "page_actor.h" +// Backported from 4.5 +#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) + static void release_actor_pages(struct page **page, int pages, int error) { int i; @@ -45,23 +49,40 @@ static void release_actor_pages(struct page **page, int pages, int error) * page cache pages appropriately within the decompressor */ static struct squashfs_page_actor *actor_from_page_cache( - struct page *target_page, int start_index, int nr_pages) + unsigned int actor_pages, struct page *target_page, + struct list_head *rpages, unsigned int *nr_pages, int start_index, + 
struct address_space *mapping) { - int i, n; struct page **page; struct squashfs_page_actor *actor; + int i, n; + gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); - page = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); + page = kmalloc_array(actor_pages, sizeof(void *), GFP_KERNEL); if (!page) return NULL; - /* Try to grab all the pages covered by the SquashFS block */ - for (i = 0, n = start_index; i < nr_pages; i++, n++) { - if (target_page->index == n) { + for (i = 0, n = start_index; i < actor_pages; i++, n++) { + if (target_page == NULL && rpages && !list_empty(rpages)) { + struct page *cur_page = lru_to_page(rpages); + + if (cur_page->index < start_index + actor_pages) { + list_del(&cur_page->lru); + --(*nr_pages); + if (add_to_page_cache_lru(cur_page, mapping, + cur_page->index, gfp)) + put_page(cur_page); + else + target_page = cur_page; + } else + rpages = NULL; + } + + if (target_page && target_page->index == n) { page[i] = target_page; + target_page = NULL; } else { - page[i] = grab_cache_page_nowait(target_page->mapping, - n); + page[i] = grab_cache_page_nowait(mapping, n); if (page[i] == NULL) continue; } @@ -73,39 +94,42 @@ static struct squashfs_page_actor *actor_from_page_cache( } } - actor = squashfs_page_actor_init(page, nr_pages, 0, + actor = squashfs_page_actor_init(page, actor_pages, 0, release_actor_pages); if (!actor) { - release_actor_pages(page, nr_pages, -ENOMEM); + release_actor_pages(page, actor_pages, -ENOMEM); kfree(page); return NULL; } return actor; } -/* Read separately compressed datablock directly into page cache */ -int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) +int squashfs_readpages_block(struct page *target_page, + struct list_head *readahead_pages, + unsigned int *nr_pages, + struct address_space *mapping, + int page_index, u64 block, int bsize) { - struct inode *inode = target_page->mapping->host; + struct squashfs_page_actor *actor; + struct inode *inode = mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; - int start_index = target_page->index & ~mask; + int start_index = page_index & ~mask; int end_index = start_index | mask; - int pages, res = -ENOMEM; - struct squashfs_page_actor *actor; + int actor_pages, res; if (end_index > file_end) end_index = file_end; - pages = end_index - start_index + 1; + actor_pages = end_index - start_index + 1; - actor = actor_from_page_cache(target_page, start_index, pages); + actor = actor_from_page_cache(actor_pages, target_page, + readahead_pages, nr_pages, start_index, + mapping); if (!actor) return -ENOMEM; - get_page(target_page); res = squashfs_read_data_async(inode->i_sb, block, bsize, NULL, actor); return res < 0 ? 
res : 0;

diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 985317fbf6c2..6093579c6c5d 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -76,8 +76,9 @@ extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
 void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int,
 				int);
 
-/* file_xxx.c */
-extern int squashfs_readpage_block(struct page *, u64, int);
+/* file_direct.c */
+extern int squashfs_readpages_block(struct page *, struct list_head *,
+		unsigned int *, struct address_space *, int, u64, int);
 
 /* id.c */
 extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);

From d9aa8ddc51cbd7d3bc68d664f9b71548a04447e9 Mon Sep 17 00:00:00 2001
From: Adrien Schildknecht
Date: Thu, 29 Sep 2016 15:25:30 -0700
Subject: [PATCH 0654/3220] Squashfs: optimize reading uncompressed data

When dealing with uncompressed data, there is no need to read a whole
block (default 128K) to get the desired page: the pages are independent
of each other.

This patch changes the readpages logic so that reading uncompressed data
only reads the number of pages advised by the readahead algorithm.

Moreover, if the page actor contains holes (i.e. pages that are already
up-to-date), squashfs skips the buffer_heads associated with those pages.

This patch greatly improves the performance of random reads for
uncompressed files because squashfs only reads what is needed. It also
reduces the number of unnecessary reads.

Signed-off-by: Adrien Schildknecht
Change-Id: I1850150fbf4b45c9dd138d88409fea1ab44054c0
---
 fs/squashfs/block.c | 25 +++++++++++++++++++++
 fs/squashfs/file_direct.c | 37 ++++++++++++++++++++++++++++-------
 2 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 8a75d812ffdf..2eb66decc5ab 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -206,6 +206,22 @@ static void squashfs_bio_end_io(struct bio *bio)
 	kfree(bio_req);
 }
 
+static int bh_is_optional(struct squashfs_read_request *req, int idx)
+{
+	int start_idx, end_idx;
+	struct squashfs_sb_info *msblk = req->sb->s_fs_info;
+
+	start_idx = (idx * msblk->devblksize - req->offset) / PAGE_CACHE_SIZE;
+	end_idx = ((idx + 1) * msblk->devblksize - req->offset + 1) / PAGE_CACHE_SIZE;
+	if (start_idx >= req->output->pages)
+		return 1;
+	if (start_idx < 0)
+		start_idx = end_idx;
+	if (end_idx >= req->output->pages)
+		end_idx = start_idx;
+	return !req->output->page[start_idx] && !req->output->page[end_idx];
+}
+
 static int actor_getblks(struct squashfs_read_request *req, u64 block)
 {
 	int i;
@@ -215,6 +231,15 @@ static int actor_getblks(struct squashfs_read_request *req, u64 block)
 		return -ENOMEM;
 
 	for (i = 0; i < req->nr_buffers; ++i) {
+		/*
+		 * When dealing with an uncompressed block, the actor may
+		 * contain NULL pages. There's no need to read the buffers
+		 * associated with these pages.
+		 */
+		if (!req->compressed && bh_is_optional(req, i)) {
+			req->bh[i] = NULL;
+			continue;
+		}
 		req->bh[i] = sb_getblk(req->sb, block + i);
 		if (!req->bh[i]) {
 			while (--i) {
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 3fb4ce210edb..c97af4c6ccd0 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -114,15 +114,38 @@ int squashfs_readpages_block(struct page *target_page,
 	struct squashfs_page_actor *actor;
 	struct inode *inode = mapping->host;
 	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
-	int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+	int start_index, end_index, file_end, actor_pages, res;
 	int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
-	int start_index = page_index & ~mask;
-	int end_index = start_index | mask;
-	int actor_pages, res;
 
-	if (end_index > file_end)
-		end_index = file_end;
-	actor_pages = end_index - start_index + 1;
+	/*
+	 * If readpage() is called on an uncompressed datablock, we can just
+	 * read the pages instead of fetching the whole block.
+	 * This greatly improves the performance when a process keeps doing
+	 * random reads because we only fetch the necessary data.
+	 * The readahead algorithm will take care of doing speculative reads
+	 * if necessary.
+	 * We can't read more than 1 block even if readahead provides us more
+	 * pages because we don't know yet if the next block is compressed or
+	 * not.
+	 */
+	if (bsize && !SQUASHFS_COMPRESSED_BLOCK(bsize)) {
+		u64 block_end = block + msblk->block_size;
+
+		block += (page_index & mask) * PAGE_CACHE_SIZE;
+		actor_pages = (block_end - block) / PAGE_CACHE_SIZE;
+		if (*nr_pages < actor_pages)
+			actor_pages = *nr_pages;
+		start_index = page_index;
+		bsize = min_t(int, bsize, (PAGE_CACHE_SIZE * actor_pages)
+					  | SQUASHFS_COMPRESSED_BIT_BLOCK);
+	} else {
+		file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+		start_index = page_index & ~mask;
+		end_index = start_index | mask;
+		if (end_index > file_end)
+			end_index = file_end;
+		actor_pages = end_index - start_index + 1;
+	}
 
 	actor = actor_from_page_cache(actor_pages, target_page,
 				      readahead_pages, nr_pages, start_index,

From 11fac3aed57f1ef25755fbdb167c31cc322e2b09 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Thu, 9 Feb 2017 19:38:57 -0800
Subject: [PATCH 0655/3220] ANDROID: export security_path_chown

BUG: 35142419
Change-Id: I05a9430a3c1bc624e019055175ad377290b4e774
Signed-off-by: Daniel Rosenberg
---
 security/security.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/security/security.c b/security/security.c
index 46f405ce6b0f..ae05ab153c5a 100644
--- a/security/security.c
+++ b/security/security.c
@@ -498,6 +498,7 @@ int security_path_chown(struct path *path, kuid_t uid, kgid_t gid)
 		return 0;
 	return call_int_hook(path_chown, 0, path, uid, gid);
 }
+EXPORT_SYMBOL(security_path_chown);
 
 int security_path_chroot(struct path *path)
 {

From d854b688907b34fcab97fc3b58000084255ee53a Mon Sep 17 00:00:00 2001
From: Mohan Srinivasan
Date: Fri, 3 Feb 2017 15:48:03 -0800
Subject: [PATCH 0656/3220] ANDROID: Refactor fs readpage/write tracepoints.

Refactor the fs readpage/write tracepoints to move the inode->path
lookup outside the tracepoint code, and pass a pointer to the path
into the tracepoint code instead. This is necessary because the
tracepoint code runs in non-preemptible context.

Thanks to Trilok Soni for catching this in 4.4.
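The shape of the change, shown once here for the read side (the hunks
below apply the same transformation at every call site):

	if (trace_android_fs_dataread_start_enabled()) {
		char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];

		/* Resolve the pathname here, in preemptible context,
		 * and hand the tracepoint a ready-made string.
		 */
		path = android_fstrace_get_pathname(pathbuf,
						    MAX_TRACE_PATHBUF_LEN,
						    inode);
		trace_android_fs_dataread_start(inode, offset, count,
						current->pid, path,
						current->comm);
	}

The trace_*_enabled() guard also means the dentry lookup is only paid
for when the tracepoint is actually active.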
Change-Id: I7486c5947918d155a30c61d6b9cd5027cf8fbe15 Signed-off-by: Mohan Srinivasan --- fs/ext4/inline.c | 12 ++++- fs/ext4/inode.c | 53 ++++++++++++++++------ fs/ext4/readpage.c | 6 +++ fs/f2fs/data.c | 40 ++++++++++++---- fs/f2fs/inline.c | 13 ++++-- fs/mpage.c | 6 +++ include/trace/events/android_fs.h | 44 ++++++++++++++++-- include/trace/events/android_fs_template.h | 34 +++----------- 8 files changed, 147 insertions(+), 61 deletions(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index af34979684a4..3c0ee824f512 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -501,8 +501,16 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) return -EAGAIN; } - trace_android_fs_dataread_start(inode, page_offset(page), PAGE_SIZE, - current->pid, current->comm); + if (trace_android_fs_dataread_start_enabled()) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_dataread_start(inode, page_offset(page), + PAGE_SIZE, current->pid, + path, current->comm); + } /* * Current inline data can only exist in the 1st page, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a7208a662ee6..07037262337a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -983,8 +983,16 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; - trace_android_fs_datawrite_start(inode, pos, len, - current->pid, current->comm); + if (trace_android_fs_datawrite_start_enabled()) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_datawrite_start(inode, pos, len, + current->pid, path, + current->comm); + } trace_ext4_write_begin(inode, pos, len, flags); /* * Reserve one block more for addition to orphan list in case @@ -2675,8 +2683,16 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, len, flags, pagep, fsdata); } *fsdata = (void *)0; - trace_android_fs_datawrite_start(inode, pos, len, - current->pid, current->comm); + if (trace_android_fs_datawrite_start_enabled()) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_datawrite_start(inode, pos, len, + current->pid, + path, current->comm); + } trace_ext4_da_write_begin(inode, pos, len, flags); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { @@ -3285,16 +3301,27 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return 0; if (trace_android_fs_dataread_start_enabled() && - (iov_iter_rw(iter) == READ)) - trace_android_fs_dataread_start(inode, offset, count, - current->pid, - current->comm); - if (trace_android_fs_datawrite_start_enabled() && - (iov_iter_rw(iter) == WRITE)) - trace_android_fs_datawrite_start(inode, offset, count, - current->pid, - current->comm); + (iov_iter_rw(iter) == READ)) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_dataread_start(inode, offset, count, + current->pid, path, + current->comm); + } + if (trace_android_fs_datawrite_start_enabled() && + (iov_iter_rw(iter) == WRITE)) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_datawrite_start(inode, offset, count, + current->pid, path, + current->comm); + } 
trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_direct_IO(iocb, iter, offset); diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 1ce24a6759a0..1c5db9fd9c8f 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -152,11 +152,17 @@ ext4_submit_bio_read(struct bio *bio) struct page *first_page = bio->bi_io_vec[0].bv_page; if (first_page != NULL) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + first_page->mapping->host); trace_android_fs_dataread_start( first_page->mapping->host, page_offset(first_page), bio->bi_iter.bi_size, current->pid, + path, current->comm); } } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e692958d6e78..8936044dee4c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1402,8 +1402,16 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct dnode_of_data dn; int err = 0; - trace_android_fs_datawrite_start(inode, pos, len, - current->pid, current->comm); + if (trace_android_fs_datawrite_start_enabled()) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_datawrite_start(inode, pos, len, + current->pid, path, + current->comm); + } trace_f2fs_write_begin(inode, pos, len, flags); f2fs_balance_fs(sbi); @@ -1587,15 +1595,27 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (trace_android_fs_dataread_start_enabled() && - (iov_iter_rw(iter) == READ)) - trace_android_fs_dataread_start(inode, offset, - count, current->pid, - current->comm); - if (trace_android_fs_datawrite_start_enabled() && - (iov_iter_rw(iter) == WRITE)) - trace_android_fs_datawrite_start(inode, offset, count, - current->pid, current->comm); + (iov_iter_rw(iter) == READ)) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_dataread_start(inode, offset, + count, current->pid, path, + current->comm); + } + if (trace_android_fs_datawrite_start_enabled() && + (iov_iter_rw(iter) == WRITE)) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_datawrite_start(inode, offset, count, + current->pid, path, + current->comm); + } if (iov_iter_rw(iter) == WRITE) { __allocate_data_blocks(inode, offset, count); if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index d2c5d69ba0b1..dbb2cc4df603 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -85,9 +85,16 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) { struct page *ipage; - trace_android_fs_dataread_start(inode, page_offset(page), - PAGE_SIZE, current->pid, - current->comm); + if (trace_android_fs_dataread_start_enabled()) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_dataread_start(inode, page_offset(page), + PAGE_SIZE, current->pid, + path, current->comm); + } ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) { diff --git a/fs/mpage.c b/fs/mpage.c index 5c65d8942692..0fd48fdcc1b1 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -79,11 +79,17 @@ static struct bio *mpage_bio_submit(int 
rw, struct bio *bio) struct page *first_page = bio->bi_io_vec[0].bv_page; if (first_page != NULL) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + first_page->mapping->host); trace_android_fs_dataread_start( first_page->mapping->host, page_offset(first_page), bio->bi_iter.bi_size, current->pid, + path, current->comm); } } diff --git a/include/trace/events/android_fs.h b/include/trace/events/android_fs.h index 531da433a7bc..49509533d3fa 100644 --- a/include/trace/events/android_fs.h +++ b/include/trace/events/android_fs.h @@ -9,8 +9,8 @@ DEFINE_EVENT(android_fs_data_start_template, android_fs_dataread_start, TP_PROTO(struct inode *inode, loff_t offset, int bytes, - pid_t pid, char *command), - TP_ARGS(inode, offset, bytes, pid, command)); + pid_t pid, char *pathname, char *command), + TP_ARGS(inode, offset, bytes, pid, pathname, command)); DEFINE_EVENT(android_fs_data_end_template, android_fs_dataread_end, TP_PROTO(struct inode *inode, loff_t offset, int bytes), @@ -18,14 +18,48 @@ DEFINE_EVENT(android_fs_data_end_template, android_fs_dataread_end, DEFINE_EVENT(android_fs_data_start_template, android_fs_datawrite_start, TP_PROTO(struct inode *inode, loff_t offset, int bytes, - pid_t pid, char *command), - TP_ARGS(inode, offset, bytes, pid, command)); + pid_t pid, char *pathname, char *command), + TP_ARGS(inode, offset, bytes, pid, pathname, command)); DEFINE_EVENT(android_fs_data_end_template, android_fs_datawrite_end, TP_PROTO(struct inode *inode, loff_t offset, int bytes), - TP_ARGS(inode, offset, bytes)); + TP_ARGS(inode, offset, bytes)); #endif /* _TRACE_ANDROID_FS_H */ /* This part must be outside protection */ #include + +#ifndef ANDROID_FSTRACE_GET_PATHNAME +#define ANDROID_FSTRACE_GET_PATHNAME + +/* Sizes an on-stack array, so careful if sizing this up ! */ +#define MAX_TRACE_PATHBUF_LEN 256 + +static inline char * +android_fstrace_get_pathname(char *buf, int buflen, struct inode *inode) +{ + char *path; + struct dentry *d; + + /* + * d_obtain_alias() will either iput() if it locates an existing + * dentry or transfer the reference to the new dentry created. + * So get an extra reference here. + */ + ihold(inode); + d = d_obtain_alias(inode); + if (likely(!IS_ERR(d))) { + path = dentry_path_raw(d, buf, buflen); + if (unlikely(IS_ERR(path))) { + strcpy(buf, "ERROR"); + path = buf; + } + dput(d); + } else { + strcpy(buf, "ERROR"); + path = buf; + } + return path; +} +#endif diff --git a/include/trace/events/android_fs_template.h b/include/trace/events/android_fs_template.h index 618988b047c1..4e61ffe7a814 100644 --- a/include/trace/events/android_fs_template.h +++ b/include/trace/events/android_fs_template.h @@ -5,11 +5,10 @@ DECLARE_EVENT_CLASS(android_fs_data_start_template, TP_PROTO(struct inode *inode, loff_t offset, int bytes, - pid_t pid, char *command), - TP_ARGS(inode, offset, bytes, pid, command), + pid_t pid, char *pathname, char *command), + TP_ARGS(inode, offset, bytes, pid, pathname, command), TP_STRUCT__entry( - __array(char, path, MAX_FILTER_STR_VAL); - __field(char *, pathname); + __string(pathbuf, pathname); __field(loff_t, offset); __field(int, bytes); __field(loff_t, i_size); @@ -19,27 +18,7 @@ DECLARE_EVENT_CLASS(android_fs_data_start_template, ), TP_fast_assign( { - struct dentry *d; - - /* - * Grab a reference to the inode here because - * d_obtain_alias() will either drop the inode - * reference if it locates an existing dentry - * or transfer the reference to the new dentry - * created. 
In our case, the file is still open, - * so the dentry is guaranteed to exist (connected), - * so d_obtain_alias() drops the reference we - * grabbed here. - */ - ihold(inode); - d = d_obtain_alias(inode); - if (!IS_ERR(d)) { - __entry->pathname = dentry_path(d, - __entry->path, - MAX_FILTER_STR_VAL); - dput(d); - } else - __entry->pathname = ERR_PTR(-EINVAL); + __assign_str(pathbuf, pathname); __entry->offset = offset; __entry->bytes = bytes; __entry->i_size = i_size_read(inode); @@ -50,9 +29,8 @@ DECLARE_EVENT_CLASS(android_fs_data_start_template, ), TP_printk("entry_name %s, offset %llu, bytes %d, cmdline %s," " pid %d, i_size %llu, ino %lu", - (IS_ERR(__entry->pathname) ? "ERROR" : __entry->pathname), - __entry->offset, __entry->bytes, __get_str(cmdline), - __entry->pid, __entry->i_size, + __get_str(pathbuf), __entry->offset, __entry->bytes, + __get_str(cmdline), __entry->pid, __entry->i_size, (unsigned long) __entry->ino) ); From 15227d3ccce3bcac1bd797a7428f1e3305bd9d6c Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Fri, 12 Feb 2016 09:47:52 -0600 Subject: [PATCH 0657/3220] UPSTREAM: arm/arm64: crypto: assure that ECB modes don't require an IV ECB modes don't use an initialization vector. The kernel /proc/crypto interface doesn't reflect this properly. Acked-by: Ard Biesheuvel Signed-off-by: Jeremy Linton Signed-off-by: Will Deacon (cherry picked from bee038a4bd2efe8188cc80dfdad706a9abe568ad) Signed-off-by: Eric Biggers Change-Id: Ief9558d2b41be58a2d845d2033a141b5ef7b585f --- arch/arm/crypto/aes-ce-glue.c | 4 ++-- arch/arm64/crypto/aes-glue.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c index b445a5d56f43..89a3a3e592d6 100644 --- a/arch/arm/crypto/aes-ce-glue.c +++ b/arch/arm/crypto/aes-ce-glue.c @@ -364,7 +364,7 @@ static struct crypto_alg aes_algs[] = { { .cra_blkcipher = { .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, + .ivsize = 0, .setkey = ce_aes_setkey, .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, @@ -441,7 +441,7 @@ static struct crypto_alg aes_algs[] = { { .cra_ablkcipher = { .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, + .ivsize = 0, .setkey = ablk_set_key, .encrypt = ablk_encrypt, .decrypt = ablk_decrypt, diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 05d9e16c0dfd..7a3d22a46faf 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -294,7 +294,7 @@ static struct crypto_alg aes_algs[] = { { .cra_blkcipher = { .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, + .ivsize = 0, .setkey = aes_setkey, .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, @@ -371,7 +371,7 @@ static struct crypto_alg aes_algs[] = { { .cra_ablkcipher = { .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, + .ivsize = 0, .setkey = ablk_set_key, .encrypt = ablk_encrypt, .decrypt = ablk_decrypt, From 93867d9bc5c10fca7d6e75b57a8a4b8171a8dfe8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 10 Jan 2017 16:47:49 -0800 Subject: [PATCH 0658/3220] ANDROID: crypto: allow blkcipher walks over ablkcipher data Add a function blkcipher_ablkcipher_walk_virt() which allows ablkcipher algorithms to use the blkcipher_walk API to walk over their data. 
This will be used by the HEH algorithm, which, in order to support
asynchronous ECB algorithms, will be an ablkcipher, but which also needs
to make other passes over the data.

Bug: 32975945
Signed-off-by: Eric Biggers
Change-Id: I05f9a0e5473ba6115fcc72d5122d6b0b18b2078b
---
 crypto/blkcipher.c | 21 +++++++++++++++++++++
 include/crypto/algapi.h | 3 +++
 2 files changed, 24 insertions(+)

diff --git a/crypto/blkcipher.c b/crypto/blkcipher.c
index 8cc1622b2ee0..dae7194bceb9 100644
--- a/crypto/blkcipher.c
+++ b/crypto/blkcipher.c
@@ -372,6 +372,27 @@ int blkcipher_aead_walk_virt_block(struct blkcipher_desc *desc,
 }
 EXPORT_SYMBOL_GPL(blkcipher_aead_walk_virt_block);
 
+/*
+ * This function allows ablkcipher algorithms to use the blkcipher_walk API to
+ * walk over their data.  The specified crypto_ablkcipher tfm is used to
+ * initialize the struct blkcipher_walk, and the crypto_blkcipher specified in
+ * desc->tfm is never used, so it can be left NULL.  (Yes, this design is ugly,
+ * but it parallels blkcipher_aead_walk_virt_block() above.  In the 4.10 kernel
+ * this is starting to be cleaned up...)
+ */
+int blkcipher_ablkcipher_walk_virt(struct blkcipher_desc *desc,
+				   struct blkcipher_walk *walk,
+				   struct crypto_ablkcipher *tfm)
+{
+	walk->flags &= ~BLKCIPHER_WALK_PHYS;
+	walk->walk_blocksize = crypto_ablkcipher_blocksize(tfm);
+	walk->cipher_blocksize = walk->walk_blocksize;
+	walk->ivsize = crypto_ablkcipher_ivsize(tfm);
+	walk->alignmask = crypto_ablkcipher_alignmask(tfm);
+	return blkcipher_walk_first(desc, walk);
+}
+EXPORT_SYMBOL_GPL(blkcipher_ablkcipher_walk_virt);
+
 static int setkey_unaligned(struct crypto_tfm *tfm, const u8 *key,
 			    unsigned int keylen)
 {

diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index c9fe145f7dd3..04661e1fb625 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -202,6 +202,9 @@ int blkcipher_aead_walk_virt_block(struct blkcipher_desc *desc,
 				   struct blkcipher_walk *walk,
 				   struct crypto_aead *tfm,
 				   unsigned int blocksize);
+int blkcipher_ablkcipher_walk_virt(struct blkcipher_desc *desc,
+				   struct blkcipher_walk *walk,
+				   struct crypto_ablkcipher *tfm);
 int ablkcipher_walk_done(struct ablkcipher_request *req,
 			 struct ablkcipher_walk *walk, int err);

From c8bb10b1eec8156fe3efd1543b6431c32434ba28 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Tue, 10 Jan 2017 16:47:49 -0800
Subject: [PATCH 0659/3220] ANDROID: crypto: shash - Add crypto_grab_shash()
 and crypto_spawn_shash_alg()

Analogous to crypto_grab_skcipher() and crypto_spawn_skcipher_alg(),
these are useful for algorithms that need to use a shash sub-algorithm,
possibly in addition to other sub-algorithms.
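For illustration, a template's ->create() hook might use the new helpers
roughly as follows (sketch only; 'ictx' and the choice of "cmac(aes)" are
hypothetical, and instance allocation/cleanup is omitted):

	struct crypto_shash_spawn *spawn = &ictx->cmac;
	int err;

	err = crypto_grab_shash(spawn, "cmac(aes)", 0, 0);
	if (err)
		return err;

	/* The sub-algorithm can now be inspected before use, e.g.: */
	if (crypto_spawn_shash_alg(spawn)->digestsize != 16)
		return -EINVAL;

This mirrors the existing crypto_grab_skcipher() /
crypto_spawn_skcipher_alg() usage pattern for skcipher sub-algorithms.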
Bug: 32975945 Signed-off-by: Eric Biggers Change-Id: I44e5a519d73f5f839e3b6ecbf8c66e36ec569557 --- crypto/shash.c | 8 ++++++++ include/crypto/internal/hash.h | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/crypto/shash.c b/crypto/shash.c index ecb1e3d39bf0..c10d16373b87 100644 --- a/crypto/shash.c +++ b/crypto/shash.c @@ -682,6 +682,14 @@ void shash_free_instance(struct crypto_instance *inst) } EXPORT_SYMBOL_GPL(shash_free_instance); +int crypto_grab_shash(struct crypto_shash_spawn *spawn, + const char *name, u32 type, u32 mask) +{ + spawn->base.frontend = &crypto_shash_type; + return crypto_grab_spawn(&spawn->base, name, type, mask); +} +EXPORT_SYMBOL_GPL(crypto_grab_shash); + int crypto_init_shash_spawn(struct crypto_shash_spawn *spawn, struct shash_alg *alg, struct crypto_instance *inst) diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h index 3b4af1d7c7e9..476d99d0edb7 100644 --- a/include/crypto/internal/hash.h +++ b/include/crypto/internal/hash.h @@ -102,6 +102,8 @@ int shash_register_instance(struct crypto_template *tmpl, struct shash_instance *inst); void shash_free_instance(struct crypto_instance *inst); +int crypto_grab_shash(struct crypto_shash_spawn *spawn, + const char *name, u32 type, u32 mask); int crypto_init_shash_spawn(struct crypto_shash_spawn *spawn, struct shash_alg *alg, struct crypto_instance *inst); @@ -111,6 +113,12 @@ static inline void crypto_drop_shash(struct crypto_shash_spawn *spawn) crypto_drop_spawn(&spawn->base); } +static inline struct shash_alg *crypto_spawn_shash_alg( + struct crypto_shash_spawn *spawn) +{ + return container_of(spawn->base.alg, struct shash_alg, base); +} + struct shash_alg *shash_attr_alg(struct rtattr *rta, u32 type, u32 mask); int shash_ahash_update(struct ahash_request *req, struct shash_desc *desc); From 8ea7531e4764805981be72a2b1eed3617af24a38 Mon Sep 17 00:00:00 2001 From: Alex Cope Date: Mon, 14 Nov 2016 11:02:54 -0800 Subject: [PATCH 0660/3220] UPSTREAM: crypto: gf128mul - Zero memory when freeing multiplication table GF(2^128) multiplication tables are typically used for secret information, so it's a good idea to zero them on free. Signed-off-by: Alex Cope Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu (cherry-picked from 75aa0a7cafe951538c7cb7c5ed457a3371ec5bcd) Bug: 32975945 Signed-off-by: Eric Biggers Change-Id: I37b1ae9544158007f9ee2caf070120f4a42153ab --- crypto/gf128mul.c | 4 ++-- include/crypto/gf128mul.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c index 5276607c72d0..0594dd6f82f2 100644 --- a/crypto/gf128mul.c +++ b/crypto/gf128mul.c @@ -352,8 +352,8 @@ void gf128mul_free_64k(struct gf128mul_64k *t) int i; for (i = 0; i < 16; i++) - kfree(t->t[i]); - kfree(t); + kzfree(t->t[i]); + kzfree(t); } EXPORT_SYMBOL(gf128mul_free_64k); diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h index da2530e34b26..7217fe6dbe33 100644 --- a/include/crypto/gf128mul.h +++ b/include/crypto/gf128mul.h @@ -177,7 +177,7 @@ void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t); static inline void gf128mul_free_4k(struct gf128mul_4k *t) { - kfree(t); + kzfree(t); } From 3eaf06b785a603e0258af10ef1f98d8a511874a8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 10 Jan 2017 17:04:54 -0800 Subject: [PATCH 0661/3220] ANDROID: crypto: gf128mul - Refactor gf128 overflow macros and tables Rename and clean up the GF(2^128) overflow macros and tables. Their usage is more general than the name suggested, e.g. 
what was previously known as the "bbe" table can actually be used for both "bbe" and "ble" multiplication. Bug: 32975945 Signed-off-by: Eric Biggers Change-Id: Ie6c47b4075ca40031eb1767e9b468cfd7bf1b2e4 --- crypto/gf128mul.c | 68 ++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c index 0594dd6f82f2..8b65b1eb5dda 100644 --- a/crypto/gf128mul.c +++ b/crypto/gf128mul.c @@ -88,33 +88,47 @@ q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \ } -/* Given the value i in 0..255 as the byte overflow when a field element - in GHASH is multiplied by x^8, this function will return the values that - are generated in the lo 16-bit word of the field value by applying the - modular polynomial. The values lo_byte and hi_byte are returned via the - macro xp_fun(lo_byte, hi_byte) so that the values can be assembled into - memory as required by a suitable definition of this macro operating on - the table above -*/ +/* + * Given a value i in 0..255 as the byte overflow when a field element + * in GF(2^128) is multiplied by x^8, the following macro returns the + * 16-bit value that must be XOR-ed into the low-degree end of the + * product to reduce it modulo the irreducible polynomial x^128 + x^7 + + * x^2 + x + 1. + * + * There are two versions of the macro, and hence two tables: one for + * the "be" convention where the highest-order bit is the coefficient of + * the highest-degree polynomial term, and one for the "le" convention + * where the highest-order bit is the coefficient of the lowest-degree + * polynomial term. In both cases the values are stored in CPU byte + * endianness such that the coefficients are ordered consistently across + * bytes, i.e. in the "be" table bits 15..0 of the stored value + * correspond to the coefficients of x^15..x^0, and in the "le" table + * bits 15..0 correspond to the coefficients of x^0..x^15. + * + * Therefore, provided that the appropriate byte endianness conversions + * are done by the multiplication functions (and these must be in place + * anyway to support both little endian and big endian CPUs), the "be" + * table can be used for multiplications of both "bbe" and "ble" + * elements, and the "le" table can be used for multiplications of both + * "lle" and "lbe" elements. + */ -#define xx(p, q) 0x##p##q - -#define xda_bbe(i) ( \ - (i & 0x80 ? xx(43, 80) : 0) ^ (i & 0x40 ? xx(21, c0) : 0) ^ \ - (i & 0x20 ? xx(10, e0) : 0) ^ (i & 0x10 ? xx(08, 70) : 0) ^ \ - (i & 0x08 ? xx(04, 38) : 0) ^ (i & 0x04 ? xx(02, 1c) : 0) ^ \ - (i & 0x02 ? xx(01, 0e) : 0) ^ (i & 0x01 ? xx(00, 87) : 0) \ +#define xda_be(i) ( \ + (i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \ + (i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \ + (i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \ + (i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \ ) -#define xda_lle(i) ( \ - (i & 0x80 ? xx(e1, 00) : 0) ^ (i & 0x40 ? xx(70, 80) : 0) ^ \ - (i & 0x20 ? xx(38, 40) : 0) ^ (i & 0x10 ? xx(1c, 20) : 0) ^ \ - (i & 0x08 ? xx(0e, 10) : 0) ^ (i & 0x04 ? xx(07, 08) : 0) ^ \ - (i & 0x02 ? xx(03, 84) : 0) ^ (i & 0x01 ? xx(01, c2) : 0) \ +#define xda_le(i) ( \ + (i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \ + (i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \ + (i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \ + (i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 
0x01c2 : 0) \ ) -static const u16 gf128mul_table_lle[256] = gf128mul_dat(xda_lle); -static const u16 gf128mul_table_bbe[256] = gf128mul_dat(xda_bbe); +static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le); +static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be); /* These functions multiply a field element by x, by x^4 and by x^8 * in the polynomial field representation. It uses 32-bit word operations @@ -126,7 +140,7 @@ static void gf128mul_x_lle(be128 *r, const be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); - u64 _tt = gf128mul_table_lle[(b << 7) & 0xff]; + u64 _tt = gf128mul_table_le[(b << 7) & 0xff]; r->b = cpu_to_be64((b >> 1) | (a << 63)); r->a = cpu_to_be64((a >> 1) ^ (_tt << 48)); @@ -136,7 +150,7 @@ static void gf128mul_x_bbe(be128 *r, const be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); - u64 _tt = gf128mul_table_bbe[a >> 63]; + u64 _tt = gf128mul_table_be[a >> 63]; r->a = cpu_to_be64((a << 1) | (b >> 63)); r->b = cpu_to_be64((b << 1) ^ _tt); @@ -146,7 +160,7 @@ void gf128mul_x_ble(be128 *r, const be128 *x) { u64 a = le64_to_cpu(x->a); u64 b = le64_to_cpu(x->b); - u64 _tt = gf128mul_table_bbe[b >> 63]; + u64 _tt = gf128mul_table_be[b >> 63]; r->a = cpu_to_le64((a << 1) ^ _tt); r->b = cpu_to_le64((b << 1) | (a >> 63)); @@ -157,7 +171,7 @@ static void gf128mul_x8_lle(be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); - u64 _tt = gf128mul_table_lle[b & 0xff]; + u64 _tt = gf128mul_table_le[b & 0xff]; x->b = cpu_to_be64((b >> 8) | (a << 56)); x->a = cpu_to_be64((a >> 8) ^ (_tt << 48)); @@ -167,7 +181,7 @@ static void gf128mul_x8_bbe(be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); - u64 _tt = gf128mul_table_bbe[a >> 56]; + u64 _tt = gf128mul_table_be[a >> 56]; x->a = cpu_to_be64((a << 8) | (b >> 56)); x->b = cpu_to_be64((b << 8) ^ _tt); From ce2ace45d931f46e79e8e3c2e857c083e67be554 Mon Sep 17 00:00:00 2001 From: Alex Cope Date: Tue, 10 Jan 2017 16:47:49 -0800 Subject: [PATCH 0662/3220] ANDROID: crypto: gf128mul - Add ble multiplication functions Adding ble multiplication to GF128mul, and fixing up comments. The ble multiplication functions multiply GF(2^128) elements in the ble format. This format is preferable because the bits within each byte map to polynomial coefficients in the natural order (lowest order bit = coefficient of lowest degree polynomial term), and the bytes are stored in little endian order which matches the endianness of most modern CPUs. These new functions will be used by the HEH algorithm. Signed-off-by: Alex Cope Bug: 32975945 Signed-off-by: Eric Biggers Change-Id: I39a58e8ee83e6f9b2e6bd51738f816dbfa2f3a47 --- crypto/gf128mul.c | 99 ++++++++++++++++++++++++++++++++++++--- include/crypto/gf128mul.h | 45 +++++++++--------- 2 files changed, 117 insertions(+), 27 deletions(-) diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c index 8b65b1eb5dda..f3d9f6da0767 100644 --- a/crypto/gf128mul.c +++ b/crypto/gf128mul.c @@ -44,7 +44,7 @@ --------------------------------------------------------------------------- Issue 31/01/2006 - This file provides fast multiplication in GF(128) as required by several + This file provides fast multiplication in GF(2^128) as required by several cryptographic authentication modes */ @@ -130,9 +130,10 @@ static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le); static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be); -/* These functions multiply a field element by x, by x^4 and by x^8 - * in the polynomial field representation. 
It uses 32-bit word operations - * to gain speed but compensates for machine endianess and hence works +/* + * The following functions multiply a field element by x or by x^8 in + * the polynomial field representation. They use 64-bit word operations + * to gain speed but compensate for machine endianness and hence work * correctly on both styles of machine. */ @@ -187,6 +188,16 @@ static void gf128mul_x8_bbe(be128 *x) x->b = cpu_to_be64((b << 8) ^ _tt); } +static void gf128mul_x8_ble(be128 *x) +{ + u64 a = le64_to_cpu(x->b); + u64 b = le64_to_cpu(x->a); + u64 _tt = gf128mul_table_be[a >> 56]; + + x->b = cpu_to_le64((a << 8) | (b >> 56)); + x->a = cpu_to_le64((b << 8) ^ _tt); +} + void gf128mul_lle(be128 *r, const be128 *b) { be128 p[8]; @@ -263,9 +274,48 @@ void gf128mul_bbe(be128 *r, const be128 *b) } EXPORT_SYMBOL(gf128mul_bbe); +void gf128mul_ble(be128 *r, const be128 *b) +{ + be128 p[8]; + int i; + + p[0] = *r; + for (i = 0; i < 7; ++i) + gf128mul_x_ble((be128 *)&p[i + 1], (be128 *)&p[i]); + + memset(r, 0, sizeof(*r)); + for (i = 0;;) { + u8 ch = ((u8 *)b)[15 - i]; + + if (ch & 0x80) + be128_xor(r, r, &p[7]); + if (ch & 0x40) + be128_xor(r, r, &p[6]); + if (ch & 0x20) + be128_xor(r, r, &p[5]); + if (ch & 0x10) + be128_xor(r, r, &p[4]); + if (ch & 0x08) + be128_xor(r, r, &p[3]); + if (ch & 0x04) + be128_xor(r, r, &p[2]); + if (ch & 0x02) + be128_xor(r, r, &p[1]); + if (ch & 0x01) + be128_xor(r, r, &p[0]); + + if (++i >= 16) + break; + + gf128mul_x8_ble(r); + } +} +EXPORT_SYMBOL(gf128mul_ble); + + /* This version uses 64k bytes of table space. A 16 byte buffer has to be multiplied by a 16 byte key - value in GF(128). If we consider a GF(128) value in + value in GF(2^128). If we consider a GF(2^128) value in the buffer's lowest byte, we can construct a table of the 256 16 byte values that result from the 256 values of this byte. This requires 4096 bytes. But we also @@ -399,7 +449,7 @@ EXPORT_SYMBOL(gf128mul_64k_bbe); /* This version uses 4k bytes of table space. A 16 byte buffer has to be multiplied by a 16 byte key - value in GF(128). If we consider a GF(128) value in a + value in GF(2^128). If we consider a GF(2^128) value in a single byte, we can construct a table of the 256 16 byte values that result from the 256 values of this byte. This requires 4096 bytes. 
If we take the highest byte in @@ -457,6 +507,28 @@ out: } EXPORT_SYMBOL(gf128mul_init_4k_bbe); +struct gf128mul_4k *gf128mul_init_4k_ble(const be128 *g) +{ + struct gf128mul_4k *t; + int j, k; + + t = kzalloc(sizeof(*t), GFP_KERNEL); + if (!t) + goto out; + + t->t[1] = *g; + for (j = 1; j <= 64; j <<= 1) + gf128mul_x_ble(&t->t[j + j], &t->t[j]); + + for (j = 2; j < 256; j += j) + for (k = 1; k < j; ++k) + be128_xor(&t->t[j + k], &t->t[j], &t->t[k]); + +out: + return t; +} +EXPORT_SYMBOL(gf128mul_init_4k_ble); + void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t) { u8 *ap = (u8 *)a; @@ -487,5 +559,20 @@ void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t) } EXPORT_SYMBOL(gf128mul_4k_bbe); +void gf128mul_4k_ble(be128 *a, struct gf128mul_4k *t) +{ + u8 *ap = (u8 *)a; + be128 r[1]; + int i = 15; + + *r = t->t[ap[15]]; + while (i--) { + gf128mul_x8_ble(r); + be128_xor(r, r, &t->t[ap[i]]); + } + *a = *r; +} +EXPORT_SYMBOL(gf128mul_4k_ble); + MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)"); diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h index 7217fe6dbe33..230760aef93b 100644 --- a/include/crypto/gf128mul.h +++ b/include/crypto/gf128mul.h @@ -43,7 +43,7 @@ --------------------------------------------------------------------------- Issue Date: 31/01/2006 - An implementation of field multiplication in Galois Field GF(128) + An implementation of field multiplication in Galois Field GF(2^128) */ #ifndef _CRYPTO_GF128MUL_H @@ -65,7 +65,7 @@ * are left and the lsb's are right. char b[16] is an array and b[0] is * the first octet. * - * 80000000 00000000 00000000 00000000 .... 00000000 00000000 00000000 + * 10000000 00000000 00000000 00000000 .... 00000000 00000000 00000000 * b[0] b[1] b[2] b[3] b[13] b[14] b[15] * * Every bit is a coefficient of some power of X. We can store the bits @@ -99,21 +99,21 @@ * * bbe on a little endian machine u32 x[4]: * - * MS x[0] LS MS x[1] LS + * MS x[0] LS MS x[1] LS * ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls * 103..96 111.104 119.112 127.120 71...64 79...72 87...80 95...88 * - * MS x[2] LS MS x[3] LS + * MS x[2] LS MS x[3] LS * ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls * 39...32 47...40 55...48 63...56 07...00 15...08 23...16 31...24 * * ble on a little endian machine * - * MS x[0] LS MS x[1] LS + * MS x[0] LS MS x[1] LS * ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls * 31...24 23...16 15...08 07...00 63...56 55...48 47...40 39...32 * - * MS x[2] LS MS x[3] LS + * MS x[2] LS MS x[3] LS * ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls * 95...88 87...80 79...72 71...64 127.120 199.112 111.104 103..96 * @@ -127,7 +127,7 @@ * machines this will automatically aligned to wordsize and on a 64-bit * machine also. */ -/* Multiply a GF128 field element by x. Field elements are held in arrays +/* Multiply a GF128 field element by x. Field elements are held in arrays of bytes in which field bits 8n..8n + 7 are held in byte[n], with lower indexed bits placed in the more numerically significant bit positions within bytes. 
@@ -135,45 +135,47 @@ On little endian machines the bit indexes translate into the bit positions within four 32-bit words in the following way - MS x[0] LS MS x[1] LS + MS x[0] LS MS x[1] LS ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 24...31 16...23 08...15 00...07 56...63 48...55 40...47 32...39 - MS x[2] LS MS x[3] LS + MS x[2] LS MS x[3] LS ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 88...95 80...87 72...79 64...71 120.127 112.119 104.111 96..103 On big endian machines the bit indexes translate into the bit positions within four 32-bit words in the following way - MS x[0] LS MS x[1] LS + MS x[0] LS MS x[1] LS ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 00...07 08...15 16...23 24...31 32...39 40...47 48...55 56...63 - MS x[2] LS MS x[3] LS + MS x[2] LS MS x[3] LS ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 64...71 72...79 80...87 88...95 96..103 104.111 112.119 120.127 */ -/* A slow generic version of gf_mul, implemented for lle and bbe - * It multiplies a and b and puts the result in a */ +/* A slow generic version of gf_mul, implemented for lle, bbe, and ble. + * It multiplies a and b and puts the result in a + */ void gf128mul_lle(be128 *a, const be128 *b); - void gf128mul_bbe(be128 *a, const be128 *b); +void gf128mul_ble(be128 *a, const be128 *b); -/* multiply by x in ble format, needed by XTS */ +/* multiply by x in ble format, needed by XTS and HEH */ void gf128mul_x_ble(be128 *a, const be128 *b); /* 4k table optimization */ - struct gf128mul_4k { be128 t[256]; }; struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g); struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g); +struct gf128mul_4k *gf128mul_init_4k_ble(const be128 *g); void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t); void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t); +void gf128mul_4k_ble(be128 *a, struct gf128mul_4k *t); static inline void gf128mul_free_4k(struct gf128mul_4k *t) { @@ -181,16 +183,17 @@ static inline void gf128mul_free_4k(struct gf128mul_4k *t) } -/* 64k table optimization, implemented for lle and bbe */ +/* 64k table optimization, implemented for lle, ble, and bbe */ struct gf128mul_64k { struct gf128mul_4k *t[16]; }; -/* first initialize with the constant factor with which you - * want to multiply and then call gf128_64k_lle with the other - * factor in the first argument, the table in the second and a - * scratch register in the third. Afterwards *a = *r. */ +/* First initialize with the constant factor with which you + * want to multiply and then call gf128mul_64k_bbe with the other + * factor in the first argument, and the table in the second. + * Afterwards, the result is stored in *a. + */ struct gf128mul_64k *gf128mul_init_64k_lle(const be128 *g); struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g); void gf128mul_free_64k(struct gf128mul_64k *t); From 698ffc03b70134f4f4af89bf64f3bcb96e358545 Mon Sep 17 00:00:00 2001 From: Alex Cope Date: Tue, 10 Jan 2017 16:47:49 -0800 Subject: [PATCH 0663/3220] ANDROID: crypto: heh - Add Hash-Encrypt-Hash (HEH) algorithm Hash-Encrypt-Hash (HEH) is a proposed block cipher mode of operation which extends the strong pseudo-random permutation property of block ciphers (e.g. AES) to arbitrary length input strings. This provides a stronger notion of security than existing block cipher modes of operation (e.g. CBC, CTR, XTS), though it is usually less performant. It uses two keyed invertible hash functions with a layer of ECB encryption applied in-between. 
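Schematically, HEH encryption of a message m is (informal notation; see
the draft for the precise definition):

	C = InvHash[tau, beta2]( ECB[K]( Hash[tau, beta1](m) ) )

where Hash/InvHash are the two keyed invertible hash layers (a polynomial
evaluation over GF(2^128) keyed by tau, masked with the per-message beta
keys) and ECB[K] is the underlying block cipher applied block by block.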
The algorithm is currently specified by the following Internet Draft: https://tools.ietf.org/html/draft-cope-heh-01 This patch adds HEH as a symmetric cipher only. Support for HEH as an AEAD is not yet implemented. HEH will use an existing accelerated ecb(block_cipher) implementation for the encrypt step if available. Accelerated versions of the hash step are planned but will be left for later patches. This patch backports HEH to the 4.4 Android kernel, initially for use by ext4 filenames encryption. Note that HEH is not yet upstream; however, patches have been made available on linux-crypto, and as noted there is also a draft specification available. This backport required updating the code to conform to the legacy ablkcipher API rather than the skcipher API, which wasn't complete in 4.4. Signed-off-by: Alex Cope Bug: 32975945 Signed-off-by: Eric Biggers Change-Id: I945bcc9c0115916824d701bae91b86e3f059a1a9 --- crypto/Kconfig | 17 + crypto/Makefile | 1 + crypto/heh.c | 899 +++++++++++++++++++++++++++++++++++++++++++++++ crypto/testmgr.c | 15 + crypto/testmgr.h | 194 ++++++++++ 5 files changed, 1126 insertions(+) create mode 100644 crypto/heh.c diff --git a/crypto/Kconfig b/crypto/Kconfig index 7240821137fd..627227f1162d 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -289,6 +289,23 @@ config CRYPTO_CBC CBC: Cipher Block Chaining mode This block cipher algorithm is required for IPSec. +config CRYPTO_HEH + tristate "HEH support" + select CRYPTO_CMAC + select CRYPTO_ECB + select CRYPTO_GF128MUL + select CRYPTO_MANAGER + help + HEH: Hash-Encrypt-Hash mode + HEH is a proposed block cipher mode of operation which extends the + strong pseudo-random permutation (SPRP) property of block ciphers to + arbitrary-length input strings. This provides a stronger notion of + security than existing block cipher modes of operation (e.g. CBC, CTR, + XTS), though it is usually less performant. Applications include disk + encryption and encryption of file names and contents. Currently, this + implementation only provides a symmetric cipher interface, so it can't + yet be used as an AEAD. + config CRYPTO_CTR tristate "CTR support" select CRYPTO_BLKCIPHER diff --git a/crypto/Makefile b/crypto/Makefile index f7aba923458d..3d36c5a6a5ea 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -65,6 +65,7 @@ obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o obj-$(CONFIG_CRYPTO_GF128MUL) += gf128mul.o obj-$(CONFIG_CRYPTO_ECB) += ecb.o obj-$(CONFIG_CRYPTO_CBC) += cbc.o +obj-$(CONFIG_CRYPTO_HEH) += heh.o obj-$(CONFIG_CRYPTO_PCBC) += pcbc.o obj-$(CONFIG_CRYPTO_CTS) += cts.o obj-$(CONFIG_CRYPTO_LRW) += lrw.o diff --git a/crypto/heh.c b/crypto/heh.c new file mode 100644 index 000000000000..48a284cecaa2 --- /dev/null +++ b/crypto/heh.c @@ -0,0 +1,899 @@ +/* + * HEH: Hash-Encrypt-Hash mode + * + * Copyright (c) 2016 Google Inc. + * + * Authors: + * Alex Cope + * Eric Biggers + */ + +/* + * Hash-Encrypt-Hash (HEH) is a proposed block cipher mode of operation which + * extends the strong pseudo-random permutation (SPRP) property of block ciphers + * (e.g. AES) to arbitrary length input strings. It uses two keyed invertible + * hash functions with a layer of ECB encryption applied in-between. The + * algorithm is specified by the following Internet Draft: + * + * https://tools.ietf.org/html/draft-cope-heh-01 + * + * Although HEH can be used as either a regular symmetric cipher or as an AEAD, + * currently this module only provides it as a symmetric cipher. Additionally, + * only 16-byte nonces are supported. 
+ */ + +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * The block size is the size of GF(2^128) elements and also the required block + * size of the underlying block cipher. + */ +#define HEH_BLOCK_SIZE 16 + +struct heh_instance_ctx { + struct crypto_shash_spawn cmac; + struct crypto_skcipher_spawn ecb; +}; + +struct heh_tfm_ctx { + struct crypto_shash *cmac; + struct crypto_ablkcipher *ecb; + struct gf128mul_4k *tau_key; +}; + +struct heh_cmac_data { + u8 nonce[HEH_BLOCK_SIZE]; + __le32 nonce_length; + __le32 aad_length; + __le32 message_length; + __le32 padding; +}; + +struct heh_req_ctx { /* aligned to alignmask */ + be128 beta1_key; + be128 beta2_key; + union { + struct { + struct heh_cmac_data data; + struct shash_desc desc; + /* + crypto_shash_descsize(cmac) */ + } cmac; + struct { + u8 keystream[HEH_BLOCK_SIZE]; + u8 tmp[HEH_BLOCK_SIZE]; + struct scatterlist tmp_sgl[2]; + struct ablkcipher_request req; + /* + crypto_ablkcipher_reqsize(ecb) */ + } ecb; + } u; +}; + +/* + * Get the offset in bytes to the last full block, or equivalently the length of + * all full blocks excluding the last + */ +static inline unsigned int get_tail_offset(unsigned int len) +{ + len -= len % HEH_BLOCK_SIZE; + return len - HEH_BLOCK_SIZE; +} + +static inline struct heh_req_ctx *heh_req_ctx(struct ablkcipher_request *req) +{ + unsigned int alignmask = crypto_ablkcipher_alignmask( + crypto_ablkcipher_reqtfm(req)); + + return (void *)PTR_ALIGN((u8 *)ablkcipher_request_ctx(req), + alignmask + 1); +} + +static inline void async_done(struct crypto_async_request *areq, int err, + int (*next_step)(struct ablkcipher_request *, + u32)) +{ + struct ablkcipher_request *req = areq->data; + + if (err) + goto out; + + err = next_step(req, req->base.flags & ~CRYPTO_TFM_REQ_MAY_SLEEP); + if (err == -EINPROGRESS || + (err == -EBUSY && (req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG))) + return; +out: + ablkcipher_request_complete(req, err); +} + +/* + * Generate the per-message "beta" keys used by the hashing layers of HEH. The + * first beta key is the CMAC of the nonce, the additional authenticated data + * (AAD), and the lengths in bytes of the nonce, AAD, and message. The nonce + * and AAD are each zero-padded to the next 16-byte block boundary, and the + * lengths are serialized as 4-byte little endian integers and zero-padded to + * the next 16-byte block boundary. + * The second beta key is the first one interpreted as an element in GF(2^128) + * and multiplied by x. + * + * Note that because the nonce and AAD may, in general, be variable-length, the + * key generation must be done by a pseudo-random function (PRF) on + * variable-length inputs. CBC-MAC does not satisfy this, as it is only a PRF + * on fixed-length inputs. CMAC remedies this flaw. Including the lengths of + * the nonce, AAD, and message is also critical to avoid collisions. + * + * That being said, this implementation does not yet operate as an AEAD and + * therefore there is never any AAD, nor are variable-length nonces supported. 
+ */ +static int generate_betas(struct ablkcipher_request *req, + be128 *beta1_key, be128 *beta2_key) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct heh_tfm_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct heh_req_ctx *rctx = heh_req_ctx(req); + struct heh_cmac_data *data = &rctx->u.cmac.data; + struct shash_desc *desc = &rctx->u.cmac.desc; + int err; + + BUILD_BUG_ON(sizeof(*data) != 2 * HEH_BLOCK_SIZE); + memcpy(data->nonce, req->info, HEH_BLOCK_SIZE); + data->nonce_length = cpu_to_le32(HEH_BLOCK_SIZE); + data->aad_length = cpu_to_le32(0); + data->message_length = cpu_to_le32(req->nbytes); + data->padding = cpu_to_le32(0); + + desc->tfm = ctx->cmac; + desc->flags = req->base.flags; + + err = crypto_shash_digest(desc, (const u8 *)data, sizeof(*data), + (u8 *)beta1_key); + if (err) + return err; + + gf128mul_x_ble(beta2_key, beta1_key); + return 0; +} + +/* + * Evaluation of a polynomial over GF(2^128) using Horner's rule. The + * polynomial is evaluated at 'point'. The polynomial's coefficients are taken + * from 'coeffs_sgl' and are for terms with consecutive descending degree ending + * at degree 1. 'bytes_of_coeffs' is 16 times the number of terms. + */ +static be128 evaluate_polynomial(struct gf128mul_4k *point, + struct scatterlist *coeffs_sgl, + unsigned int bytes_of_coeffs) +{ + be128 value = {0}; + struct sg_mapping_iter miter; + unsigned int remaining = bytes_of_coeffs; + unsigned int needed = 0; + + sg_miter_start(&miter, coeffs_sgl, sg_nents(coeffs_sgl), + SG_MITER_FROM_SG | SG_MITER_ATOMIC); + while (remaining) { + be128 coeff; + const u8 *src; + unsigned int srclen; + u8 *dst = (u8 *)&value; + + /* + * Note: scatterlist elements are not necessarily evenly + * divisible into blocks, nor are they necessarily aligned to + * __alignof__(be128). + */ + sg_miter_next(&miter); + + src = miter.addr; + srclen = min_t(unsigned int, miter.length, remaining); + remaining -= srclen; + + if (needed) { + unsigned int n = min(srclen, needed); + u8 *pos = dst + (HEH_BLOCK_SIZE - needed); + + needed -= n; + srclen -= n; + + while (n--) + *pos++ ^= *src++; + + if (!needed) + gf128mul_4k_ble(&value, point); + } + + while (srclen >= HEH_BLOCK_SIZE) { + memcpy(&coeff, src, HEH_BLOCK_SIZE); + be128_xor(&value, &value, &coeff); + gf128mul_4k_ble(&value, point); + src += HEH_BLOCK_SIZE; + srclen -= HEH_BLOCK_SIZE; + } + + if (srclen) { + needed = HEH_BLOCK_SIZE - srclen; + do { + *dst++ ^= *src++; + } while (--srclen); + } + } + sg_miter_stop(&miter); + return value; +} + +/* + * Split the message into 16 byte blocks, padding out the last block, and use + * the blocks as coefficients in the evaluation of a polynomial over GF(2^128) + * at the secret point 'tau_key'. For ease of implementing the higher-level + * heh_hash_inv() function, the constant and degree-1 coefficients are swapped + * if there is a partial block. + * + * Mathematically, compute: + * if (no partial block) + * k^{N-1} * m_0 + ... + k * m_{N-2} + m_{N-1} + * else if (partial block) + * k^N * m_0 + ... 
+ k^2 * m_{N-2} + k * m_N + m_{N-1}
+ *
+ * where:
+ *	k is tau_key
+ *	N is the number of full blocks in the message
+ *	m_i is the i-th full block in the message for i = 0 to N-1 inclusive
+ *	m_N is the partial block of the message zero-padded up to 16 bytes
+ */
+static be128 poly_hash(struct crypto_ablkcipher *tfm, struct scatterlist *sgl,
+		       unsigned int len)
+{
+	struct heh_tfm_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	unsigned int tail_offset = get_tail_offset(len);
+	unsigned int tail_len = len - tail_offset;
+	be128 hash;
+	be128 tail[2];
+
+	/* Handle all full blocks except the last */
+	hash = evaluate_polynomial(ctx->tau_key, sgl, tail_offset);
+
+	/* Handle the last full block and the partial block */
+	scatterwalk_map_and_copy(tail, sgl, tail_offset, tail_len, 0);
+
+	if (tail_len != HEH_BLOCK_SIZE) {
+		/* handle the partial block */
+		memset((u8 *)tail + tail_len, 0, sizeof(tail) - tail_len);
+		be128_xor(&hash, &hash, &tail[1]);
+		gf128mul_4k_ble(&hash, ctx->tau_key);
+	}
+	be128_xor(&hash, &hash, &tail[0]);
+	return hash;
+}
+
+/*
+ * Transform all full blocks except the last.
+ * This is used by both the hash and inverse hash phases.
+ */
+static int heh_tfm_blocks(struct ablkcipher_request *req,
+			  struct scatterlist *src_sgl,
+			  struct scatterlist *dst_sgl, unsigned int len,
+			  const be128 *hash, const be128 *beta_key)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct blkcipher_desc desc = { .flags = req->base.flags };
+	struct blkcipher_walk walk;
+	be128 e = *beta_key;
+	int err;
+	unsigned int nbytes;
+
+	blkcipher_walk_init(&walk, dst_sgl, src_sgl, len);
+
+	err = blkcipher_ablkcipher_walk_virt(&desc, &walk, tfm);
+
+	while ((nbytes = walk.nbytes)) {
+		const be128 *src = (be128 *)walk.src.virt.addr;
+		be128 *dst = (be128 *)walk.dst.virt.addr;
+
+		do {
+			gf128mul_x_ble(&e, &e);
+			be128_xor(dst, src, hash);
+			be128_xor(dst, dst, &e);
+			src++;
+			dst++;
+		} while ((nbytes -= HEH_BLOCK_SIZE) >= HEH_BLOCK_SIZE);
+		err = blkcipher_walk_done(&desc, &walk, nbytes);
+	}
+	return err;
+}
+
+/*
+ * The hash phase of HEH. Given a message, compute:
+ *
+ *	(m_0 + H, ..., m_{N-2} + H, H, m_N) + (xb, x^2b, ..., x^{N-1}b, b, 0)
+ *
+ * where:
+ *	N is the number of full blocks in the message
+ *	m_i is the i-th full block in the message for i = 0 to N-1 inclusive
+ *	m_N is the unpadded partial block, possibly empty
+ *	H is the poly_hash() of the message, keyed by tau_key
+ *	b is beta_key
+ *	x is the element x in our representation of GF(2^128)
+ *
+ * Note that the partial block remains unchanged, but it does affect the result
+ * of poly_hash() and therefore the transformation of all the full blocks.
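+ *
+ * As a worked instance of the formula above: with N = 2 full blocks and a
+ * trailing partial block m_2, the output is
+ *
+ *	(m_0 + H + xb, H + b, m_2)
+ *
+ * where + is XOR in GF(2^128).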
+ */
+static int heh_hash(struct ablkcipher_request *req, const be128 *beta_key)
+{
+	be128 hash;
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	unsigned int tail_offset = get_tail_offset(req->nbytes);
+	unsigned int partial_len = req->nbytes % HEH_BLOCK_SIZE;
+	int err;
+
+	/* poly_hash() the full message including the partial block */
+	hash = poly_hash(tfm, req->src, req->nbytes);
+
+	/* Transform all full blocks except the last */
+	err = heh_tfm_blocks(req, req->src, req->dst, tail_offset, &hash,
+			     beta_key);
+	if (err)
+		return err;
+
+	/* Set the last full block to hash XOR beta_key */
+	be128_xor(&hash, &hash, beta_key);
+	scatterwalk_map_and_copy(&hash, req->dst, tail_offset, HEH_BLOCK_SIZE,
+				 1);
+
+	/* Copy the partial block if needed */
+	if (partial_len != 0 && req->src != req->dst) {
+		unsigned int offs = tail_offset + HEH_BLOCK_SIZE;
+
+		scatterwalk_map_and_copy(&hash, req->src, offs, partial_len, 0);
+		scatterwalk_map_and_copy(&hash, req->dst, offs, partial_len, 1);
+	}
+	return 0;
+}
+
+/*
+ * The inverse hash phase of HEH. This undoes the result of heh_hash().
+ */
+static int heh_hash_inv(struct ablkcipher_request *req, const be128 *beta_key)
+{
+	be128 hash;
+	be128 tmp;
+	struct scatterlist tmp_sgl[2];
+	struct scatterlist *tail_sgl;
+	unsigned int len = req->nbytes;
+	unsigned int tail_offset = get_tail_offset(len);
+	struct scatterlist *sgl = req->dst;
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	int err;
+
+	/*
+	 * The last full block was computed as hash XOR beta_key, so XOR it with
+	 * beta_key to recover hash.
+	 */
+	tail_sgl = scatterwalk_ffwd(tmp_sgl, sgl, tail_offset);
+	scatterwalk_map_and_copy(&hash, tail_sgl, 0, HEH_BLOCK_SIZE, 0);
+	be128_xor(&hash, &hash, beta_key);
+
+	/* Transform all full blocks except the last */
+	err = heh_tfm_blocks(req, sgl, sgl, tail_offset, &hash, beta_key);
+	if (err)
+		return err;
+
+	/*
+	 * Recover the last full block. We know 'hash', i.e. the poly_hash() of
+	 * the original message. The last full block was the constant term
+	 * of the polynomial. To recover the last full block, temporarily zero
+	 * it, compute the poly_hash(), and take the difference from 'hash'.
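+	 *
+	 * (In GF(2^128) terms: zeroing the constant term c turns 'hash' into
+	 * hash' = hash + c, and since addition is XOR, c = hash + hash'.)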
+	 */
+	memset(&tmp, 0, sizeof(tmp));
+	scatterwalk_map_and_copy(&tmp, tail_sgl, 0, HEH_BLOCK_SIZE, 1);
+	tmp = poly_hash(tfm, sgl, len);
+	be128_xor(&tmp, &tmp, &hash);
+	scatterwalk_map_and_copy(&tmp, tail_sgl, 0, HEH_BLOCK_SIZE, 1);
+	return 0;
+}
+
+static int heh_hash_inv_step(struct ablkcipher_request *req, u32 flags)
+{
+	struct heh_req_ctx *rctx = heh_req_ctx(req);
+
+	return heh_hash_inv(req, &rctx->beta2_key);
+}
+
+static int heh_ecb_step_3(struct ablkcipher_request *req, u32 flags)
+{
+	struct heh_req_ctx *rctx = heh_req_ctx(req);
+	u8 partial_block[HEH_BLOCK_SIZE] __aligned(__alignof__(u32));
+	unsigned int tail_offset = get_tail_offset(req->nbytes);
+	unsigned int partial_offset = tail_offset + HEH_BLOCK_SIZE;
+	unsigned int partial_len = req->nbytes - partial_offset;
+
+	/*
+	 * Extract the pad in req->dst at tail_offset, and XOR the partial block
+	 * with it to create the encrypted partial block
+	 */
+	scatterwalk_map_and_copy(rctx->u.ecb.keystream, req->dst, tail_offset,
+				 HEH_BLOCK_SIZE, 0);
+	scatterwalk_map_and_copy(partial_block, req->dst, partial_offset,
+				 partial_len, 0);
+	crypto_xor(partial_block, rctx->u.ecb.keystream, partial_len);
+
+	/*
+	 * Store the encrypted final block and partial block back in dst_sg
+	 */
+	scatterwalk_map_and_copy(&rctx->u.ecb.tmp, req->dst, tail_offset,
+				 HEH_BLOCK_SIZE, 1);
+	scatterwalk_map_and_copy(partial_block, req->dst, partial_offset,
+				 partial_len, 1);
+
+	return heh_hash_inv_step(req, flags);
+}
+
+static void heh_ecb_step_2_done(struct crypto_async_request *areq, int err)
+{
+	return async_done(areq, err, heh_ecb_step_3);
+}
+
+static int heh_ecb_step_2(struct ablkcipher_request *req, u32 flags)
+{
+	struct heh_req_ctx *rctx = heh_req_ctx(req);
+	unsigned int partial_len = req->nbytes % HEH_BLOCK_SIZE;
+	struct scatterlist *tmp_sgl;
+	int err;
+	unsigned int tail_offset = get_tail_offset(req->nbytes);
+
+	if (partial_len == 0)
+		return heh_hash_inv_step(req, flags);
+
+	/*
+	 * Extract the final full block, store it in tmp, and then XOR that with
+	 * the value saved in u.ecb.keystream
+	 */
+	scatterwalk_map_and_copy(rctx->u.ecb.tmp, req->dst, tail_offset,
+				 HEH_BLOCK_SIZE, 0);
+	crypto_xor(rctx->u.ecb.keystream, rctx->u.ecb.tmp, HEH_BLOCK_SIZE);
+
+	/*
+	 * Encrypt the value in rctx->u.ecb.keystream to create the pad for the
+	 * partial block.
+	 * We cannot encrypt stack buffers, so re-use the dst_sg to do this
+	 * encryption to avoid a malloc. The value at tail_offset is stored in
+	 * tmp, and will be restored later.
+	 */
+	scatterwalk_map_and_copy(rctx->u.ecb.keystream, req->dst, tail_offset,
+				 HEH_BLOCK_SIZE, 1);
+	tmp_sgl = scatterwalk_ffwd(rctx->u.ecb.tmp_sgl, req->dst, tail_offset);
+	ablkcipher_request_set_callback(&rctx->u.ecb.req, flags,
+					heh_ecb_step_2_done, req);
+	ablkcipher_request_set_crypt(&rctx->u.ecb.req, tmp_sgl, tmp_sgl,
+				     HEH_BLOCK_SIZE, NULL);
+	err = crypto_ablkcipher_encrypt(&rctx->u.ecb.req);
+	if (err)
+		return err;
+	return heh_ecb_step_3(req, flags);
+}
+
+static void heh_ecb_full_done(struct crypto_async_request *areq, int err)
+{
+	return async_done(areq, err, heh_ecb_step_2);
+}
+
+/*
+ * The encrypt phase of HEH. This uses ECB encryption, with special handling
+ * for the partial block at the end, if any. The source data is already in
+ * req->dst, so the encryption happens in-place.
+ *
+ * After the encrypt phase we continue on to the inverse hash phase. The
+ * function calls are chained to support asynchronous ECB algorithms.
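+ *
+ * Sketch of the partial-block handling performed by the steps below: let y
+ * be the last full block going into the ECB layer and y' the corresponding
+ * ECB output. The pad for the partial block is computed as E(y XOR y'),
+ * where E() is a single-block ECB encryption; because this expression is
+ * symmetric in y and y', encryption and decryption derive the same pad.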
+ */ +static int heh_ecb(struct ablkcipher_request *req, bool decrypt) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct heh_tfm_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct heh_req_ctx *rctx = heh_req_ctx(req); + struct ablkcipher_request *ecb_req = &rctx->u.ecb.req; + unsigned int tail_offset = get_tail_offset(req->nbytes); + unsigned int full_len = tail_offset + HEH_BLOCK_SIZE; + int err; + + /* + * Save the last full block before it is encrypted/decrypted. This will + * be used later to encrypt/decrypt the partial block + */ + scatterwalk_map_and_copy(rctx->u.ecb.keystream, req->dst, tail_offset, + HEH_BLOCK_SIZE, 0); + + /* Encrypt/decrypt all full blocks */ + ablkcipher_request_set_tfm(ecb_req, ctx->ecb); + ablkcipher_request_set_callback(ecb_req, req->base.flags, + heh_ecb_full_done, req); + ablkcipher_request_set_crypt(ecb_req, req->dst, req->dst, full_len, + NULL); + if (decrypt) + err = crypto_ablkcipher_decrypt(ecb_req); + else + err = crypto_ablkcipher_encrypt(ecb_req); + if (err) + return err; + + return heh_ecb_step_2(req, req->base.flags); +} + +static int heh_crypt(struct ablkcipher_request *req, bool decrypt) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct heh_tfm_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct heh_req_ctx *rctx = heh_req_ctx(req); + int err; + + /* Inputs must be at least one full block */ + if (req->nbytes < HEH_BLOCK_SIZE) + return -EINVAL; + + /* Key must have been set */ + if (!ctx->tau_key) + return -ENOKEY; + err = generate_betas(req, &rctx->beta1_key, &rctx->beta2_key); + if (err) + return err; + + if (decrypt) + swap(rctx->beta1_key, rctx->beta2_key); + + err = heh_hash(req, &rctx->beta1_key); + if (err) + return err; + + return heh_ecb(req, decrypt); +} + +static int heh_encrypt(struct ablkcipher_request *req) +{ + return heh_crypt(req, false); +} + +static int heh_decrypt(struct ablkcipher_request *req) +{ + return heh_crypt(req, true); +} + +static int heh_setkey(struct crypto_ablkcipher *parent, const u8 *key, + unsigned int keylen) +{ + struct heh_tfm_ctx *ctx = crypto_ablkcipher_ctx(parent); + struct crypto_shash *cmac = ctx->cmac; + struct crypto_ablkcipher *ecb = ctx->ecb; + SHASH_DESC_ON_STACK(desc, cmac); + u8 *derived_keys; + u8 digest[HEH_BLOCK_SIZE]; + unsigned int i; + int err; + + /* set prf_key = key */ + crypto_shash_clear_flags(cmac, CRYPTO_TFM_REQ_MASK); + crypto_shash_set_flags(cmac, crypto_ablkcipher_get_flags(parent) & + CRYPTO_TFM_REQ_MASK); + err = crypto_shash_setkey(cmac, key, keylen); + crypto_ablkcipher_set_flags(parent, crypto_shash_get_flags(cmac) & + CRYPTO_TFM_RES_MASK); + if (err) + return err; + + /* + * Generate tau_key and ecb_key as follows: + * tau_key = cmac(prf_key, 0x00...01) + * ecb_key = cmac(prf_key, 0x00...02) || cmac(prf_key, 0x00...03) || ... 
+ *	     truncated to keylen bytes
+ */
+	derived_keys = kzalloc(round_up(HEH_BLOCK_SIZE + keylen,
+					HEH_BLOCK_SIZE), GFP_KERNEL);
+	if (!derived_keys)
+		return -ENOMEM;
+	desc->tfm = cmac;
+	desc->flags = (crypto_shash_get_flags(cmac) & CRYPTO_TFM_REQ_MASK);
+	for (i = 0; i < keylen + HEH_BLOCK_SIZE; i += HEH_BLOCK_SIZE) {
+		derived_keys[i + HEH_BLOCK_SIZE - 1] =
+					0x01 + i / HEH_BLOCK_SIZE;
+		err = crypto_shash_digest(desc, derived_keys + i,
+					  HEH_BLOCK_SIZE, digest);
+		if (err)
+			goto out;
+		memcpy(derived_keys + i, digest, HEH_BLOCK_SIZE);
+	}
+
+	if (ctx->tau_key)
+		gf128mul_free_4k(ctx->tau_key);
+	err = -ENOMEM;
+	ctx->tau_key = gf128mul_init_4k_ble((const be128 *)derived_keys);
+	if (!ctx->tau_key)
+		goto out;
+
+	crypto_ablkcipher_clear_flags(ecb, CRYPTO_TFM_REQ_MASK);
+	crypto_ablkcipher_set_flags(ecb, crypto_ablkcipher_get_flags(parent) &
+				    CRYPTO_TFM_REQ_MASK);
+	err = crypto_ablkcipher_setkey(ecb, derived_keys + HEH_BLOCK_SIZE,
+				       keylen);
+	crypto_ablkcipher_set_flags(parent, crypto_ablkcipher_get_flags(ecb) &
+				    CRYPTO_TFM_RES_MASK);
+out:
+	kzfree(derived_keys);
+	return err;
+}
+
+static int heh_init_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_instance *inst = crypto_tfm_alg_instance(tfm);
+	struct heh_instance_ctx *ictx = crypto_instance_ctx(inst);
+	struct heh_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct crypto_shash *cmac;
+	struct crypto_ablkcipher *ecb;
+	unsigned int reqsize;
+	int err;
+
+	cmac = crypto_spawn_shash(&ictx->cmac);
+	if (IS_ERR(cmac))
+		return PTR_ERR(cmac);
+
+	ecb = crypto_spawn_skcipher(&ictx->ecb);
+	err = PTR_ERR(ecb);
+	if (IS_ERR(ecb))
+		goto err_free_cmac;
+
+	ctx->cmac = cmac;
+	ctx->ecb = ecb;
+
+	reqsize = crypto_tfm_alg_alignmask(tfm) &
+		  ~(crypto_tfm_ctx_alignment() - 1);
+	reqsize += max(offsetof(struct heh_req_ctx, u.cmac.desc) +
+		       sizeof(struct shash_desc) +
+		       crypto_shash_descsize(cmac),
+		       offsetof(struct heh_req_ctx, u.ecb.req) +
+		       sizeof(struct ablkcipher_request) +
+		       crypto_ablkcipher_reqsize(ecb));
+	tfm->crt_ablkcipher.reqsize = reqsize;
+	return 0;
+
+err_free_cmac:
+	crypto_free_shash(cmac);
+	return err;
+}
+
+static void heh_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct heh_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	gf128mul_free_4k(ctx->tau_key);
+	crypto_free_shash(ctx->cmac);
+	crypto_free_ablkcipher(ctx->ecb);
+}
+
+static void heh_free_instance(struct crypto_instance *inst)
+{
+	struct heh_instance_ctx *ctx = crypto_instance_ctx(inst);
+
+	crypto_drop_shash(&ctx->cmac);
+	crypto_drop_skcipher(&ctx->ecb);
+	kfree(inst);
+}
+
+/*
+ * Create an instance of HEH as an ablkcipher.
+ *
+ * This relies on underlying CMAC and ECB algorithms, usually cmac(aes) and
+ * ecb(aes). For performance reasons we support asynchronous ECB algorithms.
+ * However, we do not yet support asynchronous CMAC algorithms because CMAC is
+ * only used on a small fixed amount of data per request, independent of the
+ * request length. This would change if AEAD or variable-length nonce support
+ * were to be exposed.
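+ *
+ * For example, instantiating "heh(aes)" grabs "cmac(aes)" and "ecb(aes)";
+ * depending on which implementations the crypto API selects, the resulting
+ * driver name might look like "heh_base(cmac(aes-generic),ecb-aes-ce)".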
+ */ +static int heh_create_common(struct crypto_template *tmpl, struct rtattr **tb, + const char *full_name, const char *cmac_name, + const char *ecb_name) +{ + struct crypto_attr_type *algt; + struct crypto_instance *inst; + struct heh_instance_ctx *ctx; + struct shash_alg *cmac; + struct crypto_alg *ecb; + int err; + + algt = crypto_get_attr_type(tb); + if (IS_ERR(algt)) + return PTR_ERR(algt); + + /* User must be asking for something compatible with ablkcipher */ + if ((algt->type ^ CRYPTO_ALG_TYPE_ABLKCIPHER) & algt->mask) + return -EINVAL; + + /* Allocate the ablkcipher instance */ + inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); + if (!inst) + return -ENOMEM; + + ctx = crypto_instance_ctx(inst); + + /* Set up the cmac and ecb spawns */ + + ctx->cmac.base.inst = inst; + err = crypto_grab_shash(&ctx->cmac, cmac_name, 0, CRYPTO_ALG_ASYNC); + if (err) + goto err_free_inst; + cmac = crypto_spawn_shash_alg(&ctx->cmac); + err = -EINVAL; + if (cmac->digestsize != HEH_BLOCK_SIZE) + goto err_drop_cmac; + + ctx->ecb.base.inst = inst; + err = crypto_grab_skcipher(&ctx->ecb, ecb_name, 0, + crypto_requires_sync(algt->type, + algt->mask)); + if (err) + goto err_drop_cmac; + ecb = crypto_skcipher_spawn_alg(&ctx->ecb); + + /* HEH only supports block ciphers with 16 byte block size */ + err = -EINVAL; + if (ecb->cra_blocksize != HEH_BLOCK_SIZE) + goto err_drop_ecb; + + /* The underlying "ECB" algorithm must not require an IV */ + err = -EINVAL; + if ((ecb->cra_flags & CRYPTO_ALG_TYPE_MASK) == CRYPTO_ALG_TYPE_BLKCIPHER) { + if (ecb->cra_blkcipher.ivsize != 0) + goto err_drop_ecb; + } else { + if (ecb->cra_ablkcipher.ivsize != 0) + goto err_drop_ecb; + } + + /* Set the instance names */ + err = -ENAMETOOLONG; + if (snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, + "heh_base(%s,%s)", cmac->base.cra_driver_name, + ecb->cra_driver_name) >= CRYPTO_MAX_ALG_NAME) + goto err_drop_ecb; + + err = -ENAMETOOLONG; + if (snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME, + "%s", full_name) >= CRYPTO_MAX_ALG_NAME) + goto err_drop_ecb; + + /* Finish initializing the instance */ + + inst->alg.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | + ((cmac->base.cra_flags | ecb->cra_flags) & + CRYPTO_ALG_ASYNC); + inst->alg.cra_blocksize = HEH_BLOCK_SIZE; + inst->alg.cra_ctxsize = sizeof(struct heh_tfm_ctx); + inst->alg.cra_alignmask = ecb->cra_alignmask | (__alignof__(be128) - 1); + inst->alg.cra_priority = ecb->cra_priority; + inst->alg.cra_type = &crypto_ablkcipher_type; + inst->alg.cra_init = heh_init_tfm; + inst->alg.cra_exit = heh_exit_tfm; + + inst->alg.cra_ablkcipher.setkey = heh_setkey; + inst->alg.cra_ablkcipher.encrypt = heh_encrypt; + inst->alg.cra_ablkcipher.decrypt = heh_decrypt; + if ((ecb->cra_flags & CRYPTO_ALG_TYPE_MASK) == CRYPTO_ALG_TYPE_BLKCIPHER) { + inst->alg.cra_ablkcipher.min_keysize = ecb->cra_blkcipher.min_keysize; + inst->alg.cra_ablkcipher.max_keysize = ecb->cra_blkcipher.max_keysize; + } else { + inst->alg.cra_ablkcipher.min_keysize = ecb->cra_ablkcipher.min_keysize; + inst->alg.cra_ablkcipher.max_keysize = ecb->cra_ablkcipher.max_keysize; + } + inst->alg.cra_ablkcipher.ivsize = HEH_BLOCK_SIZE; + + /* Register the instance */ + err = crypto_register_instance(tmpl, inst); + if (err) + goto err_drop_ecb; + return 0; + +err_drop_ecb: + crypto_drop_skcipher(&ctx->ecb); +err_drop_cmac: + crypto_drop_shash(&ctx->cmac); +err_free_inst: + kfree(inst); + return err; +} + +static int heh_create(struct crypto_template *tmpl, struct rtattr **tb) +{ + const char *cipher_name; + char 
full_name[CRYPTO_MAX_ALG_NAME]; + char cmac_name[CRYPTO_MAX_ALG_NAME]; + char ecb_name[CRYPTO_MAX_ALG_NAME]; + + /* Get the name of the requested block cipher (e.g. aes) */ + cipher_name = crypto_attr_alg_name(tb[1]); + if (IS_ERR(cipher_name)) + return PTR_ERR(cipher_name); + + if (snprintf(full_name, CRYPTO_MAX_ALG_NAME, "heh(%s)", cipher_name) >= + CRYPTO_MAX_ALG_NAME) + return -ENAMETOOLONG; + + if (snprintf(cmac_name, CRYPTO_MAX_ALG_NAME, "cmac(%s)", cipher_name) >= + CRYPTO_MAX_ALG_NAME) + return -ENAMETOOLONG; + + if (snprintf(ecb_name, CRYPTO_MAX_ALG_NAME, "ecb(%s)", cipher_name) >= + CRYPTO_MAX_ALG_NAME) + return -ENAMETOOLONG; + + return heh_create_common(tmpl, tb, full_name, cmac_name, ecb_name); +} + +static struct crypto_template heh_tmpl = { + .name = "heh", + .create = heh_create, + .free = heh_free_instance, + .module = THIS_MODULE, +}; + +static int heh_base_create(struct crypto_template *tmpl, struct rtattr **tb) +{ + char full_name[CRYPTO_MAX_ALG_NAME]; + const char *cmac_name; + const char *ecb_name; + + cmac_name = crypto_attr_alg_name(tb[1]); + if (IS_ERR(cmac_name)) + return PTR_ERR(cmac_name); + + ecb_name = crypto_attr_alg_name(tb[2]); + if (IS_ERR(ecb_name)) + return PTR_ERR(ecb_name); + + if (snprintf(full_name, CRYPTO_MAX_ALG_NAME, "heh_base(%s,%s)", + cmac_name, ecb_name) >= CRYPTO_MAX_ALG_NAME) + return -ENAMETOOLONG; + + return heh_create_common(tmpl, tb, full_name, cmac_name, ecb_name); +} + +/* + * If HEH is instantiated as "heh_base" instead of "heh", then specific + * implementations of cmac and ecb can be specified instead of just the cipher + */ +static struct crypto_template heh_base_tmpl = { + .name = "heh_base", + .create = heh_base_create, + .free = heh_free_instance, + .module = THIS_MODULE, +}; + +static int __init heh_module_init(void) +{ + int err; + + err = crypto_register_template(&heh_tmpl); + if (err) + return err; + + err = crypto_register_template(&heh_base_tmpl); + if (err) + goto out_undo_heh; + + return 0; + +out_undo_heh: + crypto_unregister_template(&heh_tmpl); + return err; +} + +static void __exit heh_module_exit(void) +{ + crypto_unregister_template(&heh_tmpl); + crypto_unregister_template(&heh_base_tmpl); +} + +module_init(heh_module_init); +module_exit(heh_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Hash-Encrypt-Hash block cipher mode"); +MODULE_ALIAS_CRYPTO("heh"); +MODULE_ALIAS_CRYPTO("heh_base"); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index ae8c57fd8bc7..f03ae20a5735 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -3196,6 +3196,21 @@ static const struct alg_test_desc alg_test_descs[] = { .count = GHASH_TEST_VECTORS } } + }, { + .alg = "heh(aes)", + .test = alg_test_skcipher, + .suite = { + .cipher = { + .enc = { + .vecs = aes_heh_enc_tv_template, + .count = AES_HEH_ENC_TEST_VECTORS + }, + .dec = { + .vecs = aes_heh_dec_tv_template, + .count = AES_HEH_DEC_TEST_VECTORS + } + } + } }, { .alg = "hmac(crc32)", .test = alg_test_hash, diff --git a/crypto/testmgr.h b/crypto/testmgr.h index da0a8fd765f4..97a523993bd8 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -14139,6 +14139,8 @@ static struct cipher_testvec cast6_xts_dec_tv_template[] = { #define AES_DEC_TEST_VECTORS 4 #define AES_CBC_ENC_TEST_VECTORS 5 #define AES_CBC_DEC_TEST_VECTORS 5 +#define AES_HEH_ENC_TEST_VECTORS 4 +#define AES_HEH_DEC_TEST_VECTORS 4 #define HMAC_MD5_ECB_CIPHER_NULL_ENC_TEST_VECTORS 2 #define HMAC_MD5_ECB_CIPHER_NULL_DEC_TEST_VECTORS 2 #define HMAC_SHA1_ECB_CIPHER_NULL_ENC_TEST_VEC 2 @@ -14511,6 +14513,198 @@ 
static struct cipher_testvec aes_dec_tv_template[] = { }, }; +static struct cipher_testvec aes_heh_enc_tv_template[] = { + { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + .klen = 16, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .input = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + .ilen = 16, + .result = "\xd8\xbd\x40\xbf\xca\xe5\xee\x81" + "\x0f\x3d\x1f\x1f\xae\x89\x07\x55", + .rlen = 16, + .also_non_np = 1, + .np = 2, + .tap = { 8, 8 }, + }, { + .key = "\xa8\xda\x24\x9b\x5e\xfa\x13\xc2" + "\xc1\x94\xbf\x32\xba\x38\xa3\x77", + .klen = 16, + .iv = "\x4d\x47\x61\x37\x2b\x47\x86\xf0" + "\xd6\x47\xb5\xc2\xe8\xcf\x85\x27", + .input = "\xb8\xee\x29\xe4\xa5\xd1\xe7\x55" + "\xd0\xfd\xe7\x22\x63\x76\x36\xe2" + "\xf8\x0c\xf8\xfe\x65\x76\xe7\xca" + "\xc1\x42\xf5\xca\x5a\xa8\xac\x2a", + .ilen = 32, + .result = "\x59\xf2\x78\x4e\x10\x94\xf9\x5c" + "\x22\x23\x78\x2a\x30\x48\x11\x97" + "\xb1\xfe\x70\xc4\xef\xdf\x04\xef" + "\x16\x39\x04\xcf\xc0\x95\x9a\x98", + .rlen = 32, + .also_non_np = 1, + .np = 3, + .tap = { 16, 13, 3 }, + }, { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + .klen = 16, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .input = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00", + .ilen = 63, + .result = "\xe0\x40\xeb\xe9\x52\xbe\x65\x60" + "\xe4\x68\x68\xa3\x73\x75\xb8\x52" + "\xef\x38\x6a\x87\x25\x25\xf6\x04" + "\xe5\x8e\xbe\x14\x8b\x02\x14\x1f" + "\xa9\x73\xb7\xad\x15\xbe\x9c\xa0" + "\xd2\x8a\x2c\xdc\xd4\xe3\x05\x55" + "\x0a\xf5\xf8\x51\xee\xe5\x62\xa5" + "\x71\xa7\x7c\x15\x5d\x7a\x9e", + .rlen = 63, + .also_non_np = 1, + .np = 8, + .tap = { 20, 20, 10, 8, 2, 1, 1, 1 }, + }, { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F" + "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F" + "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + .klen = 16, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .input = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x01" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00", + .ilen = 63, + .result = "\x4b\x1a\x15\xa0\xaf\x08\x6d\x70" + "\xf0\xa7\x97\xb5\x31\x4b\x8c\xc3" + "\x4d\xf2\x7a\x9d\xdd\xd4\x15\x99" + "\x57\xad\xc6\xb1\x35\x69\xf5\x6a" + "\x2d\x70\xe4\x97\x49\xb2\x9f\x71" + "\xde\x22\xb5\x70\x8c\x69\x24\xd3" + "\xad\x80\x58\x48\x90\xe4\xed\xba" + "\x76\x3d\x71\x7c\x57\x25\x87", + .rlen = 63, + .also_non_np = 1, + .np = 8, + .tap = { 20, 20, 10, 8, 2, 1, 1, 1 }, + } +}; + +static struct cipher_testvec aes_heh_dec_tv_template[] = { + { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + .klen = 16, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .input = "\xd8\xbd\x40\xbf\xca\xe5\xee\x81" + "\x0f\x3d\x1f\x1f\xae\x89\x07\x55", + .ilen = 16, + .result = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + .rlen = 16, + .also_non_np = 1, + .np = 2, + .tap = { 8, 8 }, 
+ }, { + .key = "\xa8\xda\x24\x9b\x5e\xfa\x13\xc2" + "\xc1\x94\xbf\x32\xba\x38\xa3\x77", + .klen = 16, + .iv = "\x4d\x47\x61\x37\x2b\x47\x86\xf0" + "\xd6\x47\xb5\xc2\xe8\xcf\x85\x27", + .input = "\x59\xf2\x78\x4e\x10\x94\xf9\x5c" + "\x22\x23\x78\x2a\x30\x48\x11\x97" + "\xb1\xfe\x70\xc4\xef\xdf\x04\xef" + "\x16\x39\x04\xcf\xc0\x95\x9a\x98", + .ilen = 32, + .result = "\xb8\xee\x29\xe4\xa5\xd1\xe7\x55" + "\xd0\xfd\xe7\x22\x63\x76\x36\xe2" + "\xf8\x0c\xf8\xfe\x65\x76\xe7\xca" + "\xc1\x42\xf5\xca\x5a\xa8\xac\x2a", + .rlen = 32, + .also_non_np = 1, + .np = 3, + .tap = { 16, 13, 3 }, + }, { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + .klen = 16, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .input = "\xe0\x40\xeb\xe9\x52\xbe\x65\x60" + "\xe4\x68\x68\xa3\x73\x75\xb8\x52" + "\xef\x38\x6a\x87\x25\x25\xf6\x04" + "\xe5\x8e\xbe\x14\x8b\x02\x14\x1f" + "\xa9\x73\xb7\xad\x15\xbe\x9c\xa0" + "\xd2\x8a\x2c\xdc\xd4\xe3\x05\x55" + "\x0a\xf5\xf8\x51\xee\xe5\x62\xa5" + "\x71\xa7\x7c\x15\x5d\x7a\x9e", + .ilen = 63, + .result = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00", + .rlen = 63, + .also_non_np = 1, + .np = 8, + .tap = { 20, 20, 10, 8, 2, 1, 1, 1 }, + }, { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F" + "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F" + "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + .klen = 16, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .input = "\x4b\x1a\x15\xa0\xaf\x08\x6d\x70" + "\xf0\xa7\x97\xb5\x31\x4b\x8c\xc3" + "\x4d\xf2\x7a\x9d\xdd\xd4\x15\x99" + "\x57\xad\xc6\xb1\x35\x69\xf5\x6a" + "\x2d\x70\xe4\x97\x49\xb2\x9f\x71" + "\xde\x22\xb5\x70\x8c\x69\x24\xd3" + "\xad\x80\x58\x48\x90\xe4\xed\xba" + "\x76\x3d\x71\x7c\x57\x25\x87", + .ilen = 63, + .result = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x01" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00", + .rlen = 63, + .also_non_np = 1, + .np = 8, + .tap = { 20, 20, 10, 8, 2, 1, 1, 1 }, + } +}; + static struct cipher_testvec aes_cbc_enc_tv_template[] = { { /* From RFC 3602 */ .key = "\x06\xa9\x21\x40\x36\xb8\xa1\x5b" From 58b9edb065b0b75a794f9b6f80e42e992bde72d2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 11 Jan 2017 10:36:41 -0800 Subject: [PATCH 0664/3220] ANDROID: crypto: heh - factor out poly_hash algorithm Factor most of poly_hash() out into its own keyed hash algorithm so that optimized architecture-specific implementations of it will be possible. For now we call poly_hash through the shash API, since HEH already had an example of using shash for another algorithm (CMAC), and we will not be adding any poly_hash implementations that require ahash yet. We can however switch to ahash later if it becomes useful. 
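For illustration, the factored-out algorithm is used through the regular
shash API like any other keyed hash. A rough sketch (error checking
omitted; tau_key, data and len stand for the caller's buffers, and the
16-byte key and digest sizes follow the definitions in this patch):

	struct crypto_shash *tfm = crypto_alloc_shash("poly_hash", 0, 0);
	SHASH_DESC_ON_STACK(desc, tfm);
	u8 digest[16];

	crypto_shash_setkey(tfm, tau_key, 16);
	desc->tfm = tfm;
	desc->flags = 0;
	crypto_shash_digest(desc, data, len, digest);
	crypto_free_shash(tfm);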
Bug: 32508661 Signed-off-by: Eric Biggers Change-Id: I8de54ddcecd1d7fa6e9842a09506a08129bae0b6 --- crypto/heh.c | 330 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 232 insertions(+), 98 deletions(-) diff --git a/crypto/heh.c b/crypto/heh.c index 48a284cecaa2..10c00aaf797e 100644 --- a/crypto/heh.c +++ b/crypto/heh.c @@ -37,13 +37,14 @@ struct heh_instance_ctx { struct crypto_shash_spawn cmac; + struct crypto_shash_spawn poly_hash; struct crypto_skcipher_spawn ecb; }; struct heh_tfm_ctx { struct crypto_shash *cmac; + struct crypto_shash *poly_hash; /* keyed with tau_key */ struct crypto_ablkcipher *ecb; - struct gf128mul_4k *tau_key; }; struct heh_cmac_data { @@ -63,6 +64,10 @@ struct heh_req_ctx { /* aligned to alignmask */ struct shash_desc desc; /* + crypto_shash_descsize(cmac) */ } cmac; + struct { + struct shash_desc desc; + /* + crypto_shash_descsize(poly_hash) */ + } poly_hash; struct { u8 keystream[HEH_BLOCK_SIZE]; u8 tmp[HEH_BLOCK_SIZE]; @@ -157,73 +162,138 @@ static int generate_betas(struct ablkcipher_request *req, return 0; } +/*****************************************************************************/ + /* - * Evaluation of a polynomial over GF(2^128) using Horner's rule. The - * polynomial is evaluated at 'point'. The polynomial's coefficients are taken - * from 'coeffs_sgl' and are for terms with consecutive descending degree ending - * at degree 1. 'bytes_of_coeffs' is 16 times the number of terms. + * This is the generic version of poly_hash. It does the GF(2^128) + * multiplication by 'tau_key' using a precomputed table, without using any + * special CPU instructions. On some platforms, an accelerated version (with + * higher cra_priority) may be used instead. */ -static be128 evaluate_polynomial(struct gf128mul_4k *point, - struct scatterlist *coeffs_sgl, - unsigned int bytes_of_coeffs) + +struct poly_hash_tfm_ctx { + struct gf128mul_4k *tau_key; +}; + +struct poly_hash_desc_ctx { + be128 digest; + unsigned int count; +}; + +static int poly_hash_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) { - be128 value = {0}; - struct sg_mapping_iter miter; - unsigned int remaining = bytes_of_coeffs; - unsigned int needed = 0; + struct poly_hash_tfm_ctx *tctx = crypto_shash_ctx(tfm); + be128 key128; - sg_miter_start(&miter, coeffs_sgl, sg_nents(coeffs_sgl), - SG_MITER_FROM_SG | SG_MITER_ATOMIC); - while (remaining) { - be128 coeff; - const u8 *src; - unsigned int srclen; - u8 *dst = (u8 *)&value; + if (keylen != HEH_BLOCK_SIZE) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } - /* - * Note: scatterlist elements are not necessarily evenly - * divisible into blocks, nor are they necessarily aligned to - * __alignof__(be128). 
- */ - sg_miter_next(&miter); + if (tctx->tau_key) + gf128mul_free_4k(tctx->tau_key); + memcpy(&key128, key, HEH_BLOCK_SIZE); + tctx->tau_key = gf128mul_init_4k_ble(&key128); + if (!tctx->tau_key) + return -ENOMEM; + return 0; +} - src = miter.addr; - srclen = min_t(unsigned int, miter.length, remaining); - remaining -= srclen; +static int poly_hash_init(struct shash_desc *desc) +{ + struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc); - if (needed) { - unsigned int n = min(srclen, needed); - u8 *pos = dst + (HEH_BLOCK_SIZE - needed); + ctx->digest = (be128) { 0 }; + ctx->count = 0; + return 0; +} - needed -= n; - srclen -= n; +static int poly_hash_update(struct shash_desc *desc, const u8 *src, + unsigned int len) +{ + struct poly_hash_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc); + unsigned int partial = ctx->count % HEH_BLOCK_SIZE; + u8 *dst = (u8 *)&ctx->digest + partial; - while (n--) - *pos++ ^= *src++; + ctx->count += len; - if (!needed) - gf128mul_4k_ble(&value, point); - } + /* Finishing at least one block? */ + if (partial + len >= HEH_BLOCK_SIZE) { - while (srclen >= HEH_BLOCK_SIZE) { - memcpy(&coeff, src, HEH_BLOCK_SIZE); - be128_xor(&value, &value, &coeff); - gf128mul_4k_ble(&value, point); - src += HEH_BLOCK_SIZE; - srclen -= HEH_BLOCK_SIZE; - } + if (partial) { + /* Finish the pending block. */ + unsigned int n = HEH_BLOCK_SIZE - partial; - if (srclen) { - needed = HEH_BLOCK_SIZE - srclen; + len -= n; do { *dst++ ^= *src++; - } while (--srclen); + } while (--n); + + gf128mul_4k_ble(&ctx->digest, tctx->tau_key); } + + /* Process zero or more full blocks. */ + while (len >= HEH_BLOCK_SIZE) { + be128 coeff; + + memcpy(&coeff, src, HEH_BLOCK_SIZE); + be128_xor(&ctx->digest, &ctx->digest, &coeff); + src += HEH_BLOCK_SIZE; + len -= HEH_BLOCK_SIZE; + gf128mul_4k_ble(&ctx->digest, tctx->tau_key); + } + dst = (u8 *)&ctx->digest; } - sg_miter_stop(&miter); - return value; + + /* Continue adding the next block to 'digest'. */ + while (len--) + *dst++ ^= *src++; + return 0; } +static int poly_hash_final(struct shash_desc *desc, u8 *out) +{ + struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc); + + /* Finish the last block if needed. 
*/ + if (ctx->count % HEH_BLOCK_SIZE) { + struct poly_hash_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + + gf128mul_4k_ble(&ctx->digest, tctx->tau_key); + } + + memcpy(out, &ctx->digest, HEH_BLOCK_SIZE); + return 0; +} + +static void poly_hash_exit(struct crypto_tfm *tfm) +{ + struct poly_hash_tfm_ctx *tctx = crypto_tfm_ctx(tfm); + + gf128mul_free_4k(tctx->tau_key); +} + +static struct shash_alg poly_hash_alg = { + .digestsize = HEH_BLOCK_SIZE, + .init = poly_hash_init, + .update = poly_hash_update, + .final = poly_hash_final, + .setkey = poly_hash_setkey, + .descsize = sizeof(struct poly_hash_desc_ctx), + .base = { + .cra_name = "poly_hash", + .cra_driver_name = "poly_hash-generic", + .cra_priority = 100, + .cra_ctxsize = sizeof(struct poly_hash_tfm_ctx), + .cra_exit = poly_hash_exit, + .cra_module = THIS_MODULE, + }, +}; + +/*****************************************************************************/ + /* * Split the message into 16 byte blocks, padding out the last block, and use * the blocks as coefficients in the evaluation of a polynomial over GF(2^128) @@ -242,18 +312,42 @@ static be128 evaluate_polynomial(struct gf128mul_4k *point, * N is the number of full blocks in the message * m_i is the i-th full block in the message for i = 0 to N-1 inclusive * m_N is the partial block of the message zero-padded up to 16 bytes + * + * Note that most of this is now separated out into its own keyed hash + * algorithm, to allow optimized implementations. However, we still handle the + * swapping of the last two coefficients here in the HEH template because this + * simplifies the poly_hash algorithms: they don't have to buffer an extra + * block, don't have to duplicate as much code, and are more similar to GHASH. */ -static be128 poly_hash(struct crypto_ablkcipher *tfm, struct scatterlist *sgl, - unsigned int len) +static int poly_hash(struct ablkcipher_request *req, struct scatterlist *sgl, + be128 *hash) { + struct heh_req_ctx *rctx = heh_req_ctx(req); + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); struct heh_tfm_ctx *ctx = crypto_ablkcipher_ctx(tfm); - unsigned int tail_offset = get_tail_offset(len); - unsigned int tail_len = len - tail_offset; - be128 hash; + struct shash_desc *desc = &rctx->u.poly_hash.desc; + unsigned int tail_offset = get_tail_offset(req->nbytes); + unsigned int tail_len = req->nbytes - tail_offset; be128 tail[2]; + unsigned int i, n; + struct sg_mapping_iter miter; + int err; + + desc->tfm = ctx->poly_hash; + desc->flags = req->base.flags; /* Handle all full blocks except the last */ - hash = evaluate_polynomial(ctx->tau_key, sgl, tail_offset); + err = crypto_shash_init(desc); + sg_miter_start(&miter, sgl, sg_nents(sgl), + SG_MITER_FROM_SG | SG_MITER_ATOMIC); + for (i = 0; i < tail_offset && !err; i += n) { + sg_miter_next(&miter); + n = min_t(unsigned int, miter.length, tail_offset - i); + err = crypto_shash_update(desc, miter.addr, n); + } + sg_miter_stop(&miter); + if (err) + return err; /* Handle the last full block and the partial block */ scatterwalk_map_and_copy(tail, sgl, tail_offset, tail_len, 0); @@ -261,11 +355,15 @@ static be128 poly_hash(struct crypto_ablkcipher *tfm, struct scatterlist *sgl, if (tail_len != HEH_BLOCK_SIZE) { /* handle the partial block */ memset((u8 *)tail + tail_len, 0, sizeof(tail) - tail_len); - be128_xor(&hash, &hash, &tail[1]); - gf128mul_4k_ble(&hash, ctx->tau_key); + err = crypto_shash_update(desc, (u8 *)&tail[1], HEH_BLOCK_SIZE); + if (err) + return err; } - be128_xor(&hash, &hash, &tail[0]); - return hash; + 
err = crypto_shash_final(desc, (u8 *)hash); + if (err) + return err; + be128_xor(hash, hash, &tail[0]); + return 0; } /* @@ -323,13 +421,14 @@ static int heh_tfm_blocks(struct ablkcipher_request *req, static int heh_hash(struct ablkcipher_request *req, const be128 *beta_key) { be128 hash; - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); unsigned int tail_offset = get_tail_offset(req->nbytes); unsigned int partial_len = req->nbytes % HEH_BLOCK_SIZE; int err; /* poly_hash() the full message including the partial block */ - hash = poly_hash(tfm, req->src, req->nbytes); + err = poly_hash(req, req->src, &hash); + if (err) + return err; /* Transform all full blocks except the last */ err = heh_tfm_blocks(req, req->src, req->dst, tail_offset, &hash, @@ -361,10 +460,8 @@ static int heh_hash_inv(struct ablkcipher_request *req, const be128 *beta_key) be128 tmp; struct scatterlist tmp_sgl[2]; struct scatterlist *tail_sgl; - unsigned int len = req->nbytes; - unsigned int tail_offset = get_tail_offset(len); + unsigned int tail_offset = get_tail_offset(req->nbytes); struct scatterlist *sgl = req->dst; - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); int err; /* @@ -388,7 +485,9 @@ static int heh_hash_inv(struct ablkcipher_request *req, const be128 *beta_key) */ memset(&tmp, 0, sizeof(tmp)); scatterwalk_map_and_copy(&tmp, tail_sgl, 0, HEH_BLOCK_SIZE, 1); - tmp = poly_hash(tfm, sgl, len); + err = poly_hash(req, sgl, &tmp); + if (err) + return err; be128_xor(&tmp, &tmp, &hash); scatterwalk_map_and_copy(&tmp, tail_sgl, 0, HEH_BLOCK_SIZE, 1); return 0; @@ -522,8 +621,6 @@ static int heh_ecb(struct ablkcipher_request *req, bool decrypt) static int heh_crypt(struct ablkcipher_request *req, bool decrypt) { - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct heh_tfm_ctx *ctx = crypto_ablkcipher_ctx(tfm); struct heh_req_ctx *rctx = heh_req_ctx(req); int err; @@ -531,9 +628,6 @@ static int heh_crypt(struct ablkcipher_request *req, bool decrypt) if (req->nbytes < HEH_BLOCK_SIZE) return -EINVAL; - /* Key must have been set */ - if (!ctx->tau_key) - return -ENOKEY; err = generate_betas(req, &rctx->beta1_key, &rctx->beta2_key); if (err) return err; @@ -602,11 +696,8 @@ static int heh_setkey(struct crypto_ablkcipher *parent, const u8 *key, memcpy(derived_keys + i, digest, HEH_BLOCK_SIZE); } - if (ctx->tau_key) - gf128mul_free_4k(ctx->tau_key); - err = -ENOMEM; - ctx->tau_key = gf128mul_init_4k_ble((const be128 *)derived_keys); - if (!ctx->tau_key) + err = crypto_shash_setkey(ctx->poly_hash, derived_keys, HEH_BLOCK_SIZE); + if (err) goto out; crypto_ablkcipher_clear_flags(ecb, CRYPTO_TFM_REQ_MASK); @@ -627,6 +718,7 @@ static int heh_init_tfm(struct crypto_tfm *tfm) struct heh_instance_ctx *ictx = crypto_instance_ctx(inst); struct heh_tfm_ctx *ctx = crypto_tfm_ctx(tfm); struct crypto_shash *cmac; + struct crypto_shash *poly_hash; struct crypto_ablkcipher *ecb; unsigned int reqsize; int err; @@ -635,25 +727,37 @@ static int heh_init_tfm(struct crypto_tfm *tfm) if (IS_ERR(cmac)) return PTR_ERR(cmac); + poly_hash = crypto_spawn_shash(&ictx->poly_hash); + err = PTR_ERR(poly_hash); + if (IS_ERR(poly_hash)) + goto err_free_cmac; + ecb = crypto_spawn_skcipher(&ictx->ecb); err = PTR_ERR(ecb); if (IS_ERR(ecb)) - goto err_free_cmac; + goto err_free_poly_hash; ctx->cmac = cmac; + ctx->poly_hash = poly_hash; ctx->ecb = ecb; reqsize = crypto_tfm_alg_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1); - reqsize += max(offsetof(struct heh_req_ctx, u.cmac.desc) + - sizeof(struct 
shash_desc) + - crypto_shash_descsize(cmac), - offsetof(struct heh_req_ctx, u.ecb.req) + - sizeof(struct ablkcipher_request) + - crypto_ablkcipher_reqsize(ecb)); + reqsize += max3(offsetof(struct heh_req_ctx, u.cmac.desc) + + sizeof(struct shash_desc) + + crypto_shash_descsize(cmac), + offsetof(struct heh_req_ctx, u.poly_hash.desc) + + sizeof(struct shash_desc) + + crypto_shash_descsize(poly_hash), + offsetof(struct heh_req_ctx, u.ecb.req) + + sizeof(struct ablkcipher_request) + + crypto_ablkcipher_reqsize(ecb)); tfm->crt_ablkcipher.reqsize = reqsize; + return 0; +err_free_poly_hash: + crypto_free_shash(poly_hash); err_free_cmac: crypto_free_shash(cmac); return err; @@ -663,8 +767,8 @@ static void heh_exit_tfm(struct crypto_tfm *tfm) { struct heh_tfm_ctx *ctx = crypto_tfm_ctx(tfm); - gf128mul_free_4k(ctx->tau_key); crypto_free_shash(ctx->cmac); + crypto_free_shash(ctx->poly_hash); crypto_free_ablkcipher(ctx->ecb); } @@ -673,6 +777,7 @@ static void heh_free_instance(struct crypto_instance *inst) struct heh_instance_ctx *ctx = crypto_instance_ctx(inst); crypto_drop_shash(&ctx->cmac); + crypto_drop_shash(&ctx->poly_hash); crypto_drop_skcipher(&ctx->ecb); kfree(inst); } @@ -689,12 +794,13 @@ static void heh_free_instance(struct crypto_instance *inst) */ static int heh_create_common(struct crypto_template *tmpl, struct rtattr **tb, const char *full_name, const char *cmac_name, - const char *ecb_name) + const char *poly_hash_name, const char *ecb_name) { struct crypto_attr_type *algt; struct crypto_instance *inst; struct heh_instance_ctx *ctx; struct shash_alg *cmac; + struct shash_alg *poly_hash; struct crypto_alg *ecb; int err; @@ -713,10 +819,9 @@ static int heh_create_common(struct crypto_template *tmpl, struct rtattr **tb, ctx = crypto_instance_ctx(inst); - /* Set up the cmac and ecb spawns */ - + /* Set up the cmac spawn */ ctx->cmac.base.inst = inst; - err = crypto_grab_shash(&ctx->cmac, cmac_name, 0, CRYPTO_ALG_ASYNC); + err = crypto_grab_shash(&ctx->cmac, cmac_name, 0, 0); if (err) goto err_free_inst; cmac = crypto_spawn_shash_alg(&ctx->cmac); @@ -724,12 +829,23 @@ static int heh_create_common(struct crypto_template *tmpl, struct rtattr **tb, if (cmac->digestsize != HEH_BLOCK_SIZE) goto err_drop_cmac; + /* Set up the poly_hash spawn */ + ctx->poly_hash.base.inst = inst; + err = crypto_grab_shash(&ctx->poly_hash, poly_hash_name, 0, 0); + if (err) + goto err_drop_cmac; + poly_hash = crypto_spawn_shash_alg(&ctx->poly_hash); + err = -EINVAL; + if (poly_hash->digestsize != HEH_BLOCK_SIZE) + goto err_drop_poly_hash; + + /* Set up the ecb spawn */ ctx->ecb.base.inst = inst; err = crypto_grab_skcipher(&ctx->ecb, ecb_name, 0, crypto_requires_sync(algt->type, algt->mask)); if (err) - goto err_drop_cmac; + goto err_drop_poly_hash; ecb = crypto_skcipher_spawn_alg(&ctx->ecb); /* HEH only supports block ciphers with 16 byte block size */ @@ -750,7 +866,8 @@ static int heh_create_common(struct crypto_template *tmpl, struct rtattr **tb, /* Set the instance names */ err = -ENAMETOOLONG; if (snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, - "heh_base(%s,%s)", cmac->base.cra_driver_name, + "heh_base(%s,%s,%s)", cmac->base.cra_driver_name, + poly_hash->base.cra_driver_name, ecb->cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_drop_ecb; @@ -762,8 +879,7 @@ static int heh_create_common(struct crypto_template *tmpl, struct rtattr **tb, /* Finish initializing the instance */ inst->alg.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | - ((cmac->base.cra_flags | ecb->cra_flags) & - CRYPTO_ALG_ASYNC); + 
(ecb->cra_flags & CRYPTO_ALG_ASYNC); inst->alg.cra_blocksize = HEH_BLOCK_SIZE; inst->alg.cra_ctxsize = sizeof(struct heh_tfm_ctx); inst->alg.cra_alignmask = ecb->cra_alignmask | (__alignof__(be128) - 1); @@ -792,6 +908,8 @@ static int heh_create_common(struct crypto_template *tmpl, struct rtattr **tb, err_drop_ecb: crypto_drop_skcipher(&ctx->ecb); +err_drop_poly_hash: + crypto_drop_shash(&ctx->poly_hash); err_drop_cmac: crypto_drop_shash(&ctx->cmac); err_free_inst: @@ -823,7 +941,8 @@ static int heh_create(struct crypto_template *tmpl, struct rtattr **tb) CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; - return heh_create_common(tmpl, tb, full_name, cmac_name, ecb_name); + return heh_create_common(tmpl, tb, full_name, cmac_name, "poly_hash", + ecb_name); } static struct crypto_template heh_tmpl = { @@ -837,26 +956,34 @@ static int heh_base_create(struct crypto_template *tmpl, struct rtattr **tb) { char full_name[CRYPTO_MAX_ALG_NAME]; const char *cmac_name; + const char *poly_hash_name; const char *ecb_name; cmac_name = crypto_attr_alg_name(tb[1]); if (IS_ERR(cmac_name)) return PTR_ERR(cmac_name); - ecb_name = crypto_attr_alg_name(tb[2]); + poly_hash_name = crypto_attr_alg_name(tb[2]); + if (IS_ERR(poly_hash_name)) + return PTR_ERR(poly_hash_name); + + ecb_name = crypto_attr_alg_name(tb[3]); if (IS_ERR(ecb_name)) return PTR_ERR(ecb_name); - if (snprintf(full_name, CRYPTO_MAX_ALG_NAME, "heh_base(%s,%s)", - cmac_name, ecb_name) >= CRYPTO_MAX_ALG_NAME) + if (snprintf(full_name, CRYPTO_MAX_ALG_NAME, "heh_base(%s,%s,%s)", + cmac_name, poly_hash_name, ecb_name) >= + CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; - return heh_create_common(tmpl, tb, full_name, cmac_name, ecb_name); + return heh_create_common(tmpl, tb, full_name, cmac_name, poly_hash_name, + ecb_name); } /* * If HEH is instantiated as "heh_base" instead of "heh", then specific - * implementations of cmac and ecb can be specified instead of just the cipher + * implementations of cmac, poly_hash, and ecb can be specified instead of just + * the cipher. */ static struct crypto_template heh_base_tmpl = { .name = "heh_base", @@ -877,8 +1004,14 @@ static int __init heh_module_init(void) if (err) goto out_undo_heh; + err = crypto_register_shash(&poly_hash_alg); + if (err) + goto out_undo_heh_base; + return 0; +out_undo_heh_base: + crypto_unregister_template(&heh_base_tmpl); out_undo_heh: crypto_unregister_template(&heh_tmpl); return err; @@ -888,6 +1021,7 @@ static void __exit heh_module_exit(void) { crypto_unregister_template(&heh_tmpl); crypto_unregister_template(&heh_base_tmpl); + crypto_unregister_shash(&poly_hash_alg); } module_init(heh_module_init); From 0223de3a24eff401c8eafb27055ad5fc290f2808 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 10 Jan 2017 18:32:19 -0800 Subject: [PATCH 0665/3220] ANDROID: arm64/crypto: add ARMv8-CE optimized poly_hash algorithm poly_hash is part of the HEH (Hash-Encrypt-Hash) encryption mode, proposed in Internet Draft https://tools.ietf.org/html/draft-cope-heh-01. poly_hash is very similar to GHASH; besides the swapping of the last two coefficients which we opted to handle in the HEH template, poly_hash just uses a different finite field representation. As with GHASH, poly_hash becomes much faster and more secure against timing attacks when implemented using carryless multiplication instructions instead of tables. This patch adds an ARMv8-CE optimized version of poly_hash, based roughly on the existing ARMv8-CE optimized version of GHASH. 
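As background for the multiplication strategy in the assembly (standard
Karatsuba splitting, not something new in this patch): writing the two
128-bit polynomials as a = a1*x^64 + a0 and b = b1*x^64 + b0, the 256-bit
carryless product is formed from three 64x64-bit PMULLs instead of four:

	a*b = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64 + a0*b0

where + is XOR. This is why the code precomputes KEY2 = (b1+b0):(b1+b0)
from the hash key.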
Benchmark results are shown below, but note that the resistance to timing attacks may be even more important than the performance gain. poly_hash only: poly_hash-generic: 1,000,000 setkey() takes 1185 ms hashing is 328 MB/s poly_hash-ce: 1,000,000 setkey() takes 8 ms hashing is 1756 MB/s heh(aes) with 4096-byte inputs (this is the ideal case, as the improvement is less significant with smaller inputs): encryption with "heh_base(cmac(aes-ce),poly_hash-generic,ecb-aes-ce)": 118 MB/s decryption with "heh_base(cmac(aes-ce),poly_hash-generic,ecb-aes-ce)": 120 MB/s encryption with "heh_base(cmac(aes-ce),poly_hash-ce,ecb-aes-ce)": 291 MB/s decryption with "heh_base(cmac(aes-ce),poly_hash-ce,ecb-aes-ce)": 293 MB/s Bug: 32508661 Signed-off-by: Eric Biggers Change-Id: I621ec0e1115df7e6f5cbd7e864a4a9d8d2e94cf2 --- arch/arm64/crypto/Kconfig | 5 + arch/arm64/crypto/Makefile | 3 + arch/arm64/crypto/poly-hash-ce-core.S | 163 +++++++++++++++++++++++++ arch/arm64/crypto/poly-hash-ce-glue.c | 166 ++++++++++++++++++++++++++ crypto/Kconfig | 1 + 5 files changed, 338 insertions(+) create mode 100644 arch/arm64/crypto/poly-hash-ce-core.S create mode 100644 arch/arm64/crypto/poly-hash-ce-glue.c diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 2cf32e9887e1..de1aab4b5da8 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -23,6 +23,11 @@ config CRYPTO_GHASH_ARM64_CE depends on ARM64 && KERNEL_MODE_NEON select CRYPTO_HASH +config CRYPTO_POLY_HASH_ARM64_CE + tristate "poly_hash (for HEH encryption mode) using ARMv8 Crypto Extensions" + depends on ARM64 && KERNEL_MODE_NEON + select CRYPTO_HASH + config CRYPTO_AES_ARM64_CE tristate "AES core cipher using ARMv8 Crypto Extensions" depends on ARM64 && KERNEL_MODE_NEON diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index abb79b3cfcfe..f0a8f2475ea3 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -17,6 +17,9 @@ sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o +obj-$(CONFIG_CRYPTO_POLY_HASH_ARM64_CE) += poly-hash-ce.o +poly-hash-ce-y := poly-hash-ce-glue.o poly-hash-ce-core.o + obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto diff --git a/arch/arm64/crypto/poly-hash-ce-core.S b/arch/arm64/crypto/poly-hash-ce-core.S new file mode 100644 index 000000000000..8ccb544c5526 --- /dev/null +++ b/arch/arm64/crypto/poly-hash-ce-core.S @@ -0,0 +1,163 @@ +/* + * Accelerated poly_hash implementation with ARMv8 PMULL instructions. + * + * Based on ghash-ce-core.S. + * + * Copyright (C) 2014 Linaro Ltd. + * Copyright (C) 2017 Google, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. 
+ */ + +#include +#include + + KEY .req v0 + KEY2 .req v1 + T1 .req v2 + T2 .req v3 + GSTAR .req v4 + XL .req v5 + XM .req v6 + XH .req v7 + + .text + .arch armv8-a+crypto + + /* 16-byte aligned (2**4 = 16); not required, but might as well */ + .align 4 +.Lgstar: + .quad 0x87, 0x87 + +/* + * void pmull_poly_hash_update(le128 *digest, const le128 *key, + * const u8 *src, unsigned int blocks, + * unsigned int partial); + */ +ENTRY(pmull_poly_hash_update) + + /* Load digest into XL */ + ld1 {XL.16b}, [x0] + + /* Load key into KEY */ + ld1 {KEY.16b}, [x1] + + /* Load g*(x) = g(x) + x^128 = x^7 + x^2 + x + 1 into both halves of + * GSTAR */ + adr x1, .Lgstar + ld1 {GSTAR.2d}, [x1] + + /* Set KEY2 to (KEY[1]+KEY[0]):(KEY[1]+KEY[0]). This is needed for + * Karatsuba multiplication. */ + ext KEY2.16b, KEY.16b, KEY.16b, #8 + eor KEY2.16b, KEY2.16b, KEY.16b + + /* If 'partial' is nonzero, then we're finishing a pending block and + * should go right to the multiplication. */ + cbnz w4, 1f + +0: + /* Add the next block from 'src' to the digest */ + ld1 {T1.16b}, [x2], #16 + eor XL.16b, XL.16b, T1.16b + sub w3, w3, #1 + +1: + /* + * Multiply the current 128-bit digest (a1:a0, in XL) by the 128-bit key + * (b1:b0, in KEY) using Karatsuba multiplication. + */ + + /* T1 = (a1+a0):(a1+a0) */ + ext T1.16b, XL.16b, XL.16b, #8 + eor T1.16b, T1.16b, XL.16b + + /* XH = a1 * b1 */ + pmull2 XH.1q, XL.2d, KEY.2d + + /* XL = a0 * b0 */ + pmull XL.1q, XL.1d, KEY.1d + + /* XM = (a1+a0) * (b1+b0) */ + pmull XM.1q, T1.1d, KEY2.1d + + /* XM += (XH[0]:XL[1]) + XL + XH */ + ext T1.16b, XL.16b, XH.16b, #8 + eor T2.16b, XL.16b, XH.16b + eor XM.16b, XM.16b, T1.16b + eor XM.16b, XM.16b, T2.16b + + /* + * Now the 256-bit product is in XH[1]:XM:XL[0]. It represents a + * polynomial over GF(2) with degree as large as 255. We need to + * compute its remainder modulo g(x) = x^128+x^7+x^2+x+1. For this it + * is sufficient to compute the remainder of the high half 'c(x)x^128' + * add it to the low half. To reduce the high half we use the Barrett + * reduction method. The basic idea is that we can express the + * remainder p(x) as g(x)q(x) mod x^128, where q(x) = (c(x)x^128)/g(x). + * As detailed in [1], to avoid having to divide by g(x) at runtime the + * following equivalent expression can be derived: + * + * p(x) = [ g*(x)((c(x)q+(x))/x^128) ] mod x^128 + * + * where g*(x) = x^128+g(x) = x^7+x^2+x+1, and q+(x) = x^256/g(x) = g(x) + * in this case. This is also equivalent to: + * + * p(x) = [ g*(x)((c(x)(x^128 + g*(x)))/x^128) ] mod x^128 + * = [ g*(x)(c(x) + (c(x)g*(x))/x^128) ] mod x^128 + * + * Since deg g*(x) < 64: + * + * p(x) = [ g*(x)(c(x) + ((c(x)/x^64)g*(x))/x^64) ] mod x^128 + * = [ g*(x)((c(x)/x^64)x^64 + (c(x) mod x^64) + + * ((c(x)/x^64)g*(x))/x^64) ] mod x^128 + * + * Letting t(x) = g*(x)(c(x)/x^64): + * + * p(x) = [ t(x)x^64 + g*(x)((c(x) mod x^64) + t(x)/x^64) ] mod x^128 + * + * Therefore, to do the reduction we only need to issue two 64-bit => + * 128-bit carryless multiplications: g*(x) times c(x)/x^64, and g*(x) + * times ((c(x) mod x^64) + t(x)/x^64). (Multiplication by x^64 doesn't + * count since it is simply a shift or move.) + * + * An alternate reduction method, also based on Barrett reduction and + * described in [1], uses only shifts and XORs --- no multiplications. + * However, the method with multiplications requires fewer instructions + * and is faster on processors with fast carryless multiplication. 
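+ *
+ * (Tying this to the code: g*(x) = x^7 + x^2 + x + 1 is the constant 0x87
+ * that .Lgstar loads into both halves of GSTAR above, and the two
+ * multiplications by g*(x) described here are the two pmull2 instructions
+ * against GSTAR.2d below.)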
+ * + * [1] "Intel Carry-Less Multiplication Instruction and its Usage for + * Computing the GCM Mode", + * https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf + */ + + /* 256-bit product is XH[1]:XM:XL[0], so c(x) is XH[1]:XM[1] */ + + /* T1 = t(x) = g*(x)(c(x)/x^64) */ + pmull2 T1.1q, GSTAR.2d, XH.2d + + /* T2 = g*(x)((c(x) mod x^64) + t(x)/x^64) */ + eor T2.16b, XM.16b, T1.16b + pmull2 T2.1q, GSTAR.2d, T2.2d + + /* Make XL[0] be the low half of the 128-bit result by adding the low 64 + * bits of the T2 term to what was already there. The 't(x)x^64' term + * makes no difference, so skip it. */ + eor XL.16b, XL.16b, T2.16b + + /* Make XL[1] be the high half of the 128-bit result by adding the high + * 64 bits of the 't(x)x^64' and T2 terms to what was already in XM[0], + * then moving XM[0] to XL[1]. */ + eor XM.16b, XM.16b, T1.16b + ext T2.16b, T2.16b, T2.16b, #8 + eor XM.16b, XM.16b, T2.16b + mov XL.d[1], XM.d[0] + + /* If more blocks remain, then loop back to process the next block; + * else, store the digest and return. */ + cbnz w3, 0b + st1 {XL.16b}, [x0] + ret +ENDPROC(pmull_poly_hash_update) diff --git a/arch/arm64/crypto/poly-hash-ce-glue.c b/arch/arm64/crypto/poly-hash-ce-glue.c new file mode 100644 index 000000000000..e195740c9ecf --- /dev/null +++ b/arch/arm64/crypto/poly-hash-ce-glue.c @@ -0,0 +1,166 @@ +/* + * Accelerated poly_hash implementation with ARMv8 PMULL instructions. + * + * Based on ghash-ce-glue.c. + * + * poly_hash is part of the HEH (Hash-Encrypt-Hash) encryption mode, proposed in + * Internet Draft https://tools.ietf.org/html/draft-cope-heh-01. + * + * poly_hash is very similar to GHASH: both algorithms are keyed hashes which + * interpret their input data as coefficients of a polynomial over GF(2^128), + * then calculate a hash value by evaluating that polynomial at the point given + * by the key, e.g. using Horner's rule. The difference is that poly_hash uses + * the more natural "ble" convention to represent GF(2^128) elements, whereas + * GHASH uses the less natural "lle" convention (see include/crypto/gf128mul.h). + * The ble convention makes it simpler to implement GF(2^128) multiplication. + * + * Copyright (C) 2014 Linaro Ltd. + * Copyright (C) 2017 Google Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Note: in this algorithm we currently use 'le128' to represent GF(2^128) + * elements, even though poly_hash-generic uses 'be128'. Both types are + * actually "wrong" because the elements are actually in 'ble' format, and there + * should be a ble type to represent this --- as well as lle, bbe, and lbe types + * for the other conventions for representing GF(2^128) elements. But + * practically it doesn't matter which type we choose here, so we just use le128 + * since it's arguably more accurate, while poly_hash-generic still has to use + * be128 because the generic GF(2^128) multiplication functions all take be128. 
+ */ + +struct poly_hash_desc_ctx { + le128 digest; + unsigned int count; +}; + +asmlinkage void pmull_poly_hash_update(le128 *digest, const le128 *key, + const u8 *src, unsigned int blocks, + unsigned int partial); + +static int poly_hash_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + if (keylen != sizeof(le128)) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + memcpy(crypto_shash_ctx(tfm), key, sizeof(le128)); + return 0; +} + +static int poly_hash_init(struct shash_desc *desc) +{ + struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->digest = (le128) { 0 }; + ctx->count = 0; + return 0; +} + +static int poly_hash_update(struct shash_desc *desc, const u8 *src, + unsigned int len) +{ + struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc); + unsigned int partial = ctx->count % sizeof(le128); + u8 *dst = (u8 *)&ctx->digest + partial; + + ctx->count += len; + + /* Finishing at least one block? */ + if (partial + len >= sizeof(le128)) { + const le128 *key = crypto_shash_ctx(desc->tfm); + + if (partial) { + /* Finish the pending block. */ + unsigned int n = sizeof(le128) - partial; + + len -= n; + do { + *dst++ ^= *src++; + } while (--n); + } + + /* + * Do the real work. If 'partial' is nonzero, this starts by + * multiplying 'digest' by 'key'. Then for each additional full + * block it adds the block to 'digest' and multiplies by 'key'. + */ + kernel_neon_begin_partial(8); + pmull_poly_hash_update(&ctx->digest, key, src, + len / sizeof(le128), partial); + kernel_neon_end(); + + src += len - (len % sizeof(le128)); + len %= sizeof(le128); + dst = (u8 *)&ctx->digest; + } + + /* Continue adding the next block to 'digest'. */ + while (len--) + *dst++ ^= *src++; + return 0; +} + +static int poly_hash_final(struct shash_desc *desc, u8 *out) +{ + struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc); + unsigned int partial = ctx->count % sizeof(le128); + + /* Finish the last block if needed. 
*/ + if (partial) { + const le128 *key = crypto_shash_ctx(desc->tfm); + + kernel_neon_begin_partial(8); + pmull_poly_hash_update(&ctx->digest, key, NULL, 0, partial); + kernel_neon_end(); + } + + memcpy(out, &ctx->digest, sizeof(le128)); + return 0; +} + +static struct shash_alg poly_hash_alg = { + .digestsize = sizeof(le128), + .init = poly_hash_init, + .update = poly_hash_update, + .final = poly_hash_final, + .setkey = poly_hash_setkey, + .descsize = sizeof(struct poly_hash_desc_ctx), + .base = { + .cra_name = "poly_hash", + .cra_driver_name = "poly_hash-ce", + .cra_priority = 300, + .cra_ctxsize = sizeof(le128), + .cra_module = THIS_MODULE, + }, +}; + +static int __init poly_hash_ce_mod_init(void) +{ + return crypto_register_shash(&poly_hash_alg); +} + +static void __exit poly_hash_ce_mod_exit(void) +{ + crypto_unregister_shash(&poly_hash_alg); +} + +MODULE_DESCRIPTION("Polynomial evaluation hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Eric Biggers "); +MODULE_LICENSE("GPL v2"); + +module_cpu_feature_match(PMULL, poly_hash_ce_mod_init); +module_exit(poly_hash_ce_mod_exit); diff --git a/crypto/Kconfig b/crypto/Kconfig index 627227f1162d..3240d394426c 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -295,6 +295,7 @@ config CRYPTO_HEH select CRYPTO_ECB select CRYPTO_GF128MUL select CRYPTO_MANAGER + select CRYPTO_POLY_HASH_ARM64_CE if ARM64 && KERNEL_MODE_NEON help HEH: Hash-Encrypt-Hash mode HEH is a proposed block cipher mode of operation which extends the From 3e0dd6ec69beb4748a2fc93b5140da2248693736 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 10 Jan 2017 17:02:39 -0800 Subject: [PATCH 0666/3220] ANDROID: ext4: allow encrypting filenames using HEH algorithm Update ext4 encryption to allow filenames to be encrypted using the Hash-Encrypt-Hash (HEH) block cipher mode of operation, which is believed to be more secure than CBC, particularly within the constant initialization vector (IV) constraint of filename encryption. Notably, HEH avoids the "common prefix" problem of CBC. Both algorithms use AES-256 as the underlying block cipher and take a 256-bit key. We assign mode number 126 to HEH, just below 127 (EXT4_ENCRYPTION_MODE_PRIVATE) which in some kernels is reserved for inline encryption on MSM chipsets. Note that these modes are not yet upstream, which is why these numbers are being used; it's preferable to avoid collisions with modes that may be added upstream. Also, although HEH is not hardware-specific, we aren't currently reserving mode number 5 for HEH upstream, since for now we are tying HEH to the new key derivation method which might become an independent flag upstream, and there's also a chance that details of HEH will change after it gets wider review. 
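Editor's note: for concreteness, this is how a userspace policy tool would select the new mode. A hedged sketch only: the policy struct layout and ioctl number are assumptions about the pre-fscrypt ext4 ABI of this kernel generation and should be checked against the target tree's ext4.h; the mode value 126 is the one assigned in the diff below.

#include <string.h>
#include <sys/ioctl.h>

/* Assumed pre-fscrypt ext4 policy ABI of this era */
struct ext4_encryption_policy {
    char version;
    char contents_encryption_mode;
    char filenames_encryption_mode;
    char flags;
    char master_key_descriptor[8];
} __attribute__((__packed__));

#define EXT4_IOC_SET_ENCRYPTION_POLICY \
    _IOR('f', 19, struct ext4_encryption_policy)

/* Ask for HEH filename encryption (mode 126) on an empty directory */
static int set_heh_policy(int dirfd, const char key_desc[8])
{
    struct ext4_encryption_policy p = {
        .version = 0,
        .contents_encryption_mode = 1,    /* AES-256-XTS */
        .filenames_encryption_mode = 126, /* AES-256-HEH, this patch */
        .flags = 0,
    };

    memcpy(p.master_key_descriptor, key_desc, 8);
    return ioctl(dirfd, EXT4_IOC_SET_ENCRYPTION_POLICY, &p);
}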
Bug: 32975945 Signed-off-by: Eric Biggers Change-Id: I81418709d47da0e0ac607ae3f91088063c2d5dd4 --- fs/ext4/Kconfig | 1 + fs/ext4/crypto_fname.c | 3 ++- fs/ext4/crypto_key.c | 3 +++ fs/ext4/ext4.h | 1 + fs/ext4/ext4_crypto.h | 3 +++ 5 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index b46e9fc64196..3c8293215603 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -106,6 +106,7 @@ config EXT4_ENCRYPTION select CRYPTO_ECB select CRYPTO_XTS select CRYPTO_CTS + select CRYPTO_HEH select CRYPTO_CTR select CRYPTO_SHA256 select KEYS diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 2fbef8a14760..e2645ca9b95e 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -44,7 +44,8 @@ static void ext4_dir_crypt_complete(struct crypto_async_request *req, int res) bool ext4_valid_filenames_enc_mode(uint32_t mode) { - return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS); + return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS || + mode == EXT4_ENCRYPTION_MODE_AES_256_HEH); } static unsigned max_name_len(struct inode *inode) diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index c5882b36e558..3600dbf4e971 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -182,6 +182,9 @@ retry: case EXT4_ENCRYPTION_MODE_AES_256_CTS: cipher_str = "cts(cbc(aes))"; break; + case EXT4_ENCRYPTION_MODE_AES_256_HEH: + cipher_str = "heh(aes)"; + break; default: printk_once(KERN_WARNING "ext4: unsupported key mode %d (ino %u)\n", diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5cf6d8be48dd..bd135cfe7927 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -588,6 +588,7 @@ enum { #define EXT4_ENCRYPTION_MODE_AES_256_GCM 2 #define EXT4_ENCRYPTION_MODE_AES_256_CBC 3 #define EXT4_ENCRYPTION_MODE_AES_256_CTS 4 +#define EXT4_ENCRYPTION_MODE_AES_256_HEH 126 #include "ext4_crypto.h" diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index ac7d4e813796..41080095b1b7 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -60,6 +60,7 @@ struct ext4_encryption_context { #define EXT4_AES_256_GCM_KEY_SIZE 32 #define EXT4_AES_256_CBC_KEY_SIZE 32 #define EXT4_AES_256_CTS_KEY_SIZE 32 +#define EXT4_AES_256_HEH_KEY_SIZE 32 #define EXT4_AES_256_XTS_KEY_SIZE 64 #define EXT4_MAX_KEY_SIZE 64 @@ -121,6 +122,8 @@ static inline int ext4_encryption_key_size(int mode) return EXT4_AES_256_CBC_KEY_SIZE; case EXT4_ENCRYPTION_MODE_AES_256_CTS: return EXT4_AES_256_CTS_KEY_SIZE; + case EXT4_ENCRYPTION_MODE_AES_256_HEH: + return EXT4_AES_256_HEH_KEY_SIZE; default: BUG(); } From a425a70b2627c5d429c875619d9d42a39bf0f7dd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 10 Jan 2017 17:02:40 -0800 Subject: [PATCH 0667/3220] ANDROID: ext4: add a non-reversible key derivation method Add a new per-file key derivation method to ext4 encryption defined as: derived_key[0:127] = AES-256-ENCRYPT(master_key[0:255], nonce) derived_key[128:255] = AES-256-ENCRYPT(master_key[0:255], nonce ^ 0x01) derived_key[256:383] = AES-256-ENCRYPT(master_key[256:511], nonce) derived_key[384:511] = AES-256-ENCRYPT(master_key[256:511], nonce ^ 0x01) ... where the derived key and master key are both 512 bits, the nonce is 128 bits, AES-256-ENCRYPT takes the arguments (key, plaintext), and 'nonce ^ 0x01' denotes flipping the low order bit of the last byte. The existing key derivation method is 'derived_key = AES-128-ECB-ENCRYPT(key=nonce, plaintext=master_key)'. 
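Editor's note: spelled out as code, the v2 derivation is just four single-block encryptions. A minimal sketch of the formulas above, assuming a hypothetical one-block primitive aes256_encrypt_block(key, in, out); the actual kernel implementation, built on crypto_cipher_encrypt_one(), appears in the diff further below.

#include <stdint.h>
#include <string.h>

/* Hypothetical primitive: out = AES-256-ENCRYPT(key, in), one 16-byte block */
void aes256_encrypt_block(const uint8_t key[32], const uint8_t in[16],
                          uint8_t out[16]);

static void derive_key_v2(const uint8_t nonce[16], const uint8_t master[64],
                          uint8_t derived[64])
{
    uint8_t nonce1[16];

    memcpy(nonce1, nonce, 16);
    nonce1[15] ^= 0x01;                  /* flip low bit of the last byte */

    aes256_encrypt_block(master,      nonce,  derived);      /* [0:127]   */
    aes256_encrypt_block(master,      nonce1, derived + 16); /* [128:255] */
    aes256_encrypt_block(master + 32, nonce,  derived + 32); /* [256:383] */
    aes256_encrypt_block(master + 32, nonce1, derived + 48); /* [384:511] */
}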
We want to make this change because currently, given a derived key you can easily compute the master key by computing 'AES-128-ECB-DECRYPT(key=nonce, ciphertext=derived_key)'. This was formerly OK because the previous threat model assumed that the master key and derived keys are equally hard to obtain by an attacker. However, we are looking to move the master key into secure hardware in some cases, so we want to make sure that an attacker with access to a derived key cannot compute the master key. We are doing this instead of increasing the nonce to 512 bits because it's important that the per-file xattr fit in the inode itself. By default, inodes are 256 bytes, and on Android we're already pretty close to that limit. If we increase the nonce size, we end up allocating a new filesystem block for each and every encrypted file, which has a substantial performance and disk utilization impact. Another option considered was to use the HMAC-SHA512 of the nonce, keyed by the master key. However this would be a little less performant, would be less extensible to other key sizes and MAC algorithms, and would pull in a dependency (security-wise and code-wise) on SHA-512. Due to the use of "aes" rather than "ecb(aes)" in the implementation, the new key derivation method is actually about twice as fast as the old one, though the old one could be optimized similarly as well. This patch makes the new key derivation method be used whenever HEH is used to encrypt filenames. Although these two features are logically independent, it was decided to bundle them together for now. Note that neither feature is upstream yet, and it cannot be guaranteed that the on-disk format won't change if/when these features are upstreamed. For this reason, and as noted in the previous patch, the features are both behind a special mode number for now. Signed-off-by: Eric Biggers Change-Id: Iee4113f57e59dc8c0b7dc5238d7003c83defb986 --- fs/ext4/crypto_key.c | 98 +++++++++++++++++++++++++++++++++++++++---- fs/ext4/ext4_crypto.h | 1 + 2 files changed, 92 insertions(+), 7 deletions(-) diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 3600dbf4e971..776c51beff35 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -29,16 +29,16 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc) } /** - * ext4_derive_key_aes() - Derive a key using AES-128-ECB + * ext4_derive_key_v1() - Derive a key using AES-128-ECB * @deriving_key: Encryption key used for derivation. * @source_key: Source key to which to apply derivation. * @derived_key: Derived key. * - * Return: Zero on success; non-zero otherwise. 
+ * Return: 0 on success, -errno on failure */ -static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], - char source_key[EXT4_AES_256_XTS_KEY_SIZE], - char derived_key[EXT4_AES_256_XTS_KEY_SIZE]) +static int ext4_derive_key_v1(const char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], + const char source_key[EXT4_AES_256_XTS_KEY_SIZE], + char derived_key[EXT4_AES_256_XTS_KEY_SIZE]) { int res = 0; struct ablkcipher_request *req = NULL; @@ -83,6 +83,91 @@ out: return res; } +/** + * ext4_derive_key_v2() - Derive a key non-reversibly + * @nonce: the nonce associated with the file + * @master_key: the master key referenced by the file + * @derived_key: (output) the resulting derived key + * + * This function computes the following: + * derived_key[0:127] = AES-256-ENCRYPT(master_key[0:255], nonce) + * derived_key[128:255] = AES-256-ENCRYPT(master_key[0:255], nonce ^ 0x01) + * derived_key[256:383] = AES-256-ENCRYPT(master_key[256:511], nonce) + * derived_key[384:511] = AES-256-ENCRYPT(master_key[256:511], nonce ^ 0x01) + * + * 'nonce ^ 0x01' denotes flipping the low order bit of the last byte. + * + * Unlike the v1 algorithm, the v2 algorithm is "non-reversible", meaning that + * compromising a derived key does not also compromise the master key. + * + * Return: 0 on success, -errno on failure + */ +static int ext4_derive_key_v2(const char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE], + const char master_key[EXT4_MAX_KEY_SIZE], + char derived_key[EXT4_MAX_KEY_SIZE]) +{ + const int noncelen = EXT4_KEY_DERIVATION_NONCE_SIZE; + struct crypto_cipher *tfm; + int err; + int i; + + /* + * Since we only use each transform for a small number of encryptions, + * requesting just "aes" turns out to be significantly faster than + * "ecb(aes)", by about a factor of two. + */ + tfm = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + BUILD_BUG_ON(4 * EXT4_KEY_DERIVATION_NONCE_SIZE != EXT4_MAX_KEY_SIZE); + BUILD_BUG_ON(2 * EXT4_AES_256_ECB_KEY_SIZE != EXT4_MAX_KEY_SIZE); + for (i = 0; i < 2; i++) { + memcpy(derived_key, nonce, noncelen); + memcpy(derived_key + noncelen, nonce, noncelen); + derived_key[2 * noncelen - 1] ^= 0x01; + err = crypto_cipher_setkey(tfm, master_key, + EXT4_AES_256_ECB_KEY_SIZE); + if (err) + break; + crypto_cipher_encrypt_one(tfm, derived_key, derived_key); + crypto_cipher_encrypt_one(tfm, derived_key + noncelen, + derived_key + noncelen); + master_key += EXT4_AES_256_ECB_KEY_SIZE; + derived_key += 2 * noncelen; + } + crypto_free_cipher(tfm); + return err; +} + +/** + * ext4_derive_key() - Derive a per-file key from a nonce and master key + * @ctx: the encryption context associated with the file + * @master_key: the master key referenced by the file + * @derived_key: (output) the resulting derived key + * + * Return: 0 on success, -errno on failure + */ +static int ext4_derive_key(const struct ext4_encryption_context *ctx, + const char master_key[EXT4_MAX_KEY_SIZE], + char derived_key[EXT4_MAX_KEY_SIZE]) +{ + BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE != EXT4_KEY_DERIVATION_NONCE_SIZE); + BUILD_BUG_ON(EXT4_AES_256_XTS_KEY_SIZE != EXT4_MAX_KEY_SIZE); + + /* + * Although the key derivation algorithm is logically independent of the + * choice of encryption modes, in this kernel it is bundled with HEH + * encryption of filenames, which is another crypto improvement that + * requires an on-disk format change and requires userspace to specify + * different encryption policies. 
+ */ + if (ctx->filenames_encryption_mode == EXT4_ENCRYPTION_MODE_AES_256_HEH) + return ext4_derive_key_v2(ctx->nonce, master_key, derived_key); + else + return ext4_derive_key_v1(ctx->nonce, master_key, derived_key); +} + void ext4_free_crypt_info(struct ext4_crypt_info *ci) { if (!ci) @@ -231,8 +316,7 @@ retry: res = -ENOKEY; goto out; } - res = ext4_derive_key_aes(ctx.nonce, master_key->raw, - raw_key); + res = ext4_derive_key(&ctx, master_key->raw, raw_key); if (res) goto out; got_key: diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index 41080095b1b7..5a1684bd083b 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -58,6 +58,7 @@ struct ext4_encryption_context { #define EXT4_XTS_TWEAK_SIZE 16 #define EXT4_AES_128_ECB_KEY_SIZE 16 #define EXT4_AES_256_GCM_KEY_SIZE 32 +#define EXT4_AES_256_ECB_KEY_SIZE 32 #define EXT4_AES_256_CBC_KEY_SIZE 32 #define EXT4_AES_256_CTS_KEY_SIZE 32 #define EXT4_AES_256_HEH_KEY_SIZE 32 From 8b75db9857a49502a9b7156433f1bc57451805c1 Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Mon, 13 Feb 2017 09:22:36 -0800 Subject: [PATCH 0668/3220] ANDROID: ext4 crypto: Disables zeroing on truncation when there's no key When performing orphan cleanup on mount, ext4 may truncate pages. Truncation as currently implemented may require the encryption key for partial zeroing, and the key isn't necessarily available on mount. Since the userspace tools don't perform the partial zeroing operation anyway, let's just skip doing that in the kernel. This patch fixes a BUG_ON() oops. Bug: 35209576 Change-Id: I2527a3f8d2c57d2de5df03fda69ee397f76095d7 Signed-off-by: Michael Halcrow --- fs/ext4/inode.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 07037262337a..e2cb00640d1e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3557,6 +3557,11 @@ static int ext4_block_truncate_page(handle_t *handle, unsigned blocksize; struct inode *inode = mapping->host; + /* If we are processing an encrypted inode during orphan list + * handling */ + if (ext4_encrypted_inode(inode) && !ext4_has_encryption_key(inode)) + return 0; + blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); From 56026a89e632af0cf45602dee1b2881bf21c4eba Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 14 Feb 2017 20:47:17 -0800 Subject: [PATCH 0669/3220] ANDROID: sdcardfs: Fix incorrect hash This adds back the hash calculation removed as part of the previous patch, as it is in fact necessary. 
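Editor's note on why the hash is necessary: the qstr hash selects the bucket in the dentry hashtable, so a hand-assembled qstr whose hash field is left zero sends d_lookup() to the wrong bucket, the existing dentry is missed, and d_alloc() then creates a duplicate. The pattern the diff below restores, as a sketch:

struct qstr dname;

dname.name = name->name;
dname.len  = name->len;
/* must be the same hash function the dcache used at insert time */
dname.hash = full_name_hash(dname.name, dname.len);

lower_dentry = d_lookup(lower_dir_dentry, &dname);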
Signed-off-by: Daniel Rosenberg Bug: 35307857 Change-Id: Ie607332bcf2c5d2efdf924e4060ef3f576bf25dc --- fs/sdcardfs/lookup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 9135866b7766..6b595e892316 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -221,6 +221,7 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, struct dentry *lower_dentry; const struct qstr *name; struct path lower_path; + struct qstr dname; struct sdcardfs_sb_info *sbi; sbi = SDCARDFS_SB(dentry->d_sb); @@ -306,11 +307,14 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, goto out; /* instatiate a new negative dentry */ - lower_dentry = d_lookup(lower_dir_dentry, name); + dname.name = name->name; + dname.len = name->len; + dname.hash = full_name_hash(dname.name, dname.len); + lower_dentry = d_lookup(lower_dir_dentry, &dname); if (lower_dentry) goto setup_lower; - lower_dentry = d_alloc(lower_dir_dentry, name); + lower_dentry = d_alloc(lower_dir_dentry, &dname); if (!lower_dentry) { err = -ENOMEM; goto out; From 1914b2934b3eb3def0105f5be0b1b98c4169f9ec Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 16 Feb 2017 17:55:22 -0800 Subject: [PATCH 0670/3220] ANDROID: sdcardfs: Add missing path_put "ANDROID: sdcardfs: Add GID Derivation to sdcardfs" introduced an unbalanced path_get, leading to storage space not being freed after deleting a file until rebooting. This adds the missing path_put. Signed-off-by: Daniel Rosenberg Bug: 34691169 Change-Id: Ia7ef97ec2eca2c555cc06b235715635afc87940e --- fs/sdcardfs/derived_perm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 0bb442338a85..ca239a942065 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -243,6 +243,7 @@ retry_deleg: if (error) pr_err("sdcardfs: Failed to touch up lower fs gid/uid.\n"); } + sdcardfs_put_lower_path(dentry, &path); } static int descendant_may_need_fixup(struct sdcardfs_inode_info *info, struct limit_search *limit) { From 880c68e41dd2eab7bdbc64c8061e5b7e0e269cd3 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 22 Feb 2017 14:41:58 -0800 Subject: [PATCH 0671/3220] ANDROID: sdcardfs: Don't bother deleting freelist There is no point deleting entries from dlist, as that is a temporary list on the stack which contains only entries that are being deleted. Not all code paths set up dlist, so those that don't were performing invalid accesses in hash_del_rcu. As an additional means to prevent any other issue, we null out the list entries when we allocate from the cache.
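Editor's note on why initializing the nodes closes the hole: hash_del_rcu() is a thin wrapper around hlist_del_init_rcu(), which starts with an unhashed test (quoted from rculist.h of this era):

static inline void hlist_del_init_rcu(struct hlist_node *n)
{
    if (!hlist_unhashed(n)) {    /* i.e. n->pprev != NULL */
        __hlist_del(n);
        n->pprev = NULL;
    }
}

Memory fresh from kmem_cache_alloc() is not zeroed, so a garbage pprev makes the test pass and sends __hlist_del() through slab junk; after the INIT_HLIST_NODE() calls added in the diff below, the same call degenerates to a harmless no-op.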
Signed-off-by: Daniel Rosenberg Bug: 35666680 Change-Id: Ibb1e28c08c3a600c29418d39ba1c0f3db3bf31e5 --- fs/sdcardfs/packagelist.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index d96fcde041cc..56d643f4a9ee 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -178,6 +178,8 @@ static struct hashtable_entry *alloc_hashtable_entry(const struct qstr *key, GFP_KERNEL); if (!ret) return NULL; + INIT_HLIST_NODE(&ret->dlist); + INIT_HLIST_NODE(&ret->hlist); if (!qstr_copy(key, &ret->key)) { kmem_cache_free(hashtable_entry_cachep, ret); @@ -326,7 +328,6 @@ static int insert_userid_exclude_entry(const struct qstr *key, userid_t value) static void free_hashtable_entry(struct hashtable_entry *entry) { kfree(entry->key.name); - hash_del_rcu(&entry->dlist); kmem_cache_free(hashtable_entry_cachep, entry); } From 166f168a48d69ae0715983324bc80df8ff2aa4a4 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 24 Feb 2017 15:41:48 -0800 Subject: [PATCH 0672/3220] ANDROID: sdcardfs: implement vm_ops->page_mkwrite This comes from the wrapfs patch 3dfec0ffe5e2 Wrapfs: implement vm_ops->page_mkwrite Some file systems (e.g., ext4) require it. Reported by Ted Ts'o. Signed-off-by: Erez Zadok Signed-off-by: Daniel Rosenberg Bug: 34133558 Change-Id: I1a389b2422c654a6d3046bb8ec3e20511aebfa8e --- fs/sdcardfs/mmap.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/fs/sdcardfs/mmap.c b/fs/sdcardfs/mmap.c index e21f64675a80..7dd715875ef1 100644 --- a/fs/sdcardfs/mmap.c +++ b/fs/sdcardfs/mmap.c @@ -48,6 +48,39 @@ static int sdcardfs_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return err; } +static int sdcardfs_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + int err = 0; + struct file *file, *lower_file; + const struct vm_operations_struct *lower_vm_ops; + struct vm_area_struct lower_vma; + + memcpy(&lower_vma, vma, sizeof(struct vm_area_struct)); + file = lower_vma.vm_file; + lower_vm_ops = SDCARDFS_F(file)->lower_vm_ops; + BUG_ON(!lower_vm_ops); + if (!lower_vm_ops->page_mkwrite) + goto out; + + lower_file = sdcardfs_lower_file(file); + /* + * XXX: vm_ops->page_mkwrite may be called in parallel. + * Because we have to resort to temporarily changing the + * vma->vm_file to point to the lower file, a concurrent + * invocation of sdcardfs_page_mkwrite could see a different + * value. In this workaround, we keep a different copy of the + * vma structure in our stack, so we never expose a different + * value of the vma->vm_file called to us, even temporarily. + * A better fix would be to change the calling semantics of + * ->page_mkwrite to take an explicit file pointer. 
+ */ + lower_vma.vm_file = lower_file; + err = lower_vm_ops->page_mkwrite(&lower_vma, vmf); +out: + return err; +} + static ssize_t sdcardfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) { @@ -78,4 +111,5 @@ const struct address_space_operations sdcardfs_aops = { const struct vm_operations_struct sdcardfs_vm_ops = { .fault = sdcardfs_fault, + .page_mkwrite = sdcardfs_page_mkwrite, }; From 98c307e68c2887263a0c4321b54c402abf5ef7fc Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 24 Feb 2017 15:49:45 -0800 Subject: [PATCH 0673/3220] ANDROID: sdcardfs: support direct-IO (DIO) operations This comes from the wrapfs patch 2e346c83b26e Wrapfs: support direct-IO (DIO) operations Signed-off-by: Li Mengyang Signed-off-by: Erez Zadok Signed-off-by: Daniel Rosenberg Bug: 34133558 Change-Id: I3fd779c510ab70d56b1d918f99c20421b524cdc4 --- fs/sdcardfs/mmap.c | 21 ++++----------------- fs/sdcardfs/sdcardfs.h | 1 + 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/fs/sdcardfs/mmap.c b/fs/sdcardfs/mmap.c index 7dd715875ef1..0d4089c62c3a 100644 --- a/fs/sdcardfs/mmap.c +++ b/fs/sdcardfs/mmap.c @@ -85,27 +85,14 @@ static ssize_t sdcardfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) { /* - * This function returns zero on purpose in order to support direct IO. - * __dentry_open checks a_ops->direct_IO and returns EINVAL if it is null. - * - * However, this function won't be called by certain file operations - * including generic fs functions. * reads and writes are delivered to - * the lower file systems and the direct IOs will be handled by them. - * - * NOTE: exceptionally, on the recent kernels (since Linux 3.8.x), - * swap_writepage invokes this function directly. + * This function should never be called directly. We need it + * to exist, to get past a check in open_check_o_direct(), + * which is called from do_last(). */ - printk(KERN_INFO "%s, operation is not supported\n", __func__); - return 0; + return -EINVAL; } -/* - * XXX: the default address_space_ops for sdcardfs is empty. We cannot set - * our inode->i_mapping->a_ops to NULL because too many code paths expect - * the a_ops vector to be non-NULL. - */ const struct address_space_operations sdcardfs_aops = { - /* empty on purpose */ .direct_IO = sdcardfs_direct_IO, }; diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index f3cced313108..042f989f0bea 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include From 6696986a932a28b17858279c5eb69f83d566d4de Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 17 Jun 2013 18:36:56 +0100 Subject: [PATCH 0674/3220] cpufreq: interactive governor drops bits in time calculation Keep time calculation in 64-bit throughout. If we have long times between idle calculations this can result in deltas > 32 bits which causes incorrect load percentage calculations and selecting the wrong frequencies if we truncate here. 
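Editor's note: the failure mode is easy to demonstrate in isolation. A standalone sketch (the units assume the microsecond-granularity counters this governor samples, which is an assumption, not shown above):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* two idle samples taken two hours apart, in microseconds */
    uint64_t then = 1000000ULL;
    uint64_t now  = then + 7200ULL * 1000000ULL;

    unsigned int bad  = (unsigned int)(now - then); /* high bits dropped */
    uint64_t     good = now - then;

    /* prints: bad=2905032704 good=7200000000; anything past 2^32
     * units (about 71.6 minutes at 1 us per unit) silently wraps */
    printf("bad=%u good=%llu\n", bad, (unsigned long long)good);
    return 0;
}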
Signed-off-by: Chris Redpath --- drivers/cpufreq/cpufreq_interactive.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index f2929e628820..889c9b8b2237 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -312,13 +312,13 @@ static u64 update_load(int cpu) pcpu->policy->governor_data; u64 now; u64 now_idle; - unsigned int delta_idle; - unsigned int delta_time; + u64 delta_idle; + u64 delta_time; u64 active_time; now_idle = get_cpu_idle_time(cpu, &now, tunables->io_is_busy); - delta_idle = (unsigned int)(now_idle - pcpu->time_in_idle); - delta_time = (unsigned int)(now - pcpu->time_in_idle_timestamp); + delta_idle = (now_idle - pcpu->time_in_idle); + delta_time = (now - pcpu->time_in_idle_timestamp); if (delta_time <= delta_idle) active_time = 0; From 6e1c2455aa286eec7579615883fb5ea7ecac8853 Mon Sep 17 00:00:00 2001 From: Anson Jacob Date: Thu, 17 Nov 2016 02:32:40 -0500 Subject: [PATCH 0675/3220] ANDROID: usb: gadget: function: Fix commenting style Fix checkpatch.pl warning: Block comments use * on subsequent lines Change-Id: I9c92f128fdb3aeeb6ab9c7039e11f857bebb9539 Signed-off-by: Anson Jacob --- drivers/usb/gadget/function/f_accessory.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index 9d3ec0e37475..f2fa0c271d70 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -676,9 +676,10 @@ static ssize_t acc_write(struct file *fp, const char __user *buf, req->zero = 0; } else { xfer = count; - /* If the data length is a multple of the + /* + * If the data length is a multple of the * maxpacket size then send a zero length packet(ZLP). - */ + */ req->zero = ((xfer % dev->ep_in->maxpacket) == 0); } if (copy_from_user(req->buf, buf, xfer)) { @@ -820,11 +821,11 @@ int acc_ctrlrequest(struct usb_composite_dev *cdev, unsigned long flags; /* - printk(KERN_INFO "acc_ctrlrequest " - "%02x.%02x v%04x i%04x l%u\n", - b_requestType, b_request, - w_value, w_index, w_length); -*/ + * printk(KERN_INFO "acc_ctrlrequest " + * "%02x.%02x v%04x i%04x l%u\n", + * b_requestType, b_request, + * w_value, w_index, w_length); + */ if (b_requestType == (USB_DIR_OUT | USB_TYPE_VENDOR)) { if (b_request == ACCESSORY_START) { From f04805218728192d346291c1a0e78cb184142bad Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Mon, 14 Nov 2016 09:48:02 -0800 Subject: [PATCH 0676/3220] ANDROID: dm: android-verity: fix table_make_digest() error handling If table_make_digest() fails, verify_verity_signature() would try to pass the returned ERR_PTR() to kfree(). 
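Editor's note on the underlying rule: kfree(NULL) is defined to be a no-op, but an ERR_PTR() cookie is a small negative address that kfree() will happily try to interpret, so error paths that funnel into a shared kfree() must normalize the pointer first. A generic sketch of the pattern the fix below applies (make_object() and consume() are hypothetical):

static int use_object(int arg)
{
    struct obj *p;
    int ret = 0;

    p = make_object(arg);      /* returns a pointer or ERR_PTR(-errno) */
    if (IS_ERR(p)) {
        ret = PTR_ERR(p);      /* keep the real error code */
        p = NULL;              /* shared kfree(p) below becomes a no-op */
        goto err;
    }
    ret = consume(p);
err:
    kfree(p);                  /* safe for NULL, fatal for ERR_PTR() */
    return ret;
}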
This fixes the smatch error: drivers/md/dm-android-verity.c:601 verify_verity_signature() error: 'pks' dereferencing possible ERR_PTR() Change-Id: I9b9b7764b538cb4a5f94337660e9b0f149b139be Signed-off-by: Greg Hackmann --- drivers/md/dm-android-verity.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index bb6c1285e499..ec0a4d19ca3e 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -585,6 +585,8 @@ static int verify_verity_signature(char *key_id, if (IS_ERR(pks)) { DMERR("hashing failed"); + retval = PTR_ERR(pks); + pks = NULL; goto error; } From 16f66bfe88211599834fa7109117553835ab4690 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Thu, 10 Nov 2016 19:36:15 -0700 Subject: [PATCH 0677/3220] nf: IDLETIMER: Fix use after free condition during work schedule_work(&timer->work) appears to be called after cancel_work_sync(&info->timer->work) is completed. Work can be scheduled from the PM_POST_SUSPEND notification event even after cancel_work_sync is called. Call stack -004|notify_netlink_uevent( | [X19] timer = 0xFFFFFFC0A5DFC780 -> ( | ... | [NSD:0xFFFFFFC0A5DFC800] kobj = 0x6B6B6B6B6B6B6B6B, | [NSD:0xFFFFFFC0A5DFC868] timeout = 0x6B6B6B6B, | [NSD:0xFFFFFFC0A5DFC86C] refcnt = 0x6B6B6B6B, | [NSD:0xFFFFFFC0A5DFC870] work_pending = 0x6B, | [NSD:0xFFFFFFC0A5DFC871] send_nl_msg = 0x6B, | [NSD:0xFFFFFFC0A5DFC872] active = 0x6B, | [NSD:0xFFFFFFC0A5DFC874] uid = 0x6B6B6B6B, | [NSD:0xFFFFFFC0A5DFC878] suspend_time_valid = 0x6B)) -005|idletimer_tg_work( -006|__read_once_size(inline) -006|static_key_count(inline) -006|static_key_false(inline) -006|trace_workqueue_execute_end(inline) -006|process_one_work( -007|worker_thread( -008|kthread( -009|ret_from_fork(asm) ---|end of frame Force any pending idletimer_tg_work() to complete before freeing the associated work struct and after unregistering to the pm_notifier callback. Change-Id: I4c5f0a1c142f7d698c092cf7bcafdb0f9fbaa9c1 Signed-off-by: Subash Abhinov Kasiviswanathan --- net/netfilter/xt_IDLETIMER.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 0975c993a94e..ada5a304e61e 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -456,6 +456,7 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) del_timer_sync(&info->timer->timer); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); unregister_pm_notifier(&info->timer->pm_nb); + cancel_work_sync(&info->timer->work); kfree(info->timer->attr.attr.name); kfree(info->timer); } else { From 33af4c0c5faa2ae101db8701d1000512a27dfdb7 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 2 Nov 2016 11:56:40 -0600 Subject: [PATCH 0678/3220] nf: IDLETIMER: Use fullsock when querying uid sock_i_uid() acquires the sk_callback_lock which does not exist for sockets in TCP_NEW_SYN_RECV state. This results in errors showing up as spinlock bad magic. Fix this by looking for the full sock as suggested by Eric. 
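Editor's sketch of the shape of the fix (the call stack quoted below shows the offending path): during the handshake, skb->sk can be a miniature request sock in TCP_NEW_SYN_RECV state, which has no sk_callback_lock for sock_i_uid() to take; skb_to_full_sk() maps such an skb back to the full listener sock, which is safe to query. record_waker_uid() is a hypothetical wrapper around the hunk in the diff below:

static void record_waker_uid(struct idletimer_tg *timer,
                             const struct sk_buff *skb)
{
    /* skb_to_full_sk() returns the listener for NEW_SYN_RECV skbs
     * and the sock itself otherwise */
    struct sock *sk = skb ? skb_to_full_sk(skb) : NULL;

    if (sk)
        timer->uid = from_kuid_munged(current_user_ns(),
                                      sock_i_uid(sk));
}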
Callstack for reference - -003|rwlock_bug -004|arch_read_lock -004|do_raw_read_lock -005|raw_read_lock_bh -006|sock_i_uid -007|from_kuid_munged(inline) -007|reset_timer -008|idletimer_tg_target -009|ipt_do_table -010|iptable_mangle_hook -011|nf_iterate -012|nf_hook_slow -013|NF_HOOK_COND(inline) -013|ip_output -014|ip_local_out -015|ip_build_and_send_pkt -016|tcp_v4_send_synack -017|atomic_sub_return(inline) -017|reqsk_put(inline) -017|tcp_conn_request -018|tcp_v4_conn_request -019|tcp_rcv_state_process -020|tcp_v4_do_rcv -021|tcp_v4_rcv -022|ip_local_deliver_finish -023|NF_HOOK_THRESH(inline) -023|NF_HOOK(inline) -023|ip_local_deliver -024|ip_rcv_finish -025|NF_HOOK_THRESH(inline) -025|NF_HOOK(inline) -025|ip_rcv -026|deliver_skb(inline) -026|deliver_ptype_list_skb(inline) -026|__netif_receive_skb_core -027|__netif_receive_skb -028|netif_receive_skb_internal -029|netif_receive_skb Change-Id: Ic8f3a3d2d7af31434d1163b03971994e2125d552 Signed-off-by: Subash Abhinov Kasiviswanathan Cc: Eric Dumazet --- net/netfilter/xt_IDLETIMER.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index ada5a304e61e..f11aa28b96ce 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -49,6 +49,7 @@ #include #include #include +#include struct idletimer_tg_attr { struct attribute attr; @@ -355,7 +356,7 @@ static void reset_timer(const struct idletimer_tg_info *info, /* Stores the uid resposible for waking up the radio */ if (skb && (skb->sk)) { timer->uid = from_kuid_munged(current_user_ns(), - sock_i_uid(skb->sk)); + sock_i_uid(skb_to_full_sk(skb))); } /* checks if there is a pending inactive notification*/ From 359795138dc5440e09c58025e28ec1b38d648c09 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Tue, 7 Mar 2017 15:51:18 +0100 Subject: [PATCH 0679/3220] binder: use group leader instead of open thread The binder allocator assumes that the thread that called binder_open will never die for the lifetime of that proc. That thread is normally the group_leader, however it may not be. Use the group_leader instead of current. Bug: 35707103 Test: Created test case to open with temporary thread Change-Id: Id693f74b3591f3524a8c6e9508e70f3e5a80c588 Signed-off-by: Todd Kjos Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index e6af2e819980..08cde76875d2 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3359,7 +3359,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) const char *failure_string; struct binder_buffer *buffer; - if (proc->tsk != current) + if (proc->tsk != current->group_leader) return -EINVAL; if ((vma->vm_end - vma->vm_start) > SZ_4M) @@ -3461,8 +3461,8 @@ static int binder_open(struct inode *nodp, struct file *filp) proc = kzalloc(sizeof(*proc), GFP_KERNEL); if (proc == NULL) return -ENOMEM; - get_task_struct(current); - proc->tsk = current; + get_task_struct(current->group_leader); + proc->tsk = current->group_leader; INIT_LIST_HEAD(&proc->todo); init_waitqueue_head(&proc->wait); proc->default_priority = task_nice(current); From 0fd0992d350402a927408caf20726307993e0010 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Tue, 7 Mar 2017 15:54:56 +0100 Subject: [PATCH 0680/3220] android: binder: add padding to binder_fd_array_object. 
binder_fd_array_object starts with a 4-byte header, followed by a few fields that are 8 bytes when ANDROID_BINDER_IPC_32BIT=N. This can cause alignment issues in a 64-bit kernel with a 32-bit userspace, as on x86_32 an 8-byte primitive may be aligned to a 4-byte address. Pad with a __u32 to fix this. Change-Id: I4374ed2cc3ccd3c6a1474cb7209b53ebfd91077b Signed-off-by: Martijn Coenen --- include/uapi/linux/android/binder.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 51f891fb1b18..7668b5791c91 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -132,6 +132,7 @@ enum { /* struct binder_fd_array_object - object describing an array of fds in a buffer * @hdr: common header structure + * @pad: padding to ensure correct alignment * @num_fds: number of file descriptors in the buffer * @parent: index in offset array to buffer holding the fd array * @parent_offset: start offset of fd array in the buffer @@ -152,6 +153,7 @@ enum { */ struct binder_fd_array_object { struct binder_object_header hdr; + __u32 pad; binder_size_t num_fds; binder_size_t parent; binder_size_t parent_offset; From d6bbb3276728e4a13c832c55f948a40a27bc4a33 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 30 Sep 2016 16:40:04 +0200 Subject: [PATCH 0681/3220] android: binder: move global binder state into context struct. This change moves all global binder state into the context struct, thereby completely separating the state and the locks between two different contexts. The debugfs entries remain global, printing entries from all contexts. Change-Id: If8e3e2bece7bc6f974b66fbcf1d91d529ffa62f0 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 392 ++++++++++++++++++++++++++------------- 1 file changed, 259 insertions(+), 133 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 08cde76875d2..9cf4f9bbc711 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -18,6 +18,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -46,19 +47,11 @@ #include #include "binder_trace.h" -static DEFINE_MUTEX(binder_main_lock); -static DEFINE_MUTEX(binder_deferred_lock); -static DEFINE_MUTEX(binder_mmap_lock); - static HLIST_HEAD(binder_devices); -static HLIST_HEAD(binder_procs); -static HLIST_HEAD(binder_deferred_list); -static HLIST_HEAD(binder_dead_nodes); static struct dentry *binder_debugfs_dir_entry_root; static struct dentry *binder_debugfs_dir_entry_proc; -static int binder_last_id; -static struct workqueue_struct *binder_deferred_workqueue; +atomic_t binder_last_id; #define BINDER_DEBUG_ENTRY(name) \ static int binder_##name##_open(struct inode *inode, struct file *file) \ @@ -173,20 +166,24 @@ enum binder_stat_types { struct binder_stats { int br[_IOC_NR(BR_FAILED_REPLY) + 1]; int bc[_IOC_NR(BC_REPLY_SG) + 1]; - int obj_created[BINDER_STAT_COUNT]; - int obj_deleted[BINDER_STAT_COUNT]; }; -static struct binder_stats binder_stats; +/* These are still global, since it's not always easy to get the context */ +struct binder_obj_stats { + atomic_t obj_created[BINDER_STAT_COUNT]; + atomic_t obj_deleted[BINDER_STAT_COUNT]; +}; + +static struct binder_obj_stats binder_obj_stats; static inline void binder_stats_deleted(enum binder_stat_types type) { - binder_stats.obj_deleted[type]++; + atomic_inc(&binder_obj_stats.obj_deleted[type]); } static inline void binder_stats_created(enum binder_stat_types type) { - 
binder_stats.obj_created[type]++; + atomic_inc(&binder_obj_stats.obj_created[type]); } struct binder_transaction_log_entry { @@ -207,8 +204,6 @@ struct binder_transaction_log { int full; struct binder_transaction_log_entry entry[32]; }; -static struct binder_transaction_log binder_transaction_log; -static struct binder_transaction_log binder_transaction_log_failed; static struct binder_transaction_log_entry *binder_transaction_log_add( struct binder_transaction_log *log) @@ -229,6 +224,21 @@ struct binder_context { struct binder_node *binder_context_mgr_node; kuid_t binder_context_mgr_uid; const char *name; + + struct mutex binder_main_lock; + struct mutex binder_deferred_lock; + struct mutex binder_mmap_lock; + + struct hlist_head binder_procs; + struct hlist_head binder_dead_nodes; + struct hlist_head binder_deferred_list; + + struct work_struct deferred_work; + struct workqueue_struct *binder_deferred_workqueue; + struct binder_transaction_log transaction_log; + struct binder_transaction_log transaction_log_failed; + + struct binder_stats binder_stats; }; struct binder_device { @@ -451,17 +461,18 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd) return retval; } -static inline void binder_lock(const char *tag) +static inline void binder_lock(struct binder_context *context, const char *tag) { trace_binder_lock(tag); - mutex_lock(&binder_main_lock); + mutex_lock(&context->binder_main_lock); trace_binder_locked(tag); } -static inline void binder_unlock(const char *tag) +static inline void binder_unlock(struct binder_context *context, + const char *tag) { trace_binder_unlock(tag); - mutex_unlock(&binder_main_lock); + mutex_unlock(&context->binder_main_lock); } static void binder_set_nice(long nice) @@ -946,7 +957,7 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, binder_stats_created(BINDER_STAT_NODE); rb_link_node(&node->rb_node, parent, p); rb_insert_color(&node->rb_node, &proc->nodes); - node->debug_id = ++binder_last_id; + node->debug_id = atomic_inc_return(&binder_last_id); node->proc = proc; node->ptr = ptr; node->cookie = cookie; @@ -1088,7 +1099,7 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, if (new_ref == NULL) return NULL; binder_stats_created(BINDER_STAT_REF); - new_ref->debug_id = ++binder_last_id; + new_ref->debug_id = atomic_inc_return(&binder_last_id); new_ref->proc = proc; new_ref->node = node; rb_link_node(&new_ref->rb_node_node, parent, p); @@ -1848,7 +1859,7 @@ static void binder_transaction(struct binder_proc *proc, binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; - e = binder_transaction_log_add(&binder_transaction_log); + e = binder_transaction_log_add(&context->transaction_log); e->call_type = reply ? 
2 : !!(tr->flags & TF_ONE_WAY); e->from_proc = proc->pid; e->from_thread = thread->pid; @@ -1970,7 +1981,7 @@ static void binder_transaction(struct binder_proc *proc, } binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE); - t->debug_id = ++binder_last_id; + t->debug_id = atomic_inc_return(&binder_last_id); e->debug_id = t->debug_id; if (reply) @@ -2234,7 +2245,8 @@ err_no_context_mgr_node: { struct binder_transaction_log_entry *fe; - fe = binder_transaction_log_add(&binder_transaction_log_failed); + fe = binder_transaction_log_add( + &context->transaction_log_failed); *fe = *e; } @@ -2262,8 +2274,8 @@ static int binder_thread_write(struct binder_proc *proc, return -EFAULT; ptr += sizeof(uint32_t); trace_binder_command(cmd); - if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.bc)) { - binder_stats.bc[_IOC_NR(cmd)]++; + if (_IOC_NR(cmd) < ARRAY_SIZE(context->binder_stats.bc)) { + context->binder_stats.bc[_IOC_NR(cmd)]++; proc->stats.bc[_IOC_NR(cmd)]++; thread->stats.bc[_IOC_NR(cmd)]++; } @@ -2628,8 +2640,8 @@ static void binder_stat_br(struct binder_proc *proc, struct binder_thread *thread, uint32_t cmd) { trace_binder_return(cmd); - if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.br)) { - binder_stats.br[_IOC_NR(cmd)]++; + if (_IOC_NR(cmd) < ARRAY_SIZE(proc->stats.br)) { + proc->context->binder_stats.br[_IOC_NR(cmd)]++; proc->stats.br[_IOC_NR(cmd)]++; thread->stats.br[_IOC_NR(cmd)]++; } @@ -2693,7 +2705,7 @@ retry: if (wait_for_proc_work) proc->ready_threads++; - binder_unlock(__func__); + binder_unlock(proc->context, __func__); trace_binder_wait_for_work(wait_for_proc_work, !!thread->transaction_stack, @@ -2720,7 +2732,7 @@ retry: ret = wait_event_freezable(thread->wait, binder_has_thread_work(thread)); } - binder_lock(__func__); + binder_lock(proc->context, __func__); if (wait_for_proc_work) proc->ready_threads--; @@ -3107,14 +3119,14 @@ static unsigned int binder_poll(struct file *filp, struct binder_thread *thread = NULL; int wait_for_proc_work; - binder_lock(__func__); + binder_lock(proc->context, __func__); thread = binder_get_thread(proc); wait_for_proc_work = thread->transaction_stack == NULL && list_empty(&thread->todo) && thread->return_error == BR_OK; - binder_unlock(__func__); + binder_unlock(proc->context, __func__); if (wait_for_proc_work) { if (binder_has_proc_work(proc, thread)) @@ -3241,6 +3253,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int ret; struct binder_proc *proc = filp->private_data; + struct binder_context *context = proc->context; struct binder_thread *thread; unsigned int size = _IOC_SIZE(cmd); void __user *ubuf = (void __user *)arg; @@ -3254,7 +3267,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (ret) goto err_unlocked; - binder_lock(__func__); + binder_lock(context, __func__); thread = binder_get_thread(proc); if (thread == NULL) { ret = -ENOMEM; @@ -3306,7 +3319,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err: if (thread) thread->looper &= ~BINDER_LOOPER_STATE_NEED_RETURN; - binder_unlock(__func__); + binder_unlock(context, __func__); wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); if (ret && ret != -ERESTARTSYS) pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret); @@ -3378,7 +3391,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) } vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE; - mutex_lock(&binder_mmap_lock); + 
mutex_lock(&proc->context->binder_mmap_lock); if (proc->buffer) { ret = -EBUSY; failure_string = "already mapped"; @@ -3393,7 +3406,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) } proc->buffer = area->addr; proc->user_buffer_offset = vma->vm_start - (uintptr_t)proc->buffer; - mutex_unlock(&binder_mmap_lock); + mutex_unlock(&proc->context->binder_mmap_lock); #ifdef CONFIG_CPU_CACHE_VIPT if (cache_is_vipt_aliasing()) { @@ -3438,12 +3451,12 @@ err_alloc_small_buf_failed: kfree(proc->pages); proc->pages = NULL; err_alloc_pages_failed: - mutex_lock(&binder_mmap_lock); + mutex_lock(&proc->context->binder_mmap_lock); vfree(proc->buffer); proc->buffer = NULL; err_get_vm_area_failed: err_already_mapped: - mutex_unlock(&binder_mmap_lock); + mutex_unlock(&proc->context->binder_mmap_lock); err_bad_arg: pr_err("binder_mmap: %d %lx-%lx %s failed %d\n", proc->pid, vma->vm_start, vma->vm_end, failure_string, ret); @@ -3470,15 +3483,15 @@ static int binder_open(struct inode *nodp, struct file *filp) miscdev); proc->context = &binder_dev->context; - binder_lock(__func__); + binder_lock(proc->context, __func__); binder_stats_created(BINDER_STAT_PROC); - hlist_add_head(&proc->proc_node, &binder_procs); + hlist_add_head(&proc->proc_node, &proc->context->binder_procs); proc->pid = current->group_leader->pid; INIT_LIST_HEAD(&proc->delivered_death); filp->private_data = proc; - binder_unlock(__func__); + binder_unlock(proc->context, __func__); if (binder_debugfs_dir_entry_proc) { char strbuf[11]; @@ -3543,6 +3556,7 @@ static int binder_release(struct inode *nodp, struct file *filp) static int binder_node_release(struct binder_node *node, int refs) { struct binder_ref *ref; + struct binder_context *context = node->proc->context; int death = 0; list_del_init(&node->work.entry); @@ -3558,7 +3572,7 @@ static int binder_node_release(struct binder_node *node, int refs) node->proc = NULL; node->local_strong_refs = 0; node->local_weak_refs = 0; - hlist_add_head(&node->dead_node, &binder_dead_nodes); + hlist_add_head(&node->dead_node, &context->binder_dead_nodes); hlist_for_each_entry(ref, &node->refs, node_entry) { refs++; @@ -3623,7 +3637,8 @@ static void binder_deferred_release(struct binder_proc *proc) node = rb_entry(n, struct binder_node, rb_node); nodes++; rb_erase(&node->rb_node, &proc->nodes); - incoming_refs = binder_node_release(node, incoming_refs); + incoming_refs = binder_node_release(node, + incoming_refs); } outgoing_refs = 0; @@ -3695,14 +3710,16 @@ static void binder_deferred_func(struct work_struct *work) { struct binder_proc *proc; struct files_struct *files; + struct binder_context *context = + container_of(work, struct binder_context, deferred_work); int defer; do { - binder_lock(__func__); - mutex_lock(&binder_deferred_lock); - if (!hlist_empty(&binder_deferred_list)) { - proc = hlist_entry(binder_deferred_list.first, + binder_lock(context, __func__); + mutex_lock(&context->binder_deferred_lock); + if (!hlist_empty(&context->binder_deferred_list)) { + proc = hlist_entry(context->binder_deferred_list.first, struct binder_proc, deferred_work_node); hlist_del_init(&proc->deferred_work_node); defer = proc->deferred_work; @@ -3711,7 +3728,7 @@ static void binder_deferred_func(struct work_struct *work) proc = NULL; defer = 0; } - mutex_unlock(&binder_deferred_lock); + mutex_unlock(&context->binder_deferred_lock); files = NULL; if (defer & BINDER_DEFERRED_PUT_FILES) { @@ -3726,24 +3743,24 @@ static void binder_deferred_func(struct work_struct *work) if (defer & 
BINDER_DEFERRED_RELEASE) binder_deferred_release(proc); /* frees proc */ - binder_unlock(__func__); + binder_unlock(context, __func__); if (files) put_files_struct(files); } while (proc); } -static DECLARE_WORK(binder_deferred_work, binder_deferred_func); static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer) { - mutex_lock(&binder_deferred_lock); + mutex_lock(&proc->context->binder_deferred_lock); proc->deferred_work |= defer; if (hlist_unhashed(&proc->deferred_work_node)) { hlist_add_head(&proc->deferred_work_node, - &binder_deferred_list); - queue_work(binder_deferred_workqueue, &binder_deferred_work); + &proc->context->binder_deferred_list); + queue_work(proc->context->binder_deferred_workqueue, + &proc->context->deferred_work); } - mutex_unlock(&binder_deferred_lock); + mutex_unlock(&proc->context->binder_deferred_lock); } static void print_binder_transaction(struct seq_file *m, const char *prefix, @@ -3974,8 +3991,20 @@ static const char * const binder_objstat_strings[] = { "transaction_complete" }; +static void add_binder_stats(struct binder_stats *from, struct binder_stats *to) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(to->bc); i++) + to->bc[i] += from->bc[i]; + + for (i = 0; i < ARRAY_SIZE(to->br); i++) + to->br[i] += from->br[i]; +} + static void print_binder_stats(struct seq_file *m, const char *prefix, - struct binder_stats *stats) + struct binder_stats *stats, + struct binder_obj_stats *obj_stats) { int i; @@ -3995,16 +4024,21 @@ static void print_binder_stats(struct seq_file *m, const char *prefix, binder_return_strings[i], stats->br[i]); } - BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != + if (!obj_stats) + return; + + BUILD_BUG_ON(ARRAY_SIZE(obj_stats->obj_created) != ARRAY_SIZE(binder_objstat_strings)); - BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != - ARRAY_SIZE(stats->obj_deleted)); - for (i = 0; i < ARRAY_SIZE(stats->obj_created); i++) { - if (stats->obj_created[i] || stats->obj_deleted[i]) + BUILD_BUG_ON(ARRAY_SIZE(obj_stats->obj_created) != + ARRAY_SIZE(obj_stats->obj_deleted)); + for (i = 0; i < ARRAY_SIZE(obj_stats->obj_created); i++) { + int obj_created = atomic_read(&obj_stats->obj_created[i]); + int obj_deleted = atomic_read(&obj_stats->obj_deleted[i]); + + if (obj_created || obj_deleted) seq_printf(m, "%s%s: active %d total %d\n", prefix, - binder_objstat_strings[i], - stats->obj_created[i] - stats->obj_deleted[i], - stats->obj_created[i]); + binder_objstat_strings[i], + obj_created - obj_deleted, obj_created); } } @@ -4059,85 +4093,131 @@ static void print_binder_proc_stats(struct seq_file *m, } seq_printf(m, " pending transactions: %d\n", count); - print_binder_stats(m, " ", &proc->stats); + print_binder_stats(m, " ", &proc->stats, NULL); } static int binder_state_show(struct seq_file *m, void *unused) { + struct binder_device *device; + struct binder_context *context; struct binder_proc *proc; struct binder_node *node; int do_lock = !binder_debug_no_lock; - - if (do_lock) - binder_lock(__func__); + bool wrote_dead_nodes_header = false; seq_puts(m, "binder state:\n"); - if (!hlist_empty(&binder_dead_nodes)) - seq_puts(m, "dead nodes:\n"); - hlist_for_each_entry(node, &binder_dead_nodes, dead_node) - print_binder_node(m, node); + hlist_for_each_entry(device, &binder_devices, hlist) { + context = &device->context; + if (do_lock) + binder_lock(context, __func__); + if (!wrote_dead_nodes_header && + !hlist_empty(&context->binder_dead_nodes)) { + seq_puts(m, "dead nodes:\n"); + wrote_dead_nodes_header = true; + } + 
hlist_for_each_entry(node, &context->binder_dead_nodes, + dead_node) + print_binder_node(m, node); - hlist_for_each_entry(proc, &binder_procs, proc_node) - print_binder_proc(m, proc, 1); - if (do_lock) - binder_unlock(__func__); + if (do_lock) + binder_unlock(context, __func__); + } + + hlist_for_each_entry(device, &binder_devices, hlist) { + context = &device->context; + if (do_lock) + binder_lock(context, __func__); + + hlist_for_each_entry(proc, &context->binder_procs, proc_node) + print_binder_proc(m, proc, 1); + if (do_lock) + binder_unlock(context, __func__); + } return 0; } static int binder_stats_show(struct seq_file *m, void *unused) { + struct binder_device *device; + struct binder_context *context; struct binder_proc *proc; + struct binder_stats total_binder_stats; int do_lock = !binder_debug_no_lock; - if (do_lock) - binder_lock(__func__); + memset(&total_binder_stats, 0, sizeof(struct binder_stats)); + + hlist_for_each_entry(device, &binder_devices, hlist) { + context = &device->context; + if (do_lock) + binder_lock(context, __func__); + + add_binder_stats(&context->binder_stats, &total_binder_stats); + + if (do_lock) + binder_unlock(context, __func__); + } seq_puts(m, "binder stats:\n"); + print_binder_stats(m, "", &total_binder_stats, &binder_obj_stats); - print_binder_stats(m, "", &binder_stats); + hlist_for_each_entry(device, &binder_devices, hlist) { + context = &device->context; + if (do_lock) + binder_lock(context, __func__); - hlist_for_each_entry(proc, &binder_procs, proc_node) - print_binder_proc_stats(m, proc); - if (do_lock) - binder_unlock(__func__); + hlist_for_each_entry(proc, &context->binder_procs, proc_node) + print_binder_proc_stats(m, proc); + if (do_lock) + binder_unlock(context, __func__); + } return 0; } static int binder_transactions_show(struct seq_file *m, void *unused) { + struct binder_device *device; + struct binder_context *context; struct binder_proc *proc; int do_lock = !binder_debug_no_lock; - if (do_lock) - binder_lock(__func__); - seq_puts(m, "binder transactions:\n"); - hlist_for_each_entry(proc, &binder_procs, proc_node) - print_binder_proc(m, proc, 0); - if (do_lock) - binder_unlock(__func__); + hlist_for_each_entry(device, &binder_devices, hlist) { + context = &device->context; + if (do_lock) + binder_lock(context, __func__); + + hlist_for_each_entry(proc, &context->binder_procs, proc_node) + print_binder_proc(m, proc, 0); + if (do_lock) + binder_unlock(context, __func__); + } return 0; } static int binder_proc_show(struct seq_file *m, void *unused) { + struct binder_device *device; + struct binder_context *context; struct binder_proc *itr; int pid = (unsigned long)m->private; int do_lock = !binder_debug_no_lock; - if (do_lock) - binder_lock(__func__); + hlist_for_each_entry(device, &binder_devices, hlist) { + context = &device->context; + if (do_lock) + binder_lock(context, __func__); - hlist_for_each_entry(itr, &binder_procs, proc_node) { - if (itr->pid == pid) { - seq_puts(m, "binder proc state:\n"); - print_binder_proc(m, itr, 1); + hlist_for_each_entry(itr, &context->binder_procs, proc_node) { + if (itr->pid == pid) { + seq_puts(m, "binder proc state:\n"); + print_binder_proc(m, itr, 1); + } } + if (do_lock) + binder_unlock(context, __func__); } - if (do_lock) - binder_unlock(__func__); return 0; } @@ -4152,11 +4232,10 @@ static void print_binder_transaction_log_entry(struct seq_file *m, e->to_node, e->target_handle, e->data_size, e->offsets_size); } -static int binder_transaction_log_show(struct seq_file *m, void *unused) +static 
int print_binder_transaction_log(struct seq_file *m, + struct binder_transaction_log *log) { - struct binder_transaction_log *log = m->private; int i; - if (log->full) { for (i = log->next; i < ARRAY_SIZE(log->entry); i++) print_binder_transaction_log_entry(m, &log->entry[i]); @@ -4166,6 +4245,31 @@ static int binder_transaction_log_show(struct seq_file *m, void *unused) return 0; } +static int binder_transaction_log_show(struct seq_file *m, void *unused) +{ + struct binder_device *device; + struct binder_context *context; + + hlist_for_each_entry(device, &binder_devices, hlist) { + context = &device->context; + print_binder_transaction_log(m, &context->transaction_log); + } + return 0; +} + +static int binder_failed_transaction_log_show(struct seq_file *m, void *unused) +{ + struct binder_device *device; + struct binder_context *context; + + hlist_for_each_entry(device, &binder_devices, hlist) { + context = &device->context; + print_binder_transaction_log(m, + &context->transaction_log_failed); + } + return 0; +} + static const struct file_operations binder_fops = { .owner = THIS_MODULE, .poll = binder_poll, @@ -4181,11 +4285,20 @@ BINDER_DEBUG_ENTRY(state); BINDER_DEBUG_ENTRY(stats); BINDER_DEBUG_ENTRY(transactions); BINDER_DEBUG_ENTRY(transaction_log); +BINDER_DEBUG_ENTRY(failed_transaction_log); + +static void __init free_binder_device(struct binder_device *device) +{ + if (device->context.binder_deferred_workqueue) + destroy_workqueue(device->context.binder_deferred_workqueue); + kfree(device); +} static int __init init_binder_device(const char *name) { int ret; struct binder_device *binder_device; + struct binder_context *context; binder_device = kzalloc(sizeof(*binder_device), GFP_KERNEL); if (!binder_device) @@ -4195,31 +4308,65 @@ static int __init init_binder_device(const char *name) binder_device->miscdev.minor = MISC_DYNAMIC_MINOR; binder_device->miscdev.name = name; - binder_device->context.binder_context_mgr_uid = INVALID_UID; - binder_device->context.name = name; + context = &binder_device->context; + context->binder_context_mgr_uid = INVALID_UID; + context->name = name; + + mutex_init(&context->binder_main_lock); + mutex_init(&context->binder_deferred_lock); + mutex_init(&context->binder_mmap_lock); + + context->binder_deferred_workqueue = + create_singlethread_workqueue(name); + + if (!context->binder_deferred_workqueue) { + ret = -ENOMEM; + goto err_create_singlethread_workqueue_failed; + } + + INIT_HLIST_HEAD(&context->binder_procs); + INIT_HLIST_HEAD(&context->binder_dead_nodes); + INIT_HLIST_HEAD(&context->binder_deferred_list); + INIT_WORK(&context->deferred_work, binder_deferred_func); ret = misc_register(&binder_device->miscdev); if (ret < 0) { - kfree(binder_device); - return ret; + goto err_misc_register_failed; } hlist_add_head(&binder_device->hlist, &binder_devices); + return ret; + +err_create_singlethread_workqueue_failed: +err_misc_register_failed: + free_binder_device(binder_device); return ret; } static int __init binder_init(void) { - int ret; + int ret = 0; char *device_name, *device_names; struct binder_device *device; struct hlist_node *tmp; - binder_deferred_workqueue = create_singlethread_workqueue("binder"); - if (!binder_deferred_workqueue) + /* + * Copy the module_parameter string, because we don't want to + * tokenize it in-place. 
+ */ + device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL); + if (!device_names) return -ENOMEM; + strcpy(device_names, binder_devices_param); + + while ((device_name = strsep(&device_names, ","))) { + ret = init_binder_device(device_name); + if (ret) + goto err_init_binder_device_failed; + } + binder_debugfs_dir_entry_root = debugfs_create_dir("binder", NULL); if (binder_debugfs_dir_entry_root) binder_debugfs_dir_entry_proc = debugfs_create_dir("proc", @@ -4244,30 +4391,13 @@ static int __init binder_init(void) debugfs_create_file("transaction_log", S_IRUGO, binder_debugfs_dir_entry_root, - &binder_transaction_log, + NULL, &binder_transaction_log_fops); debugfs_create_file("failed_transaction_log", S_IRUGO, binder_debugfs_dir_entry_root, - &binder_transaction_log_failed, - &binder_transaction_log_fops); - } - - /* - * Copy the module_parameter string, because we don't want to - * tokenize it in-place. - */ - device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL); - if (!device_names) { - ret = -ENOMEM; - goto err_alloc_device_names_failed; - } - strcpy(device_names, binder_devices_param); - - while ((device_name = strsep(&device_names, ","))) { - ret = init_binder_device(device_name); - if (ret) - goto err_init_binder_device_failed; + NULL, + &binder_failed_transaction_log_fops); } return ret; @@ -4276,12 +4406,8 @@ err_init_binder_device_failed: hlist_for_each_entry_safe(device, tmp, &binder_devices, hlist) { misc_deregister(&device->miscdev); hlist_del(&device->hlist); - kfree(device); + free_binder_device(device); } -err_alloc_device_names_failed: - debugfs_remove_recursive(binder_debugfs_dir_entry_root); - - destroy_workqueue(binder_deferred_workqueue); return ret; } From f52e71a12e80643225e7b6faf9b4dc171229af39 Mon Sep 17 00:00:00 2001 From: Anson Jacob Date: Fri, 12 Aug 2016 20:38:10 -0400 Subject: [PATCH 0682/3220] usb: gadget: f_accessory: Fix for UsbAccessory clean unbind. Reapplying fix by Darren Whobrey (Change 69674). Fixes issues: 20545, 59667 and 61390. With the prior version of f_accessory.c, UsbAccessories would not unbind cleanly when the application was closed or I/O stopped while the USB cable was still connected. The accessory gadget driver would be left in an invalid state which was not reset on subsequent binding or opening; a reboot was necessary to clear it. On some phones this issue causes the phone to reboot upon unplugging the USB cable. The main problem was that acc_disconnect was being called on I/O error, which reset both disconnected and online. A minor fix was required to properly track the setting and unsetting of the disconnected and online flags. Also added URB queue wakeups on unbind to help unblock waiting threads. Tested on a Nexus 7 (grouper). Expected behaviour now observed: closing the accessory causes blocked I/O to be interrupted with an IOException, and the accessory can be restarted after closing and re-opening the file handle. This is a generic fix that applies to all devices.
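The corrected flag protocol is easier to see in isolation. Below is a minimal sketch of the lifecycle this patch establishes, assuming the two bitfields from the diff that follows; the helper names are illustrative, not functions from the driver.

struct acc_state {
	int online:1;		/* tracks the USB side: set_alt/unbind/disable */
	int disconnected:1;	/* tracks the file side: open/release */
};

/* USB side: host selected our alt setting, so we are fully connected. */
void on_set_alt(struct acc_state *s)
{
	s->online = 1;
	s->disconnected = 0;
}

/* USB side: function disabled (e.g. cable pulled). Both flags change,
 * because acc_set_disconnected() no longer clears online. */
void on_disable(struct acc_state *s)
{
	s->disconnected = 1;
	s->online = 0;
}

/* File side: closing the device node marks us disconnected but must not
 * touch online, which still reflects the state of the USB link. */
void on_release(struct acc_state *s)
{
	s->disconnected = 1;
}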
Change-Id: I4e08b326730dd3a2820c863124cee10f7cb5501e Signed-off-by: Darren Whobrey Signed-off-by: Anson Jacob --- drivers/usb/gadget/function/f_accessory.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index f2fa0c271d70..76b8ae08a551 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -77,9 +77,13 @@ struct acc_dev { struct usb_ep *ep_in; struct usb_ep *ep_out; - /* set to 1 when we connect */ + /* online indicates state of function_set_alt & function_unbind + * set to 1 when we connect + */ int online:1; - /* Set to 1 when we disconnect. + + /* disconnected indicates state of open & release + * Set to 1 when we disconnect. * Not cleared until our file is closed. */ int disconnected:1; @@ -263,7 +267,6 @@ static struct usb_request *req_get(struct acc_dev *dev, struct list_head *head) static void acc_set_disconnected(struct acc_dev *dev) { - dev->online = 0; dev->disconnected = 1; } @@ -764,7 +767,10 @@ static int acc_release(struct inode *ip, struct file *fp) printk(KERN_INFO "acc_release\n"); WARN_ON(!atomic_xchg(&_acc_dev->open_excl, 0)); - _acc_dev->disconnected = 0; + /* indicate that we are disconnected + * still could be online so don't touch online flag + */ + _acc_dev->disconnected = 1; return 0; } @@ -1012,6 +1018,10 @@ acc_function_unbind(struct usb_configuration *c, struct usb_function *f) struct usb_request *req; int i; + dev->online = 0; /* clear online flag */ + wake_up(&dev->read_wq); /* unblock reads on closure */ + wake_up(&dev->write_wq); /* likewise for writes */ + while ((req = req_get(dev, &dev->tx_idle))) acc_request_free(req, dev->ep_in); for (i = 0; i < RX_REQ_MAX; i++) @@ -1143,6 +1153,7 @@ static int acc_function_set_alt(struct usb_function *f, } dev->online = 1; + dev->disconnected = 0; /* if online then not disconnected */ /* readers may be blocked waiting for us to go online */ wake_up(&dev->read_wq); @@ -1155,7 +1166,8 @@ static void acc_function_disable(struct usb_function *f) struct usb_composite_dev *cdev = dev->cdev; DBG(cdev, "acc_function_disable\n"); - acc_set_disconnected(dev); + acc_set_disconnected(dev); /* this now only sets disconnected */ + dev->online = 0; /* so now need to clear online flag here too */ usb_ep_disable(dev->ep_in); usb_ep_disable(dev->ep_out); From 6f4a2453a14bae428dd10ba3bb9c15dccfc9eb8d Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Fri, 10 Mar 2017 16:08:30 -0800 Subject: [PATCH 0683/3220] ANDROID: Replace spaces by '_' for some android filesystem tracepoints. Android files frequently have spaces in them, as do cmdline strings. Replace these spaces with '_', so tools that parse these tracepoints don't get terribly confused. Change-Id: I1cbbedf5c803aa6a58d9b8b7836e9125683c49d1 Signed-off-by: Mohan Srinivasan (cherry picked from commit 5035d5f0933758dd515327d038e5bef7e40dbaa7) --- include/trace/events/android_fs_template.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/trace/events/android_fs_template.h b/include/trace/events/android_fs_template.h index 4e61ffe7a814..b23d17b56c63 100644 --- a/include/trace/events/android_fs_template.h +++ b/include/trace/events/android_fs_template.h @@ -18,11 +18,18 @@ DECLARE_EVENT_CLASS(android_fs_data_start_template, ), TP_fast_assign( { + /* + * Replace the spaces in filenames and cmdlines + * because this screws up the tooling that parses + * the traces.
+ */ __assign_str(pathbuf, pathname); + (void)strreplace(__get_str(pathbuf), ' ', '_'); __entry->offset = offset; __entry->bytes = bytes; __entry->i_size = i_size_read(inode); __assign_str(cmdline, command); + (void)strreplace(__get_str(cmdline), ' ', '_'); __entry->pid = pid; __entry->ino = inode->i_ino; } From 174a03a0bdc252ec122d27deca311f6caa0c2810 Mon Sep 17 00:00:00 2001 From: "Jon Medhurst (Tixy)" Date: Wed, 9 Dec 2015 09:40:53 +0000 Subject: [PATCH 0684/3220] arm64: dts: juno: Add idle-states to device tree This patch adds idle-states bindings data collected through a set of benchmarking experiments (latency and energy consumption) on Juno boards. Latencies data represents the worst case scenarios as required by the DT idle-states bindings. Change-Id: I7b2d81fa66f8ce8b229457cfefff06e9edd545c7 (cherry picked from commit 286896f43b0248960f69660159b507b23751b38a) Signed-off-by: Jon Medhurst Acked-by: Lorenzo Pieralisi Signed-off-by: Olof Johansson --- arch/arm64/boot/dts/arm/juno-r1.dts | 28 ++++++++++++++++++++++++++++ arch/arm64/boot/dts/arm/juno.dts | 28 ++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/arch/arm64/boot/dts/arm/juno-r1.dts b/arch/arm64/boot/dts/arm/juno-r1.dts index 93bc3d7d51c0..8826f834f54f 100644 --- a/arch/arm64/boot/dts/arm/juno-r1.dts +++ b/arch/arm64/boot/dts/arm/juno-r1.dts @@ -60,6 +60,28 @@ }; }; + idle-states { + entry-method = "arm,psci"; + + CPU_SLEEP_0: cpu-sleep-0 { + compatible = "arm,idle-state"; + arm,psci-suspend-param = <0x0010000>; + local-timer-stop; + entry-latency-us = <300>; + exit-latency-us = <1200>; + min-residency-us = <2000>; + }; + + CLUSTER_SLEEP_0: cluster-sleep-0 { + compatible = "arm,idle-state"; + arm,psci-suspend-param = <0x1010000>; + local-timer-stop; + entry-latency-us = <300>; + exit-latency-us = <1200>; + min-residency-us = <2500>; + }; + }; + A57_0: cpu@0 { compatible = "arm,cortex-a57","arm,armv8"; reg = <0x0 0x0>; @@ -67,6 +89,7 @@ enable-method = "psci"; next-level-cache = <&A57_L2>; clocks = <&scpi_dvfs 0>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A57_1: cpu@1 { @@ -76,6 +99,7 @@ enable-method = "psci"; next-level-cache = <&A57_L2>; clocks = <&scpi_dvfs 0>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A53_0: cpu@100 { @@ -85,6 +109,7 @@ enable-method = "psci"; next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A53_1: cpu@101 { @@ -94,6 +119,7 @@ enable-method = "psci"; next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A53_2: cpu@102 { @@ -103,6 +129,7 @@ enable-method = "psci"; next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A53_3: cpu@103 { @@ -112,6 +139,7 @@ enable-method = "psci"; next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A57_L2: l2-cache0 { diff --git a/arch/arm64/boot/dts/arm/juno.dts b/arch/arm64/boot/dts/arm/juno.dts index 53442b5ee4ff..dcfcf15a17f5 100644 --- a/arch/arm64/boot/dts/arm/juno.dts +++ b/arch/arm64/boot/dts/arm/juno.dts @@ -60,6 +60,28 @@ }; }; + idle-states { + entry-method = "arm,psci"; + + CPU_SLEEP_0: cpu-sleep-0 { + compatible = "arm,idle-state"; + arm,psci-suspend-param = <0x0010000>; + local-timer-stop; + entry-latency-us = <300>; + exit-latency-us = <1200>; + min-residency-us = <2000>; + }; + + CLUSTER_SLEEP_0: cluster-sleep-0 { + compatible = "arm,idle-state"; + arm,psci-suspend-param = 
<0x1010000>; + local-timer-stop; + entry-latency-us = <300>; + exit-latency-us = <1200>; + min-residency-us = <2500>; + }; + }; + A57_0: cpu@0 { compatible = "arm,cortex-a57","arm,armv8"; reg = <0x0 0x0>; @@ -67,6 +89,7 @@ enable-method = "psci"; next-level-cache = <&A57_L2>; clocks = <&scpi_dvfs 0>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A57_1: cpu@1 { @@ -76,6 +99,7 @@ enable-method = "psci"; next-level-cache = <&A57_L2>; clocks = <&scpi_dvfs 0>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A53_0: cpu@100 { @@ -85,6 +109,7 @@ enable-method = "psci"; next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A53_1: cpu@101 { @@ -94,6 +119,7 @@ enable-method = "psci"; next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A53_2: cpu@102 { @@ -103,6 +129,7 @@ enable-method = "psci"; next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A53_3: cpu@103 { @@ -112,6 +139,7 @@ enable-method = "psci"; next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A57_L2: l2-cache0 { From a2849d45025ca5894e99502752a86ebc744a4361 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Fri, 13 Nov 2015 10:21:39 +0000 Subject: [PATCH 0685/3220] DTB: Add EAS compatible Juno Energy model to 'juno.dts' EAS expects the energy model for the CPUs and cluster states to be available in the DTB. The energy model data comes from previous versions. Change-Id: I87535c8d802797361333929d809b43383bc8954b (cherry picked from commit bf137f205f312a1814ae38f908ec7bdbdddeaa3e (LSK 4.4)) Signed-off-by: Chris Redpath Signed-off-by: Punit Agrawal Signed-off-by: Jon Medhurst --- .../arm64/boot/dts/arm/juno-sched-energy.dtsi | 147 ++++++++++++++++++ arch/arm64/boot/dts/arm/juno.dts | 8 + 2 files changed, 155 insertions(+) create mode 100644 arch/arm64/boot/dts/arm/juno-sched-energy.dtsi diff --git a/arch/arm64/boot/dts/arm/juno-sched-energy.dtsi b/arch/arm64/boot/dts/arm/juno-sched-energy.dtsi new file mode 100644 index 000000000000..38207e4391ab --- /dev/null +++ b/arch/arm64/boot/dts/arm/juno-sched-energy.dtsi @@ -0,0 +1,147 @@ +/* + * ARM JUNO specific energy cost model data. There are no unit requirements for + * the data. Data can be normalized to any reference point, but the + * normalization must be consistent. That is, one bogo-joule/watt must be the + * same quantity for all data, but we don't care what it is. 
+ */ + +/* static struct idle_state idle_states_cluster_a53[] = { */ +/* { .power = 56 }, /\* arch_cpu_idle() (active idle) = WFI *\/ */ +/* { .power = 56 }, /\* WFI *\/ */ +/* { .power = 56 }, /\* cpu-sleep-0 *\/ */ +/* { .power = 17 }, /\* cluster-sleep-0 *\/ */ +/* }; */ + +/* static struct idle_state idle_states_cluster_a57[] = { */ +/* { .power = 65 }, /\* arch_cpu_idle() (active idle) = WFI *\/ */ +/* { .power = 65 }, /\* WFI *\/ */ +/* { .power = 65 }, /\* cpu-sleep-0 *\/ */ +/* { .power = 24 }, /\* cluster-sleep-0 *\/ */ +/* }; */ + +/* static struct capacity_state cap_states_cluster_a53[] = { */ +/* /\* Power per cluster *\/ */ +/* { .cap = 235, .power = 26, }, /\* 450 MHz *\/ */ +/* { .cap = 303, .power = 30, }, /\* 575 MHz *\/ */ +/* { .cap = 368, .power = 39, }, /\* 700 MHz *\/ */ +/* { .cap = 406, .power = 47, }, /\* 775 MHz *\/ */ +/* { .cap = 447, .power = 57, }, /\* 850 Mhz *\/ */ +/* }; */ + +/* static struct capacity_state cap_states_cluster_a57[] = { */ +/* /\* Power per cluster *\/ */ +/* { .cap = 417, .power = 24, }, /\* 450 MHz *\/ */ +/* { .cap = 579, .power = 32, }, /\* 625 MHz *\/ */ +/* { .cap = 744, .power = 43, }, /\* 800 MHz *\/ */ +/* { .cap = 883, .power = 49, }, /\* 950 MHz *\/ */ +/* { .cap = 1024, .power = 64, }, /\* 1100 MHz *\/ */ +/* }; */ + +/* static struct sched_group_energy energy_cluster_a53 = { */ +/* .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a53), */ +/* .idle_states = idle_states_cluster_a53, */ +/* .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a53), */ +/* .cap_states = cap_states_cluster_a53, */ +/* }; */ + +/* static struct sched_group_energy energy_cluster_a57 = { */ +/* .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a57), */ +/* .idle_states = idle_states_cluster_a57, */ +/* .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a57), */ +/* .cap_states = cap_states_cluster_a57, */ +/* }; */ + +/* static struct idle_state idle_states_core_a53[] = { */ +/* { .power = 6 }, /\* arch_cpu_idle() (active idle) = WFI *\/ */ +/* { .power = 6 }, /\* WFI *\/ */ +/* { .power = 0 }, /\* cpu-sleep-0 *\/ */ +/* { .power = 0 }, /\* cluster-sleep-0 *\/ */ +/* }; */ + +/* static struct idle_state idle_states_core_a57[] = { */ +/* { .power = 15 }, /\* arch_cpu_idle() (active idle) = WFI *\/ */ +/* { .power = 15 }, /\* WFI *\/ */ +/* { .power = 0 }, /\* cpu-sleep-0 *\/ */ +/* { .power = 0 }, /\* cluster-sleep-0 *\/ */ +/* }; */ + +/* static struct capacity_state cap_states_core_a53[] = { */ +/* /\* Power per cpu *\/ */ +/* { .cap = 235, .power = 33, }, /\* 450 MHz *\/ */ +/* { .cap = 302, .power = 46, }, /\* 575 MHz *\/ */ +/* { .cap = 368, .power = 61, }, /\* 700 MHz *\/ */ +/* { .cap = 406, .power = 76, }, /\* 775 MHz *\/ */ +/* { .cap = 447, .power = 93, }, /\* 850 Mhz *\/ */ +/* }; */ + +/* static struct capacity_state cap_states_core_a57[] = { */ +/* /\* Power per cpu *\/ */ +/* { .cap = 417, .power = 168, }, /\* 450 MHz *\/ */ +/* { .cap = 579, .power = 251, }, /\* 625 MHz *\/ */ +/* { .cap = 744, .power = 359, }, /\* 800 MHz *\/ */ +/* { .cap = 883, .power = 479, }, /\* 950 MHz *\/ */ +/* { .cap = 1024, .power = 616, }, /\* 1100 MHz *\/ */ +/* }; */ + +energy-costs { + CPU_COST_A57: core-cost0 { + busy-cost-data = < + 417 168 + 579 251 + 744 359 + 883 479 + 1023 616 + >; + idle-cost-data = < + 15 + 15 + 0 + 0 + >; + }; + CPU_COST_A53: core-cost1 { + busy-cost-data = < + 235 33 + 302 46 + 368 61 + 406 76 + 447 93 + >; + idle-cost-data = < + 6 + 6 + 0 + 0 + >; + }; + CLUSTER_COST_A57: cluster-cost0 { + busy-cost-data = < + 417 24 + 579 32 + 744 43 + 
883 49 + 1024 64 + >; + idle-cost-data = < + 65 + 65 + 65 + 24 + >; + }; + CLUSTER_COST_A53: cluster-cost1 { + busy-cost-data = < + 235 26 + 303 30 + 368 39 + 406 47 + 447 57 + >; + idle-cost-data = < + 56 + 56 + 56 + 17 + >; + }; +}; diff --git a/arch/arm64/boot/dts/arm/juno.dts b/arch/arm64/boot/dts/arm/juno.dts index dcfcf15a17f5..f113a80519cd 100644 --- a/arch/arm64/boot/dts/arm/juno.dts +++ b/arch/arm64/boot/dts/arm/juno.dts @@ -90,6 +90,7 @@ next-level-cache = <&A57_L2>; clocks = <&scpi_dvfs 0>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_A57 &CLUSTER_COST_A57>; }; A57_1: cpu@1 { @@ -100,6 +101,7 @@ next-level-cache = <&A57_L2>; clocks = <&scpi_dvfs 0>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_A57 &CLUSTER_COST_A57>; }; A53_0: cpu@100 { @@ -110,6 +112,7 @@ next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_A53 &CLUSTER_COST_A53>; }; A53_1: cpu@101 { @@ -120,6 +123,7 @@ next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_A53 &CLUSTER_COST_A53>; }; A53_2: cpu@102 { @@ -130,6 +134,7 @@ next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_A53 &CLUSTER_COST_A53>; }; A53_3: cpu@103 { @@ -140,6 +145,7 @@ next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_A53 &CLUSTER_COST_A53>; }; A57_L2: l2-cache0 { @@ -149,6 +155,8 @@ A53_L2: l2-cache1 { compatible = "cache"; }; + + /include/ "juno-sched-energy.dtsi" }; pmu_a57 { From bce3e4dd9d49f2a3da44841a6b0b6dcb56ec149d Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Tue, 10 Jan 2017 16:10:35 -0800 Subject: [PATCH 0686/3220] ANDROID: uid_cputime: add per-uid IO usage accounting IO usage is accounted in foreground and background buckets. For each uid, IO usage is calculated in two steps:

delta = current total of all uid tasks - previous total
current bucket += delta

The bucket is determined by the uid's current state. Userspace writes to /proc/uid_procstat/set when the uid's state is updated. /proc/uid_io/stats shows IO usage in this format.
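The line format can be read directly off the seq_printf() added in the diff below: uid, then foreground rchar/wchar/read_bytes/write_bytes, then the same four background counters. Here is a sketch of a matching userspace parser, assuming the nine-field layout from this patch (a later patch in this series appends two fsync counters); the struct and function names are illustrative:

#include <stdio.h>

struct uid_io_line {
	unsigned int uid;
	unsigned long long fg_rchar, fg_wchar, fg_read_bytes, fg_write_bytes;
	unsigned long long bg_rchar, bg_wchar, bg_read_bytes, bg_write_bytes;
};

/* Parse one line of /proc/uid_io/stats; returns 0 on success. */
int parse_uid_io_line(const char *line, struct uid_io_line *s)
{
	int n = sscanf(line, "%u %llu %llu %llu %llu %llu %llu %llu %llu",
		       &s->uid,
		       &s->fg_rchar, &s->fg_wchar,
		       &s->fg_read_bytes, &s->fg_write_bytes,
		       &s->bg_rchar, &s->bg_wchar,
		       &s->bg_read_bytes, &s->bg_write_bytes);
	return n == 9 ? 0 : -1;
}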
Signed-off-by: Jin Qian Bug: 34198239 Change-Id: Ib8bebda53e7a56f45ea3eb0ec9a3153d44188102 --- drivers/misc/uid_cputime.c | 236 ++++++++++++++++++++++++++++++++++--- 1 file changed, 220 insertions(+), 16 deletions(-) diff --git a/drivers/misc/uid_cputime.c b/drivers/misc/uid_cputime.c index c1ad5246f564..f5135bb01a7a 100644 --- a/drivers/misc/uid_cputime.c +++ b/drivers/misc/uid_cputime.c @@ -30,7 +30,24 @@ DECLARE_HASHTABLE(hash_table, UID_HASH_BITS); static DEFINE_MUTEX(uid_lock); -static struct proc_dir_entry *parent; +static struct proc_dir_entry *cpu_parent; +static struct proc_dir_entry *io_parent; +static struct proc_dir_entry *proc_parent; + +struct io_stats { + u64 read_bytes; + u64 write_bytes; + u64 rchar; + u64 wchar; +}; + +#define UID_STATE_FOREGROUND 0 +#define UID_STATE_BACKGROUND 1 +#define UID_STATE_BUCKET_SIZE 2 + +#define UID_STATE_TOTAL_CURR 2 +#define UID_STATE_TOTAL_LAST 3 +#define UID_STATE_SIZE 4 struct uid_entry { uid_t uid; @@ -38,6 +55,8 @@ struct uid_entry { cputime_t stime; cputime_t active_utime; cputime_t active_stime; + int state; + struct io_stats io[UID_STATE_SIZE]; struct hlist_node hash; }; @@ -70,7 +89,7 @@ static struct uid_entry *find_or_register_uid(uid_t uid) return uid_entry; } -static int uid_stat_show(struct seq_file *m, void *v) +static int uid_cputime_show(struct seq_file *m, void *v) { struct uid_entry *uid_entry; struct task_struct *task, *temp; @@ -119,13 +138,13 @@ static int uid_stat_show(struct seq_file *m, void *v) return 0; } -static int uid_stat_open(struct inode *inode, struct file *file) +static int uid_cputime_open(struct inode *inode, struct file *file) { - return single_open(file, uid_stat_show, PDE_DATA(inode)); + return single_open(file, uid_cputime_show, PDE_DATA(inode)); } -static const struct file_operations uid_stat_fops = { - .open = uid_stat_open, +static const struct file_operations uid_cputime_fops = { + .open = uid_cputime_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, @@ -184,6 +203,162 @@ static const struct file_operations uid_remove_fops = { .write = uid_remove_write, }; +static void add_uid_io_curr_stats(struct uid_entry *uid_entry, + struct task_struct *task) +{ + struct io_stats *io_curr = &uid_entry->io[UID_STATE_TOTAL_CURR]; + + io_curr->read_bytes += task->ioac.read_bytes; + io_curr->write_bytes += + task->ioac.write_bytes - task->ioac.cancelled_write_bytes; + io_curr->rchar += task->ioac.rchar; + io_curr->wchar += task->ioac.wchar; +} + +static void clean_uid_io_last_stats(struct uid_entry *uid_entry, + struct task_struct *task) +{ + struct io_stats *io_last = &uid_entry->io[UID_STATE_TOTAL_LAST]; + + io_last->read_bytes -= task->ioac.read_bytes; + io_last->write_bytes -= + task->ioac.write_bytes - task->ioac.cancelled_write_bytes; + io_last->rchar -= task->ioac.rchar; + io_last->wchar -= task->ioac.wchar; +} + +static void update_io_stats_locked(void) +{ + struct uid_entry *uid_entry; + struct task_struct *task, *temp; + struct io_stats *io_bucket, *io_curr, *io_last; + unsigned long bkt; + + BUG_ON(!mutex_is_locked(&uid_lock)); + + hash_for_each(hash_table, bkt, uid_entry, hash) + memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, + sizeof(struct io_stats)); + + read_lock(&tasklist_lock); + do_each_thread(temp, task) { + uid_entry = find_or_register_uid(from_kuid_munged( + current_user_ns(), task_uid(task))); + if (!uid_entry) + continue; + add_uid_io_curr_stats(uid_entry, task); + } while_each_thread(temp, task); + read_unlock(&tasklist_lock); + + hash_for_each(hash_table, bkt, 
uid_entry, hash) { + io_bucket = &uid_entry->io[uid_entry->state]; + io_curr = &uid_entry->io[UID_STATE_TOTAL_CURR]; + io_last = &uid_entry->io[UID_STATE_TOTAL_LAST]; + + io_bucket->read_bytes += + io_curr->read_bytes - io_last->read_bytes; + io_bucket->write_bytes += + io_curr->write_bytes - io_last->write_bytes; + io_bucket->rchar += io_curr->rchar - io_last->rchar; + io_bucket->wchar += io_curr->wchar - io_last->wchar; + + io_last->read_bytes = io_curr->read_bytes; + io_last->write_bytes = io_curr->write_bytes; + io_last->rchar = io_curr->rchar; + io_last->wchar = io_curr->wchar; + } +} + +static int uid_io_show(struct seq_file *m, void *v) +{ + struct uid_entry *uid_entry; + unsigned long bkt; + + mutex_lock(&uid_lock); + + update_io_stats_locked(); + + hash_for_each(hash_table, bkt, uid_entry, hash) { + seq_printf(m, "%d %llu %llu %llu %llu %llu %llu %llu %llu\n", + uid_entry->uid, + uid_entry->io[UID_STATE_FOREGROUND].rchar, + uid_entry->io[UID_STATE_FOREGROUND].wchar, + uid_entry->io[UID_STATE_FOREGROUND].read_bytes, + uid_entry->io[UID_STATE_FOREGROUND].write_bytes, + uid_entry->io[UID_STATE_BACKGROUND].rchar, + uid_entry->io[UID_STATE_BACKGROUND].wchar, + uid_entry->io[UID_STATE_BACKGROUND].read_bytes, + uid_entry->io[UID_STATE_BACKGROUND].write_bytes); + } + + mutex_unlock(&uid_lock); + + return 0; +} + +static int uid_io_open(struct inode *inode, struct file *file) +{ + return single_open(file, uid_io_show, PDE_DATA(inode)); +} + +static const struct file_operations uid_io_fops = { + .open = uid_io_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int uid_procstat_open(struct inode *inode, struct file *file) +{ + return single_open(file, NULL, NULL); +} + +static ssize_t uid_procstat_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + struct uid_entry *uid_entry; + uid_t uid; + int argc, state; + char input[128]; + + if (count >= sizeof(input)) + return -EINVAL; + + if (copy_from_user(input, buffer, count)) + return -EFAULT; + + input[count] = '\0'; + + argc = sscanf(input, "%u %d", &uid, &state); + if (argc != 2) + return -EINVAL; + + if (state != UID_STATE_BACKGROUND && state != UID_STATE_FOREGROUND) + return -EINVAL; + + mutex_lock(&uid_lock); + + uid_entry = find_or_register_uid(uid); + if (!uid_entry || uid_entry->state == state) { + mutex_unlock(&uid_lock); + return -EINVAL; + } + + update_io_stats_locked(); + + uid_entry->state = state; + + mutex_unlock(&uid_lock); + + return count; +} + +static const struct file_operations uid_procstat_fops = { + .open = uid_procstat_open, + .release = single_release, + .write = uid_procstat_write, +}; + static int process_notifier(struct notifier_block *self, unsigned long cmd, void *v) { @@ -207,6 +382,9 @@ static int process_notifier(struct notifier_block *self, uid_entry->utime += utime; uid_entry->stime += stime; + update_io_stats_locked(); + clean_uid_io_last_stats(uid_entry, task); + exit: mutex_unlock(&uid_lock); return NOTIFY_OK; @@ -216,25 +394,51 @@ static struct notifier_block process_notifier_block = { .notifier_call = process_notifier, }; -static int __init proc_uid_cputime_init(void) +static int __init proc_uid_sys_stats_init(void) { hash_init(hash_table); - parent = proc_mkdir("uid_cputime", NULL); - if (!parent) { - pr_err("%s: failed to create proc entry\n", __func__); - return -ENOMEM; + cpu_parent = proc_mkdir("uid_cputime", NULL); + if (!cpu_parent) { + pr_err("%s: failed to create uid_cputime proc entry\n", + __func__); + goto err; } - 
proc_create_data("remove_uid_range", S_IWUGO, parent, &uid_remove_fops, - NULL); + proc_create_data("remove_uid_range", 0222, cpu_parent, + &uid_remove_fops, NULL); + proc_create_data("show_uid_stat", 0444, cpu_parent, + &uid_cputime_fops, NULL); - proc_create_data("show_uid_stat", S_IRUGO, parent, &uid_stat_fops, - NULL); + io_parent = proc_mkdir("uid_io", NULL); + if (!io_parent) { + pr_err("%s: failed to create uid_io proc entry\n", + __func__); + goto err; + } + + proc_create_data("stats", 0444, io_parent, + &uid_io_fops, NULL); + + proc_parent = proc_mkdir("uid_procstat", NULL); + if (!proc_parent) { + pr_err("%s: failed to create uid_procstat proc entry\n", + __func__); + goto err; + } + + proc_create_data("set", 0222, proc_parent, + &uid_procstat_fops, NULL); profile_event_register(PROFILE_TASK_EXIT, &process_notifier_block); return 0; + +err: + remove_proc_subtree("uid_cputime", NULL); + remove_proc_subtree("uid_io", NULL); + remove_proc_subtree("uid_procstat", NULL); + return -ENOMEM; } -early_initcall(proc_uid_cputime_init); +early_initcall(proc_uid_sys_stats_init); From 6a61b529b4a92817adfb1d1393c6b914419a4555 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Tue, 10 Jan 2017 16:11:07 -0800 Subject: [PATCH 0687/3220] ANDROID: uid_sys_stats: rename uid_cputime.c to uid_sys_stats.c This module tracks cputime and io stats. Signed-off-by: Jin Qian Bug: 34198239 Change-Id: I9ee7d9e915431e0bb714b36b5a2282e1fdcc7342 --- android/configs/android-base.cfg | 2 +- drivers/misc/Kconfig | 6 ++++-- drivers/misc/Makefile | 2 +- drivers/misc/{uid_cputime.c => uid_sys_stats.c} | 0 4 files changed, 6 insertions(+), 4 deletions(-) rename drivers/misc/{uid_cputime.c => uid_sys_stats.c} (100%) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index f10371a981b7..2098fe97198c 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -157,7 +157,7 @@ CONFIG_STAGING=y CONFIG_SWP_EMULATION=y CONFIG_SYNC=y CONFIG_TUN=y -CONFIG_UID_CPUTIME=y +CONFIG_UID_SYS_STATS=y CONFIG_UNIX=y CONFIG_USB_GADGET=y CONFIG_USB_CONFIGFS=y diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 06eddc0cb24f..5847d3be0835 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -525,11 +525,13 @@ config VEXPRESS_SYSCFG bus. System Configuration interface is one of the possible means of generating transactions on this bus. 
-config UID_CPUTIME - bool "Per-UID cpu time statistics" +config UID_SYS_STATS + bool "Per-UID statistics" depends on PROFILING help Per UID based cpu time statistics exported to /proc/uid_cputime + Per UID based io statistics exported to /proc/uid_io + Per UID based procstat control in /proc/uid_procstat config MEMORY_STATE_TIME tristate "Memory freq/bandwidth time statistics" diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index b76b4c9fe104..9a3b402921b2 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -56,5 +56,5 @@ obj-$(CONFIG_GENWQE) += genwqe/ obj-$(CONFIG_ECHO) += echo/ obj-$(CONFIG_VEXPRESS_SYSCFG) += vexpress-syscfg.o obj-$(CONFIG_CXL_BASE) += cxl/ -obj-$(CONFIG_UID_CPUTIME) += uid_cputime.o +obj-$(CONFIG_UID_SYS_STATS) += uid_sys_stats.o obj-$(CONFIG_MEMORY_STATE_TIME) += memory_state_time.o diff --git a/drivers/misc/uid_cputime.c b/drivers/misc/uid_sys_stats.c similarity index 100% rename from drivers/misc/uid_cputime.c rename to drivers/misc/uid_sys_stats.c From 5a420edf10dc41f907ba5b8175a9befde3219096 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Tue, 17 Jan 2017 17:26:07 -0800 Subject: [PATCH 0688/3220] ANDROID: uid_sys_stats: allow writing same state Signed-off-by: Jin Qian Bug: 34360629 Change-Id: Ia748351e07910b1febe54f0484ca1be58c4eb9c7 --- drivers/misc/uid_sys_stats.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index f5135bb01a7a..5f82d17e7ad9 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -339,11 +339,16 @@ static ssize_t uid_procstat_write(struct file *file, mutex_lock(&uid_lock); uid_entry = find_or_register_uid(uid); - if (!uid_entry || uid_entry->state == state) { + if (!uid_entry) { mutex_unlock(&uid_lock); return -EINVAL; } + if (uid_entry->state == state) { + mutex_unlock(&uid_lock); + return count; + } + update_io_stats_locked(); uid_entry->state = state; From 5c866b0f8a2ff6d75fece4463fb9162ca43a4a3a Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Tue, 28 Feb 2017 15:09:42 -0800 Subject: [PATCH 0689/3220] ANDROID: uid_sys_stats: fix negative write bytes. A task can cancel writes made by other tasks. In rare cases, cancelled_write_bytes is larger than write_bytes if the task itself didn't make any write. This doesn't affect total size but may cause confusion when looking at IO usage on individual tasks. 
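A worked example makes the wraparound concrete: the io accounting fields are u64, so if a task wrote nothing itself but cancelled 4096 bytes dirtied by others, the naive subtraction wraps to an enormous value instead of going negative. A standalone sketch, with values invented for illustration:

#include <stdio.h>

int main(void)
{
	unsigned long long write_bytes = 0, cancelled_write_bytes = 4096;

	/* Naive: u64 underflow wraps to 2^64 - 4096. */
	printf("naive:   %llu\n", write_bytes - cancelled_write_bytes);

	/* Clamped to zero, as compute_write_bytes() below does. */
	printf("clamped: %llu\n",
	       write_bytes <= cancelled_write_bytes ?
			0ULL : write_bytes - cancelled_write_bytes);
	return 0;
}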
Bug: 35851986 Change-Id: If6cb549aeef9e248e18d804293401bb2b91918ca Signed-off-by: Jin Qian --- drivers/misc/uid_sys_stats.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 5f82d17e7ad9..7b746acf416c 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -203,14 +203,21 @@ static const struct file_operations uid_remove_fops = { .write = uid_remove_write, }; +static u64 compute_write_bytes(struct task_struct *task) +{ + if (task->ioac.write_bytes <= task->ioac.cancelled_write_bytes) + return 0; + + return task->ioac.write_bytes - task->ioac.cancelled_write_bytes; +} + static void add_uid_io_curr_stats(struct uid_entry *uid_entry, struct task_struct *task) { struct io_stats *io_curr = &uid_entry->io[UID_STATE_TOTAL_CURR]; io_curr->read_bytes += task->ioac.read_bytes; - io_curr->write_bytes += - task->ioac.write_bytes - task->ioac.cancelled_write_bytes; + io_curr->write_bytes += compute_write_bytes(task); io_curr->rchar += task->ioac.rchar; io_curr->wchar += task->ioac.wchar; } @@ -221,8 +228,7 @@ static void clean_uid_io_last_stats(struct uid_entry *uid_entry, struct io_stats *io_last = &uid_entry->io[UID_STATE_TOTAL_LAST]; io_last->read_bytes -= task->ioac.read_bytes; - io_last->write_bytes -= - task->ioac.write_bytes - task->ioac.cancelled_write_bytes; + io_last->write_bytes -= compute_write_bytes(task); io_last->rchar -= task->ioac.rchar; io_last->wchar -= task->ioac.wchar; } From a4f5f251e951d2e6f454f82c58e7044f7de87a0d Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Thu, 2 Mar 2017 13:32:59 -0800 Subject: [PATCH 0690/3220] ANDROID: sched: add a counter to track fsync Change-Id: I6c138de5b2332eea70f57e098134d1d141247b3f Signed-off-by: Jin Qian --- fs/sync.c | 1 + include/linux/sched.h | 8 ++++++++ include/linux/task_io_accounting.h | 2 ++ include/linux/task_io_accounting_ops.h | 1 + 4 files changed, 12 insertions(+) diff --git a/fs/sync.c b/fs/sync.c index dd5d1711c7ac..452179e31c39 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -218,6 +218,7 @@ static int do_fsync(unsigned int fd, int datasync) if (f.file) { ret = vfs_fsync(f.file, datasync); fdput(f); + inc_syscfs(current); } return ret; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 8be9f0dbdd0c..5b250c9f7718 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3227,6 +3227,11 @@ static inline void inc_syscw(struct task_struct *tsk) { tsk->ioac.syscw++; } + +static inline void inc_syscfs(struct task_struct *tsk) +{ + tsk->ioac.syscfs++; +} #else static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { @@ -3243,6 +3248,9 @@ static inline void inc_syscr(struct task_struct *tsk) static inline void inc_syscw(struct task_struct *tsk) { } +static inline void inc_syscfs(struct task_struct *tsk) +{ +} #endif #ifndef TASK_SIZE_OF diff --git a/include/linux/task_io_accounting.h b/include/linux/task_io_accounting.h index bdf855c2856f..2dd338fdf881 100644 --- a/include/linux/task_io_accounting.h +++ b/include/linux/task_io_accounting.h @@ -18,6 +18,8 @@ struct task_io_accounting { u64 syscr; /* # of write syscalls */ u64 syscw; + /* # of fsync syscalls */ + u64 syscfs; #endif /* CONFIG_TASK_XACCT */ #ifdef CONFIG_TASK_IO_ACCOUNTING diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h index 4d090f9ee608..1b505c804af3 100644 --- a/include/linux/task_io_accounting_ops.h +++ b/include/linux/task_io_accounting_ops.h @@ -96,6 +96,7 @@ static inline void 
task_chr_io_accounting_add(struct task_io_accounting *dst, dst->wchar += src->wchar; dst->syscr += src->syscr; dst->syscw += src->syscw; + dst->syscfs += src->syscfs; } #else static inline void task_chr_io_accounting_add(struct task_io_accounting *dst, From 3f7fac35ec629680d3cb3b83cc321dab736aa6cc Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Thu, 2 Mar 2017 13:39:43 -0800 Subject: [PATCH 0691/3220] ANDROID: uid_sys_stats: account for fsync syscalls Change-Id: Ie888d8a0f4ec7a27dea86dc4afba8e6fd4203488 Signed-off-by: Jin Qian --- drivers/misc/uid_sys_stats.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 7b746acf416c..4988e323cf02 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -39,6 +39,7 @@ struct io_stats { u64 write_bytes; u64 rchar; u64 wchar; + u64 fsync; }; #define UID_STATE_FOREGROUND 0 @@ -220,6 +221,7 @@ static void add_uid_io_curr_stats(struct uid_entry *uid_entry, io_curr->write_bytes += compute_write_bytes(task); io_curr->rchar += task->ioac.rchar; io_curr->wchar += task->ioac.wchar; + io_curr->fsync += task->ioac.syscfs; } static void clean_uid_io_last_stats(struct uid_entry *uid_entry, @@ -231,6 +233,7 @@ static void clean_uid_io_last_stats(struct uid_entry *uid_entry, io_last->write_bytes -= compute_write_bytes(task); io_last->rchar -= task->ioac.rchar; io_last->wchar -= task->ioac.wchar; + io_last->fsync -= task->ioac.syscfs; } static void update_io_stats_locked(void) @@ -267,11 +270,13 @@ static void update_io_stats_locked(void) io_curr->write_bytes - io_last->write_bytes; io_bucket->rchar += io_curr->rchar - io_last->rchar; io_bucket->wchar += io_curr->wchar - io_last->wchar; + io_bucket->fsync += io_curr->fsync - io_last->fsync; io_last->read_bytes = io_curr->read_bytes; io_last->write_bytes = io_curr->write_bytes; io_last->rchar = io_curr->rchar; io_last->wchar = io_curr->wchar; + io_last->fsync = io_curr->fsync; } } @@ -285,7 +290,7 @@ static int uid_io_show(struct seq_file *m, void *v) update_io_stats_locked(); hash_for_each(hash_table, bkt, uid_entry, hash) { - seq_printf(m, "%d %llu %llu %llu %llu %llu %llu %llu %llu\n", + seq_printf(m, "%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", uid_entry->uid, uid_entry->io[UID_STATE_FOREGROUND].rchar, uid_entry->io[UID_STATE_FOREGROUND].wchar, @@ -294,7 +299,9 @@ static int uid_io_show(struct seq_file *m, void *v) uid_entry->io[UID_STATE_BACKGROUND].rchar, uid_entry->io[UID_STATE_BACKGROUND].wchar, uid_entry->io[UID_STATE_BACKGROUND].read_bytes, - uid_entry->io[UID_STATE_BACKGROUND].write_bytes); + uid_entry->io[UID_STATE_BACKGROUND].write_bytes, + uid_entry->io[UID_STATE_FOREGROUND].fsync, + uid_entry->io[UID_STATE_BACKGROUND].fsync); } mutex_unlock(&uid_lock); From 3009d5325611867c6a185272aaf9b733946d8852 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 1 Mar 2017 17:04:41 -0800 Subject: [PATCH 0692/3220] ANDROID: sdcardfs: Fix case insensitive lookup The previous case insensitive lookup relied on the entry being present in the dcache. This instead uses iterate_dir to find the correct case. 
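The approach relies on the dir_context actor contract: iterate_dir() calls the actor once per directory entry, and a non-zero return value stops the walk. A stripped-down sketch of that contract follows; the patch's real actor, sdcardfs_name_match(), additionally copies the matched name out so the lookup can be retried with the on-disk spelling.

struct match_ctx {
	struct dir_context ctx;	/* embedded; recovered via container_of() */
	const struct qstr *want;
	bool found;
};

static int match_actor(struct dir_context *ctx, const char *name, int namelen,
		       loff_t offset, u64 ino, unsigned int d_type)
{
	struct match_ctx *m = container_of(ctx, struct match_ctx, ctx);
	struct qstr candidate = QSTR_INIT(name, namelen);

	if (qstr_case_eq(m->want, &candidate)) {
		m->found = true;
		return 1;	/* non-zero stops iterate_dir() */
	}
	return 0;		/* keep scanning */
}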
Signed-off-by: Daniel Rosenberg bug: 35633782 Change-Id: I556f7090773468c1943c89a5e2aa07f746ba49c5 --- fs/sdcardfs/lookup.c | 68 +++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 17 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 6b595e892316..bbfb0775c700 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -206,6 +206,28 @@ out: return err; } +struct sdcardfs_name_data { + struct dir_context ctx; + const struct qstr *to_find; + char *name; + bool found; +}; + +static int sdcardfs_name_match(struct dir_context *ctx, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct sdcardfs_name_data *buf = container_of(ctx, struct sdcardfs_name_data, ctx); + struct qstr candidate = QSTR_INIT(name, namelen); + + if (qstr_case_eq(buf->to_find, &candidate)) { + memcpy(buf->name, name, namelen); + buf->name[namelen] = 0; + buf->found = true; + return 1; + } + return 0; +} + /* * Main driver function for sdcardfs's lookup. * @@ -242,27 +264,39 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, &lower_path); /* check for other cases */ if (err == -ENOENT) { - struct dentry *child; - struct dentry *match = NULL; - mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); - spin_lock(&lower_dir_dentry->d_lock); - list_for_each_entry(child, &lower_dir_dentry->d_subdirs, d_child) { - if (child && d_inode(child)) { - if (qstr_case_eq(&child->d_name, name)) { - match = dget(child); - break; - } - } + struct file *file; + const struct cred *cred = current_cred(); + + struct sdcardfs_name_data buffer = { + .ctx.actor = sdcardfs_name_match, + .to_find = name, + .name = __getname(), + .found = false, + }; + + if (!buffer.name) { + err = -ENOMEM; + goto out; } - spin_unlock(&lower_dir_dentry->d_lock); - mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); - if (match) { + file = dentry_open(lower_parent_path, O_RDONLY, cred); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto put_name; + } + err = iterate_dir(file, &buffer.ctx); + fput(file); + if (err) + goto put_name; + + if (buffer.found) err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, - match->d_name.name, 0, + buffer.name, 0, &lower_path); - dput(match); - } + else + err = -ENOENT; +put_name: + __putname(buffer.name); } /* no error: handle positive dentries */ From 6693b9450021aebb1f00e6dd05e6369420acdec2 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 2 Mar 2017 18:07:21 -0800 Subject: [PATCH 0693/3220] ANDROID: sdcardfs: rate limit warning print Signed-off-by: Daniel Rosenberg Bug: 35848445 Change-Id: Ida72ea0ece191b2ae4a8babae096b2451eb563f6 --- fs/sdcardfs/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 68e615045616..aa85fcc8ec1a 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -20,6 +20,7 @@ #include "sdcardfs.h" #include +#include /* Do not directly use this function. Use OVERRIDE_CRED() instead. */ const struct cred * override_fsids(struct sdcardfs_sb_info* sbi, struct sdcardfs_inode_info *info) @@ -599,7 +600,7 @@ static const char *sdcardfs_follow_link(struct dentry *dentry, void **cookie) static int sdcardfs_permission_wrn(struct inode *inode, int mask) { - WARN(1, "sdcardfs does not support permission. Use permission2.\n"); + WARN_RATELIMIT(1, "sdcardfs does not support permission. 
Use permission2.\n"); return -EINVAL; } @@ -684,7 +685,7 @@ static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int ma static int sdcardfs_setattr_wrn(struct dentry *dentry, struct iattr *ia) { - WARN(1, "sdcardfs does not support setattr. User setattr2.\n"); + WARN_RATELIMIT(1, "sdcardfs does not support setattr. User setattr2.\n"); return -EINVAL; } From 650cf58edf0091e149546a75404517e1481c3070 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 2 Mar 2017 15:11:27 -0800 Subject: [PATCH 0694/3220] ANDROID: sdcardfs: Replace get/put with d_lock dput cannot be called with a spin_lock. Instead, we protect our accesses by holding the d_lock. Signed-off-by: Daniel Rosenberg Bug: 35643557 Change-Id: I22cf30856d75b5616cbb0c223724f5ab866b5114 --- fs/sdcardfs/derived_perm.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index ca239a942065..c0d5a9d72bde 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -261,40 +261,48 @@ static int needs_fixup(perm_t perm) { return 0; } -void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit) { +static void __fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit, int depth) +{ struct dentry *child; struct sdcardfs_inode_info *info; - if (!dget(dentry)) - return; + + /* + * All paths will terminate their recursion on hitting PERM_ANDROID_OBB, + * PERM_ANDROID_MEDIA, or PERM_ANDROID_DATA. This happens at a depth of + * at most 3. + */ + WARN(depth > 3, "%s: Max expected depth exceeded!\n", __func__); + spin_lock_nested(&dentry->d_lock, depth); if (!d_inode(dentry)) { - dput(dentry); + spin_unlock(&dentry->d_lock); return; } info = SDCARDFS_I(d_inode(dentry)); if (needs_fixup(info->perm)) { - spin_lock(&dentry->d_lock); list_for_each_entry(child, &dentry->d_subdirs, d_child) { - dget(child); + spin_lock_nested(&child->d_lock, depth + 1); if (!(limit->flags & BY_NAME) || !strncasecmp(child->d_name.name, limit->name, limit->length)) { if (d_inode(child)) { get_derived_permission(dentry, child); fixup_tmp_permissions(d_inode(child)); - dput(child); + spin_unlock(&child->d_lock); break; } } - dput(child); + spin_unlock(&child->d_lock); } - spin_unlock(&dentry->d_lock); } else if (descendant_may_need_fixup(info, limit)) { - spin_lock(&dentry->d_lock); list_for_each_entry(child, &dentry->d_subdirs, d_child) { - fixup_perms_recursive(child, limit); + __fixup_perms_recursive(child, limit, depth + 1); } - spin_unlock(&dentry->d_lock); } - dput(dentry); + spin_unlock(&dentry->d_lock); +} + +void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit) +{ + __fixup_perms_recursive(dentry, limit, 0); } void drop_recursive(struct dentry *parent) { From 371536f070bfb173575a5e69ca475eb7ab34c9f3 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 8 Mar 2017 17:11:51 -0800 Subject: [PATCH 0695/3220] ANDROID: sdcardfs: Use spin_lock_nested Signed-off-by: Daniel Rosenberg Bug: 36007653 Change-Id: I805d5afec797669679853fb2bb993ee38e6276e4 --- fs/sdcardfs/dentry.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index 971928ab6c21..e6f8e9edf8a9 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -76,10 +76,10 @@ static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags) if (dentry < lower_dentry) { spin_lock(&dentry->d_lock); - spin_lock(&lower_dentry->d_lock); + 
spin_lock_nested(&lower_dentry->d_lock, DENTRY_D_LOCK_NESTED); } else { spin_lock(&lower_dentry->d_lock); - spin_lock(&dentry->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); } if (dentry->d_name.len != lower_dentry->d_name.len) { From 2c917ce67144478ef0b5aa6c0c812e4abbfcd6c0 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 8 Mar 2017 17:20:02 -0800 Subject: [PATCH 0696/3220] ANDROID: sdcardfs: Switch to internal case insensitive compare There were still a few places where we called into a case insensitive lookup that was not defined by sdcardfs. Moving them all to the same place will allow us to switch the implementation in the future. Additionally, the check in fixup_perms_recursive did not take into account the length of both strings, causing extraneous matches when the name we were looking for was a prefix of the child name. Signed-off-by: Daniel Rosenberg Change-Id: I45ce768cd782cb4ea1ae183772781387c590ecc2 --- fs/sdcardfs/dentry.c | 8 ++------ fs/sdcardfs/derived_perm.c | 2 +- fs/sdcardfs/packagelist.c | 6 ++---- fs/sdcardfs/sdcardfs.h | 8 ++++++-- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index e6f8e9edf8a9..64494a50e250 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -82,11 +82,7 @@ static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags) spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); } - if (dentry->d_name.len != lower_dentry->d_name.len) { - __d_drop(dentry); - err = 0; - } else if (strncasecmp(dentry->d_name.name, lower_dentry->d_name.name, - dentry->d_name.len) != 0) { + if (!qstr_case_eq(&dentry->d_name, &lower_dentry->d_name)) { __d_drop(dentry); err = 0; } @@ -166,7 +162,7 @@ static int sdcardfs_cmp_ci(const struct dentry *parent, } */ if (name->len == len) { - if (strncasecmp(name->name, str, len) == 0) + if (str_n_case_eq(name->name, str, len)) return 0; } return 1; diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index c0d5a9d72bde..925692f0d20c 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -282,7 +282,7 @@ static void __fixup_perms_recursive(struct dentry *dentry, struct limit_search * if (needs_fixup(info->perm)) { list_for_each_entry(child, &dentry->d_subdirs, d_child) { spin_lock_nested(&child->d_lock, depth + 1); - if (!(limit->flags & BY_NAME) || !strncasecmp(child->d_name.name, limit->name, limit->length)) { + if (!(limit->flags & BY_NAME) || qstr_case_eq(&child->d_name, &limit->name)) { if (d_inode(child)) { get_derived_permission(dentry, child); fixup_tmp_permissions(d_inode(child)); diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 56d643f4a9ee..68f8f4571615 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -251,8 +251,7 @@ static void fixup_all_perms_name(const struct qstr *key) struct sdcardfs_sb_info *sbinfo; struct limit_search limit = { .flags = BY_NAME, - .name = key->name, - .length = key->len, + .name = QSTR_INIT(key->name, key->len), }; list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { if (sbinfo_has_sdcard_magic(sbinfo)) @@ -265,8 +264,7 @@ static void fixup_all_perms_name_userid(const struct qstr *key, userid_t userid) struct sdcardfs_sb_info *sbinfo; struct limit_search limit = { .flags = BY_NAME | BY_USERID, - .name = key->name, - .length = key->len, + .name = QSTR_INIT(key->name, key->len), .userid = userid, }; list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { diff --git a/fs/sdcardfs/sdcardfs.h 
b/fs/sdcardfs/sdcardfs.h index 042f989f0bea..0778eb063b63 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -470,8 +470,7 @@ extern void packagelist_exit(void); #define BY_USERID (1 << 1) struct limit_search { unsigned int flags; - const char *name; - size_t length; + struct qstr name; userid_t userid; }; @@ -612,6 +611,11 @@ static inline bool str_case_eq(const char *s1, const char *s2) return !strcasecmp(s1, s2); } +static inline bool str_n_case_eq(const char *s1, const char *s2, size_t len) +{ + return !strncasecmp(s1, s2, len); +} + static inline bool qstr_case_eq(const struct qstr *q1, const struct qstr *q2) { return q1->len == q2->len && str_case_eq(q1->name, q2->name); From a5504f851af9c9897b7cde886650fb56c7e6443f Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 8 Mar 2017 17:45:46 -0800 Subject: [PATCH 0697/3220] ANDROID: sdcardfs: Use d_invalidate instead of drop_recursive drop_recursive did not properly remove stale dentries. Instead, we use the vfs's d_invalidate, which does the proper cleanup. Additionally, remove the now-unused drop_recursive and fixup_top_recursive. Signed-off-by: Daniel Rosenberg Change-Id: Ibff61b0c34b725b024a050169047a415bc90f0d8 --- fs/sdcardfs/derived_perm.c | 38 -------------------------------------- fs/sdcardfs/inode.c | 2 +- fs/sdcardfs/sdcardfs.h | 2 -- 3 files changed, 1 insertion(+), 41 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 925692f0d20c..4b365b95b437 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -305,44 +305,6 @@ void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit) __fixup_perms_recursive(dentry, limit, 0); } -void drop_recursive(struct dentry *parent) { - struct dentry *dentry; - struct sdcardfs_inode_info *info; - if (!d_inode(parent)) - return; - info = SDCARDFS_I(d_inode(parent)); - spin_lock(&parent->d_lock); - list_for_each_entry(dentry, &parent->d_subdirs, d_child) { - if (d_inode(dentry)) { - if (SDCARDFS_I(d_inode(parent))->top != SDCARDFS_I(d_inode(dentry))->top) { - drop_recursive(dentry); - d_drop(dentry); - } - } - } - spin_unlock(&parent->d_lock); -} - -void fixup_top_recursive(struct dentry *parent) { - struct dentry *dentry; - struct sdcardfs_inode_info *info; - - if (!d_inode(parent)) - return; - info = SDCARDFS_I(d_inode(parent)); - spin_lock(&parent->d_lock); - list_for_each_entry(dentry, &parent->d_subdirs, d_child) { - if (d_inode(dentry)) { - if (SDCARDFS_I(d_inode(parent))->top != SDCARDFS_I(d_inode(dentry))->top) { - get_derived_permission(parent, dentry); - fixup_tmp_permissions(d_inode(dentry)); - fixup_top_recursive(dentry); - } - } - } - spin_unlock(&parent->d_lock); -} - /* main function for updating derived permission */ inline void update_derived_permission_lock(struct dentry *dentry) { diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index aa85fcc8ec1a..8d0875ff5710 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -529,7 +529,7 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, get_derived_permission_new(new_dentry->d_parent, old_dentry, &new_dentry->d_name); fixup_tmp_permissions(d_inode(old_dentry)); fixup_lower_ownership(old_dentry, new_dentry->d_name.name); - drop_recursive(old_dentry); /* Can't fixup ownership recursively :( */ + d_invalidate(old_dentry); /* Can't fixup ownership recursively :( */ out: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); dput(lower_old_dir_dentry); diff --git 
a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 0778eb063b63..e28a7dd47ebb 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -478,8 +478,6 @@ extern void setup_derived_state(struct inode *inode, perm_t perm, userid_t useri uid_t uid, bool under_android, struct inode *top); extern void get_derived_permission(struct dentry *parent, struct dentry *dentry); extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const struct qstr *name); -extern void drop_recursive(struct dentry *parent); -extern void fixup_top_recursive(struct dentry *parent); extern void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit); extern void update_derived_permission_lock(struct dentry *dentry); From f84495e490a5ddf5f06632c7002f5e498dea6a9e Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 9 Mar 2017 18:12:16 -0800 Subject: [PATCH 0698/3220] ANDROID: sdcardfs: Get the blocksize from the lower fs This changes sdcardfs to be more in line with the getattr in wrapfs, which calls the lower fs's getattr to get the block size Signed-off-by: Daniel Rosenberg Bug: 34723223 Change-Id: I1c9e16604ba580a8cdefa17f02dcc489d7351aed --- fs/sdcardfs/inode.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 8d0875ff5710..f713fad909d8 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -856,9 +856,7 @@ static int sdcardfs_fillattr(struct vfsmount *mnt, struct inode *inode, struct k static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct dentry *lower_dentry; - struct inode *inode; - struct inode *lower_inode; + struct kstat lower_stat; struct path lower_path; struct dentry *parent; int err; @@ -873,16 +871,15 @@ static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, } dput(parent); - inode = d_inode(dentry); - sdcardfs_get_lower_path(dentry, &lower_path); - lower_dentry = lower_path.dentry; - lower_inode = sdcardfs_lower_inode(inode); - - sdcardfs_copy_and_fix_attrs(inode, lower_inode); - fsstack_copy_inode_size(inode, lower_inode); - - err = sdcardfs_fillattr(mnt, inode, stat); + err = vfs_getattr(&lower_path, &lower_stat); + if (err) + goto out; + sdcardfs_copy_and_fix_attrs(d_inode(dentry), + d_inode(lower_path.dentry)); + err = sdcardfs_fillattr(mnt, d_inode(dentry), stat); + stat->blocks = lower_stat.blocks; +out: sdcardfs_put_lower_path(dentry, &lower_path); return err; } From 1e2f5dbfa3ab99eb764f5d0b1abc64946c3e0e2b Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 9 Mar 2017 20:59:18 -0800 Subject: [PATCH 0699/3220] ANDROID: sdcardfs: declare MODULE_ALIAS_FS From commit ee616b78aa87 ("Wrapfs: declare MODULE_ALIAS_FS") Signed-off-by: Daniel Rosenberg bug: 35766959 Change-Id: Ia4728ab49d065b1d2eb27825046f14b97c328cba --- fs/sdcardfs/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 7a8eae29e44d..4e2aded8d1d9 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -432,6 +432,7 @@ static struct file_system_type sdcardfs_fs_type = { .kill_sb = sdcardfs_kill_sb, .fs_flags = 0, }; +MODULE_ALIAS_FS(SDCARDFS_NAME); static int __init init_sdcardfs_fs(void) { From 8dbb44c8aa4a47430b42500c3e8a482121162024 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 10 Mar 2017 12:39:42 -0800 Subject: [PATCH 0700/3220] ANDROID: sdcardfs: Use case insensitive hash function Case insensitive comparisons don't help us much if we hash to 
different buckets... Signed-off-by: Daniel Rosenberg bug: 36004503 Change-Id: I91e00dbcd860a709cbd4f7fd7fc6d855779f3285 --- fs/sdcardfs/packagelist.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 68f8f4571615..e72fe83f7837 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -20,6 +20,7 @@ #include "sdcardfs.h" #include +#include #include #include #include @@ -44,10 +45,18 @@ static DEFINE_HASHTABLE(ext_to_groupid, 8); static struct kmem_cache *hashtable_entry_cachep; +static unsigned int full_name_case_hash(const unsigned char *name, unsigned int len) +{ + unsigned long hash = init_name_hash(); + while (len--) + hash = partial_name_hash(tolower(*name++), hash); + return end_name_hash(hash); +} + static void inline qstr_init(struct qstr *q, const char *name) { q->name = name; q->len = strlen(q->name); - q->hash = full_name_hash(q->name, q->len); + q->hash = full_name_case_hash(q->name, q->len); } static inline int qstr_copy(const struct qstr *src, struct qstr *dest) { From 1a736af098a35e3eb42b28f92384cc18e47200bf Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 10 Mar 2017 13:54:30 -0800 Subject: [PATCH 0701/3220] ANDROID: sdcardfs: move path_put outside of spinlock Signed-off-by: Daniel Rosenberg Bug: 35643557 Change-Id: Ib279ebd7dd4e5884d184d67696a93e34993bc1ef --- fs/sdcardfs/derived_perm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 4b365b95b437..dba58d8c9d60 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -357,6 +357,8 @@ int is_obbpath_invalid(struct dentry *dent) struct sdcardfs_dentry_info *di = SDCARDFS_D(dent); struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dent->d_sb); char *path_buf, *obbpath_s; + int need_put = 0; + struct path lower_path; /* check the base obbpath has been changed. * this routine can check an uninitialized obb dentry as well. @@ -383,10 +385,13 @@ int is_obbpath_invalid(struct dentry *dent) } //unlock_dir(lower_parent); - path_put(&di->lower_path); + pathcpy(&lower_path, &di->lower_path); + need_put = 1; } } spin_unlock(&di->lock); + if (need_put) + path_put(&lower_path); return ret; } From f1e5d0086840d504b12acfb958f19be932bab7ad Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 10 Mar 2017 18:58:25 -0800 Subject: [PATCH 0702/3220] ANDROID: sdcardfs: Remove uninformative prints At best these prints do not provide useful information, and at worst, some allow userspace to abuse the kernel log. 
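To make the abuse vector concrete: before this patch, every EACCES path in open, lookup, and friends emitted a KERN_INFO line containing a caller-controlled name, so an unprivileged loop could flood the kernel log. A userspace sketch of the problem being closed (the path here is hypothetical):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* Each denied open() used to log a "check the caller's gid" line. */
	for (;;) {
		int fd = open("/storage/emulated/0/Android/data/denied",
			      O_RDONLY);
		if (fd >= 0)
			close(fd);
	}
}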
Signed-off-by: Daniel Rosenberg Bug: 36138424 Change-Id: I812c57cc6a22b37262935ab77f48f3af4c36827e --- fs/sdcardfs/derived_perm.c | 1 - fs/sdcardfs/file.c | 3 --- fs/sdcardfs/inode.c | 24 +----------------------- fs/sdcardfs/lookup.c | 3 --- 4 files changed, 1 insertion(+), 30 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index dba58d8c9d60..763f10340487 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -437,7 +437,6 @@ int setup_obb_dentry(struct dentry *dentry, struct path *lower_path) if(!err) { /* the obbpath base has been found */ - printk(KERN_INFO "sdcardfs: the sbi->obbpath is found\n"); pathcpy(lower_path, &obbpath); } else { /* if the sbi->obbpath is not available, we can optionally diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 23f8cd7f8877..0592facef704 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -217,9 +217,6 @@ static int sdcardfs_open(struct inode *inode, struct file *file) } if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { - printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " dentry: %s, task:%s\n", - __func__, dentry->d_name.name, current->comm); err = -EACCES; goto out_err; } diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index f713fad909d8..f19f11ea19fc 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -68,9 +68,6 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, struct fs_struct *copied_fs; if(!check_caller_access_to_name(dir, &dentry->d_name)) { - printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " dentry: %s, task:%s\n", - __func__, dentry->d_name.name, current->comm); err = -EACCES; goto out_eacces; } @@ -170,9 +167,6 @@ static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) const struct cred *saved_cred = NULL; if(!check_caller_access_to_name(dir, &dentry->d_name)) { - printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " dentry: %s, task:%s\n", - __func__, dentry->d_name.name, current->comm); err = -EACCES; goto out_eacces; } @@ -280,9 +274,6 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct qstr q_data = QSTR_LITERAL("data"); if(!check_caller_access_to_name(dir, &dentry->d_name)) { - printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " dentry: %s, task:%s\n", - __func__, dentry->d_name.name, current->comm); err = -EACCES; goto out_eacces; } @@ -392,9 +383,6 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) const struct cred *saved_cred = NULL; if(!check_caller_access_to_name(dir, &dentry->d_name)) { - printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " dentry: %s, task:%s\n", - __func__, dentry->d_name.name, current->comm); err = -EACCES; goto out_eacces; } @@ -481,9 +469,6 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, if(!check_caller_access_to_name(old_dir, &old_dentry->d_name) || !check_caller_access_to_name(new_dir, &new_dentry->d_name)) { - printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " new_dentry: %s, task:%s\n", - __func__, new_dentry->d_name.name, current->comm); err = -EACCES; goto out_eacces; } @@ -746,12 +731,8 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct if (!err) { /* check the Android group ID */ parent = dget_parent(dentry); - if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { - 
printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " dentry: %s, task:%s\n", - __func__, dentry->d_name.name, current->comm); + if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) err = -EACCES; - } dput(parent); } @@ -863,9 +844,6 @@ static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, parent = dget_parent(dentry); if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { - printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " dentry: %s, task:%s\n", - __func__, dentry->d_name.name, current->comm); dput(parent); return -EACCES; } diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index bbfb0775c700..fffb94c923c4 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -395,9 +395,6 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { ret = ERR_PTR(-EACCES); - printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " dentry: %s, task:%s\n", - __func__, dentry->d_name.name, current->comm); goto out_err; } From 003515b560db54d22424d91a5b5a626b42634ebd Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 13 Mar 2017 13:53:54 -0700 Subject: [PATCH 0703/3220] ANDROID: sdcardfs: Use tabs instead of spaces in multiuser.h Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: Ic7801914a7dd377e270647f81070020e1f0bab9b --- fs/sdcardfs/multiuser.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/sdcardfs/multiuser.h b/fs/sdcardfs/multiuser.h index 52bc20080904..ca141ff40b49 100644 --- a/fs/sdcardfs/multiuser.h +++ b/fs/sdcardfs/multiuser.h @@ -29,21 +29,21 @@ typedef uid_t userid_t; typedef uid_t appid_t; static inline uid_t multiuser_get_uid(userid_t user_id, appid_t app_id) { - return (user_id * AID_USER_OFFSET) + (app_id % AID_USER_OFFSET); + return (user_id * AID_USER_OFFSET) + (app_id % AID_USER_OFFSET); } static inline gid_t multiuser_get_cache_gid(userid_t user_id, appid_t app_id) { - if (app_id >= AID_APP_START && app_id <= AID_APP_END) { - return multiuser_get_uid(user_id, (app_id - AID_APP_START) + AID_CACHE_GID_START); - } else { - return -1; - } + if (app_id >= AID_APP_START && app_id <= AID_APP_END) { + return multiuser_get_uid(user_id, (app_id - AID_APP_START) + AID_CACHE_GID_START); + } else { + return -1; + } } static inline gid_t multiuser_get_ext_gid(userid_t user_id, appid_t app_id) { - if (app_id >= AID_APP_START && app_id <= AID_APP_END) { - return multiuser_get_uid(user_id, (app_id - AID_APP_START) + AID_EXT_GID_START); - } else { - return -1; - } + if (app_id >= AID_APP_START && app_id <= AID_APP_END) { + return multiuser_get_uid(user_id, (app_id - AID_APP_START) + AID_EXT_GID_START); + } else { + return -1; + } } From d0b44039aa8477151b4630aae61abb5764c981b8 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 13 Mar 2017 15:34:03 -0700 Subject: [PATCH 0704/3220] ANDROID: sdcardfs: Fix gid issue We were already calculating most of these values, and erroring out because the check was confused by this. Instead of recalculating, adjust it as needed. 
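For illustration only (not part of this patch): the old helpers took a
(userid, appid) pair, but the call sites in fixup_lower_ownership()
passed info->d_uid, which is already a full derived uid, where an appid
was expected, so the AID_APP_START..AID_APP_END range check failed for
any secondary user. A worked example, assuming the usual AOSP constants
(taken from android_filesystem_config.h; an assumption here, not part
of the patch):

#include <assert.h>

#define AID_APP_START        10000	/* assumed AOSP constants */
#define AID_CACHE_GID_START  20000
#define AID_USER_OFFSET     100000

static unsigned int multiuser_get_uid(unsigned int user_id, unsigned int app_id)
{
	return user_id * AID_USER_OFFSET + app_id % AID_USER_OFFSET;
}

/* New-style helper: the uid already encodes the user, so only the
 * constant offset into the cache gid range needs to be applied. */
static unsigned int multiuser_get_cache_gid(unsigned int uid)
{
	return uid - AID_APP_START + AID_CACHE_GID_START;
}

int main(void)
{
	/* user 1, app 10005 -> d_uid 110005 */
	unsigned int d_uid = multiuser_get_uid(1, 10005);

	/* matches the old formula multiuser_get_uid(1, (10005 - 10000) + 20000) */
	assert(multiuser_get_cache_gid(d_uid) == 120005);
	return 0;
}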
Signed-off-by: Daniel Rosenberg
Bug: 36160015
Change-Id: I9caf3e2fd32ca2e37ff8ed71b1d392f1761bc9a9
---
 fs/sdcardfs/derived_perm.c | 4 ++--
 fs/sdcardfs/multiuser.h | 21 ++++++++-------------
 2 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c
index 763f10340487..28e9b8d42f9e 100644
--- a/fs/sdcardfs/derived_perm.c
+++ b/fs/sdcardfs/derived_perm.c
@@ -205,13 +205,13 @@ void fixup_lower_ownership(struct dentry* dentry, const char *name) {
 		break;
 	case PERM_ANDROID_PACKAGE:
 		if (info->d_uid != 0)
-			gid = multiuser_get_ext_gid(info->userid, info->d_uid);
+			gid = multiuser_get_ext_gid(info->d_uid);
 		else
 			gid = multiuser_get_uid(info->userid, uid);
 		break;
 	case PERM_ANDROID_PACKAGE_CACHE:
 		if (info->d_uid != 0)
-			gid = multiuser_get_cache_gid(info->userid, info->d_uid);
+			gid = multiuser_get_cache_gid(info->d_uid);
 		else
 			gid = multiuser_get_uid(info->userid, uid);
 		break;
diff --git a/fs/sdcardfs/multiuser.h b/fs/sdcardfs/multiuser.h
index ca141ff40b49..2e89b5872314 100644
--- a/fs/sdcardfs/multiuser.h
+++ b/fs/sdcardfs/multiuser.h
@@ -28,22 +28,17 @@
 typedef uid_t userid_t;
 typedef uid_t appid_t;
 
-static inline uid_t multiuser_get_uid(userid_t user_id, appid_t app_id) {
+static inline uid_t multiuser_get_uid(userid_t user_id, appid_t app_id)
+{
 	return (user_id * AID_USER_OFFSET) + (app_id % AID_USER_OFFSET);
 }
 
-static inline gid_t multiuser_get_cache_gid(userid_t user_id, appid_t app_id) {
-	if (app_id >= AID_APP_START && app_id <= AID_APP_END) {
-		return multiuser_get_uid(user_id, (app_id - AID_APP_START) + AID_CACHE_GID_START);
-	} else {
-		return -1;
-	}
+static inline gid_t multiuser_get_cache_gid(uid_t uid)
+{
+	return uid - AID_APP_START + AID_CACHE_GID_START;
 }
 
-static inline gid_t multiuser_get_ext_gid(userid_t user_id, appid_t app_id) {
-	if (app_id >= AID_APP_START && app_id <= AID_APP_END) {
-		return multiuser_get_uid(user_id, (app_id - AID_APP_START) + AID_EXT_GID_START);
-	} else {
-		return -1;
-	}
+static inline gid_t multiuser_get_ext_gid(uid_t uid)
+{
+	return uid - AID_APP_START + AID_EXT_GID_START;
 }

From 4b20ed9406bc88023353f04e07788ed4c3846aa2 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Tue, 14 Mar 2017 15:39:05 -0700
Subject: [PATCH 0705/3220] ANDROID: vfs: use permission2 in notify_change2

This allows filesystems to use their mount private data to influence
the permissions they use when attempting to touch.

Signed-off-by: Daniel Rosenberg
Bug: 36228261
Change-Id: I1052319ba1c3ce5d5e586aa7f8a80c08851a5c7f
---
 fs/attr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/attr.c b/fs/attr.c
index 11be2265a2d5..c86b37c38fb7 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -211,7 +211,7 @@ int notify_change2(struct vfsmount *mnt, struct dentry * dentry, struct iattr *
 		return -EPERM;
 
 	if (!inode_owner_or_capable(inode)) {
-		error = inode_permission(inode, MAY_WRITE);
+		error = inode_permission2(mnt, inode, MAY_WRITE);
 		if (error)
 			return error;
 	}

From 75a7736c2ad9b019ac131f124f53f3c194bf3c95 Mon Sep 17 00:00:00 2001
From: Wei Wang
Date: Mon, 13 Mar 2017 12:22:21 -0700
Subject: [PATCH 0706/3220] uid_sys_stats: change to use rt_mutex

We see this happen multiple times under heavy workload in systrace,
with AMS stuck on uid_lock.
Running process: Process 953 Running thread: android.ui State: Uninterruptible Sleep Start: 1,025.628 ms Duration: 27,955.949 ms On CPU: Running instead: system_server Args: {kernel callsite when blocked:: "uid_procstat_write+0xb8/0x144"} Changing to rt_mutex can mitigate the priority inversion Bug: 34991231 Bug: 34193533 Test: on marlin Change-Id: I28eb3971331cea60b1075740c792ab87d103262c Signed-off-by: Wei Wang --- drivers/misc/uid_sys_stats.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 4988e323cf02..204b23484266 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -29,7 +30,7 @@ #define UID_HASH_BITS 10 DECLARE_HASHTABLE(hash_table, UID_HASH_BITS); -static DEFINE_MUTEX(uid_lock); +static DEFINE_RT_MUTEX(uid_lock); static struct proc_dir_entry *cpu_parent; static struct proc_dir_entry *io_parent; static struct proc_dir_entry *proc_parent; @@ -98,7 +99,7 @@ static int uid_cputime_show(struct seq_file *m, void *v) cputime_t stime; unsigned long bkt; - mutex_lock(&uid_lock); + rt_mutex_lock(&uid_lock); hash_for_each(hash_table, bkt, uid_entry, hash) { uid_entry->active_stime = 0; @@ -111,7 +112,7 @@ static int uid_cputime_show(struct seq_file *m, void *v) current_user_ns(), task_uid(task))); if (!uid_entry) { read_unlock(&tasklist_lock); - mutex_unlock(&uid_lock); + rt_mutex_unlock(&uid_lock); pr_err("%s: failed to find the uid_entry for uid %d\n", __func__, from_kuid_munged(current_user_ns(), task_uid(task))); @@ -135,7 +136,7 @@ static int uid_cputime_show(struct seq_file *m, void *v) cputime_to_jiffies(total_stime)) * USEC_PER_MSEC); } - mutex_unlock(&uid_lock); + rt_mutex_unlock(&uid_lock); return 0; } @@ -182,7 +183,7 @@ static ssize_t uid_remove_write(struct file *file, kstrtol(end_uid, 10, &uid_end) != 0) { return -EINVAL; } - mutex_lock(&uid_lock); + rt_mutex_lock(&uid_lock); for (; uid_start <= uid_end; uid_start++) { hash_for_each_possible_safe(hash_table, uid_entry, tmp, @@ -194,7 +195,7 @@ static ssize_t uid_remove_write(struct file *file, } } - mutex_unlock(&uid_lock); + rt_mutex_unlock(&uid_lock); return count; } @@ -243,7 +244,7 @@ static void update_io_stats_locked(void) struct io_stats *io_bucket, *io_curr, *io_last; unsigned long bkt; - BUG_ON(!mutex_is_locked(&uid_lock)); + BUG_ON(!rt_mutex_is_locked(&uid_lock)); hash_for_each(hash_table, bkt, uid_entry, hash) memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, @@ -285,7 +286,7 @@ static int uid_io_show(struct seq_file *m, void *v) struct uid_entry *uid_entry; unsigned long bkt; - mutex_lock(&uid_lock); + rt_mutex_lock(&uid_lock); update_io_stats_locked(); @@ -304,7 +305,7 @@ static int uid_io_show(struct seq_file *m, void *v) uid_entry->io[UID_STATE_BACKGROUND].fsync); } - mutex_unlock(&uid_lock); + rt_mutex_unlock(&uid_lock); return 0; } @@ -349,16 +350,16 @@ static ssize_t uid_procstat_write(struct file *file, if (state != UID_STATE_BACKGROUND && state != UID_STATE_FOREGROUND) return -EINVAL; - mutex_lock(&uid_lock); + rt_mutex_lock(&uid_lock); uid_entry = find_or_register_uid(uid); if (!uid_entry) { - mutex_unlock(&uid_lock); + rt_mutex_unlock(&uid_lock); return -EINVAL; } if (uid_entry->state == state) { - mutex_unlock(&uid_lock); + rt_mutex_unlock(&uid_lock); return count; } @@ -366,7 +367,7 @@ static ssize_t uid_procstat_write(struct file *file, uid_entry->state = state; - 
mutex_unlock(&uid_lock); + rt_mutex_unlock(&uid_lock); return count; } @@ -388,7 +389,7 @@ static int process_notifier(struct notifier_block *self, if (!task) return NOTIFY_OK; - mutex_lock(&uid_lock); + rt_mutex_lock(&uid_lock); uid = from_kuid_munged(current_user_ns(), task_uid(task)); uid_entry = find_or_register_uid(uid); if (!uid_entry) { @@ -404,7 +405,7 @@ static int process_notifier(struct notifier_block *self, clean_uid_io_last_stats(uid_entry, task); exit: - mutex_unlock(&uid_lock); + rt_mutex_unlock(&uid_lock); return NOTIFY_OK; } From 870382b806a424c63a02b43a3195953b833395f3 Mon Sep 17 00:00:00 2001 From: Bowgo Tsai Date: Thu, 2 Mar 2017 18:54:15 +0800 Subject: [PATCH 0707/3220] ANDROID: dm: android-verity: allow disable dm-verity for Treble VTS To start Treble VTS test, a single AOSP system.img will be flashed onto the device. The size of AOSP system.img might be different than the system partition size on device, making locating verity metadata fail (at the last fixed size of the partition). This change allows disabling dm-verity on system partition when the device is unlocked (orange device state) with invalid metadata. BUG: 35603549 Test: boot device with a different-sized system.img, checks verity is not enabled via: "adb shell getprop | grep partition.system.verified" Change-Id: Ide78dca4eefde4ab019e4b202d3f590dcb1bb506 Signed-off-by: Bowgo Tsai --- drivers/md/dm-android-verity.c | 53 ++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index ec0a4d19ca3e..c3c9502baf18 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -115,6 +115,12 @@ static inline bool is_userdebug(void) return !strncmp(buildvariant, typeuserdebug, sizeof(typeuserdebug)); } +static inline bool is_unlocked(void) +{ + static const char unlocked[] = "orange"; + + return !strncmp(verifiedbootstate, unlocked, sizeof(unlocked)); +} static int table_extract_mpi_array(struct public_key_signature *pks, const void *data, size_t len) @@ -650,6 +656,28 @@ static int add_as_linear_device(struct dm_target *ti, char *dev) return err; } +static int create_linear_device(struct dm_target *ti, dev_t dev, + char *target_device) +{ + u64 device_size = 0; + int err = find_size(dev, &device_size); + + if (err) { + DMERR("error finding bdev size"); + handle_error(); + return err; + } + + ti->len = device_size; + err = add_as_linear_device(ti, target_device); + if (err) { + handle_error(); + return err; + } + verity_enabled = false; + return 0; +} + /* * Target parameters: * Key id of the public key in the system keyring. 
@@ -673,7 +701,6 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	struct fec_ecc_metadata uninitialized_var(ecc);
 	char buf[FEC_ARG_LENGTH], *buf_ptr;
 	unsigned long long tmpll;
-	u64 uninitialized_var(device_size);
 
 	if (argc == 1) {
 		/* Use the default keyid */
@@ -701,23 +728,8 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		return -EINVAL;
 	}
 
-	if (is_eng()) {
-		err = find_size(dev, &device_size);
-		if (err) {
-			DMERR("error finding bdev size");
-			handle_error();
-			return err;
-		}
-
-		ti->len = device_size;
-		err = add_as_linear_device(ti, target_device);
-		if (err) {
-			handle_error();
-			return err;
-		}
-		verity_enabled = false;
-		return 0;
-	}
+	if (is_eng())
+		return create_linear_device(ti, dev, target_device);
 
 	strreplace(key_id, '#', ' ');
 
@@ -732,6 +744,11 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	err = extract_metadata(dev, &fec, &metadata, &verity_enabled);
 	if (err) {
+		/* Allow invalid metadata when the device is unlocked */
+		if (is_unlocked()) {
+			DMWARN("Allow invalid metadata when unlocked");
+			return create_linear_device(ti, dev, target_device);
+		}
 		DMERR("Error while extracting metadata");
 		handle_error();
 		goto free_metadata;

From 152a401d269358de1875789a5d4e85ea8fc3775b Mon Sep 17 00:00:00 2001
From: yangdongdong
Date: Sat, 8 Aug 2015 11:59:59 +0800
Subject: [PATCH 0708/3220] ANDROID: power: align wakeup_sources format

This aligns every column in wakeup_sources, so that any specific
column can be conveniently scanned for suspicious wakeup sources
(for example, ones with high power consumption), and the output is
generally easier for humans to read.

Change-Id: Iac8b0538170fcc0cca9f6857c15d9a4c62c8865e
Signed-off-by: yangdongdong
---
 drivers/base/power/wakeup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 09c07f519952..0e494108c20c 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -1042,7 +1042,7 @@ static int print_wakeup_source_stats(struct seq_file *m,
 		active_time = ktime_set(0, 0);
 	}
 
-	seq_printf(m, "%-12s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n",
+	seq_printf(m, "%-32s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n",
 		   ws->name, active_count, ws->event_count,
 		   ws->wakeup_count, ws->expire_count,
 		   ktime_to_ms(active_time), ktime_to_ms(total_time),
@@ -1062,7 +1062,7 @@ static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
 {
 	struct wakeup_source *ws;
 
-	seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
+	seq_puts(m, "name\t\t\t\t\tactive_count\tevent_count\twakeup_count\t"
 		"expire_count\tactive_since\ttotal_time\tmax_time\t"
 		"last_change\tprevent_suspend_time\n");
 

From 2474d8bad0034d4c1143e10947c44a0d6b37dc92 Mon Sep 17 00:00:00 2001
From: Max Shi
Date: Fri, 26 Aug 2016 15:00:16 -0700
Subject: [PATCH 0709/3220] config: disable CONFIG_USELIB and CONFIG_FHANDLE

Turn off these two kernel configs to disable the related system call
ABIs.
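For illustration only (not part of this patch): both options are wired
up through cond_syscall(), so with CONFIG_USELIB=n and CONFIG_FHANDLE=n
the affected system calls still exist in the syscall table but fail
with ENOSYS. A userspace probe sketch, assuming an x86 target where
SYS_uselib is defined (arm64 never had uselib in its syscall table):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* With the configs disabled, both calls return -1 with
	 * errno == ENOSYS regardless of their arguments. */
	long ret = syscall(SYS_uselib, "/lib/libfoo.so");	/* path is arbitrary */
	printf("uselib: %ld (%s)\n", ret, strerror(errno));

	ret = syscall(SYS_open_by_handle_at, -1, NULL, 0);
	printf("open_by_handle_at: %ld (%s)\n", ret, strerror(errno));
	return 0;
}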
Bug: 30903194 Change-Id: I32e2ff3323135ce4b67a86f106fa9327a71fe309 Signed-off-by: Max Shi --- android/configs/android-base.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 2098fe97198c..d1f9628e4377 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -1,10 +1,12 @@ # KEEP ALPHABETICALLY SORTED # CONFIG_DEVKMEM is not set # CONFIG_DEVMEM is not set +# CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set # CONFIG_MODULES is not set # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set +# CONFIG_USELIB is not set CONFIG_ANDROID=y CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_LOW_MEMORY_KILLER=y From a5e2a1ddbc50f49135a381137fde4eaee12e5071 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Tue, 7 Mar 2017 10:37:56 -0800 Subject: [PATCH 0710/3220] ANDROID: sched: fix duplicate sched_group_energy const specifiers EAS uses "const struct sched_group_energy * const" fairly consistently. But a couple of places swap the "*" and second "const", making the pointer mutable. In the case of struct sched_group, "* const" would have been an error, since init_sched_energy() writes to sd->groups->sge. Change-Id: Ic6a8fcf99e65c0f25d9cc55c32625ef3ca5c9aca Signed-off-by: Greg Hackmann --- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3331f453a17f..83cfb72b2d95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4919,7 +4919,7 @@ long group_norm_util(struct energy_env *eenv, struct sched_group *sg) } static int find_new_capacity(struct energy_env *eenv, - const struct sched_group_energy const *sge) + const struct sched_group_energy * const sge) { int idx; unsigned long util = group_max_util(eenv); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2f2b959ad244..780522c65cea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -915,7 +915,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; - const struct sched_group_energy const *sge; + const struct sched_group_energy *sge; /* * The CPUs this group covers. From b631d8c06f64f744f272bedbe8839f8d687db6da Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 20 Mar 2017 14:06:27 -0700 Subject: [PATCH 0711/3220] ANDROID: android-verity: do not compile as independent module dm-android-verity depends on optional kernel command line parameters. When compiled as module the __setup macro ends up being a no-op resulting in the following warnings: /work/build/batch/drivers/md/dm-android-verity.c:91:19: warning: 'verity_buildvariant' defined but not used [-Wunused-function] static int __init verity_buildvariant(char *line) ^~~~~~~~~~~~~~~~~~~ /work/build/batch/drivers/md/dm-android-verity.c:83:19: warning: 'verity_keyid_param' defined but not used [-Wunused-function] static int __init verity_keyid_param(char *line) ^~~~~~~~~~~~~~~~~~ /work/build/batch/drivers/md/dm-android-verity.c:75:19: warning: 'verity_mode_param' defined but not used [-Wunused-function] static int __init verity_mode_param(char *line) ^~~~~~~~~~~~~~~~~ /work/build/batch/drivers/md/dm-android-verity.c:67:19: warning: 'verified_boot_state_param' defined but not used [-Wunused-function] static int __init verified_boot_state_param(char *line) ^~~~~~~~~~~~~~~~~~~~~~~~~ Tested with allmodconfig. 
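For illustration only (not part of this patch): the warnings follow
from how include/linux/init.h defines __setup(). Paraphrased, not
verbatim:

/* built into the kernel: the handler is recorded in the .init.setup
 * section and invoked while the early command line is parsed */
#ifndef MODULE
#define __setup(str, fn)	__setup_param(str, fn, fn, 0)
#else
/* built as a module: there is no kernel command line to parse at
 * module load time, so the macro expands to nothing and the __init
 * handler becomes an unreferenced static function, triggering
 * -Wunused-function */
#define __setup(str, fn)
#endif

Making the target a bool and forcing its dependencies to =y guarantees
the code is always built in, so the __setup() handlers are always
registered.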
Change-Id: Idfe0c97b216bb620cc7264e968b494eb3a765157 Signed-off-by: Badhri Jagan Sridharan --- drivers/md/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 3d237a03dab3..9eb08a43cd27 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -516,15 +516,15 @@ config DM_LOG_WRITES If unsure, say N. config DM_ANDROID_VERITY - tristate "Android verity target support" - depends on DM_VERITY + bool "Android verity target support" + depends on DM_VERITY=y depends on X509_CERTIFICATE_PARSER depends on SYSTEM_TRUSTED_KEYRING depends on PUBLIC_KEY_ALGO_RSA depends on KEYS depends on ASYMMETRIC_KEY_TYPE depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE - depends on MD_LINEAR + depends on MD_LINEAR=y select DM_VERITY_HASH_PREFETCH_MIN_SIZE_128 ---help--- This device-mapper target is virtually a VERITY target. This From bc5b6dd5dfd838f7338e8d1ae42f670adc1a4d03 Mon Sep 17 00:00:00 2001 From: Jungseung Lee Date: Thu, 22 Dec 2016 12:37:34 +0900 Subject: [PATCH 0712/3220] BACKPORT: mmc: core: Export device lifetime information through sysfs In the eMMC 5.0 version of the spec, several EXT_CSD fields about device lifetime are added. - Two types of estimated indications reflected by averaged wear out of memory - An indication reflected by average reserved blocks Export the information through sysfs. Signed-off-by: Jungseung Lee Reviewed-by: Jaehoon Chung Reviewed-by: Shawn Lin Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc.c | 12 ++++++++++++ include/linux/mmc/card.h | 3 +++ include/linux/mmc/mmc.h | 3 +++ 3 files changed, 18 insertions(+) diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 79a0c26e1419..0e3d7152c8af 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -592,6 +592,12 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd) card->ext_csd.ffu_capable = (ext_csd[EXT_CSD_SUPPORTED_MODE] & 0x1) && !(ext_csd[EXT_CSD_FW_CONFIG] & 0x1); + + card->ext_csd.pre_eol_info = ext_csd[EXT_CSD_PRE_EOL_INFO]; + card->ext_csd.device_life_time_est_typ_a = + ext_csd[EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_A]; + card->ext_csd.device_life_time_est_typ_b = + ext_csd[EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_B]; } out: return err; @@ -721,6 +727,10 @@ MMC_DEV_ATTR(manfid, "0x%06x\n", card->cid.manfid); MMC_DEV_ATTR(name, "%s\n", card->cid.prod_name); MMC_DEV_ATTR(oemid, "0x%04x\n", card->cid.oemid); MMC_DEV_ATTR(prv, "0x%x\n", card->cid.prv); +MMC_DEV_ATTR(pre_eol_info, "%02x\n", card->ext_csd.pre_eol_info); +MMC_DEV_ATTR(life_time, "0x%02x 0x%02x\n", + card->ext_csd.device_life_time_est_typ_a, + card->ext_csd.device_life_time_est_typ_b); MMC_DEV_ATTR(serial, "0x%08x\n", card->cid.serial); MMC_DEV_ATTR(enhanced_area_offset, "%llu\n", card->ext_csd.enhanced_area_offset); @@ -757,6 +767,8 @@ static struct attribute *mmc_std_attrs[] = { &dev_attr_name.attr, &dev_attr_oemid.attr, &dev_attr_prv.attr, + &dev_attr_pre_eol_info.attr, + &dev_attr_life_time.attr, &dev_attr_serial.attr, &dev_attr_enhanced_area_offset.attr, &dev_attr_enhanced_area_size.attr, diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index eb0151bac50c..8f23fb2c5ed2 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -118,6 +118,9 @@ struct mmc_ext_csd { u8 raw_pwr_cl_ddr_200_360; /* 253 */ u8 raw_bkops_status; /* 246 */ u8 raw_sectors[4]; /* 212 - 4 bytes */ + u8 pre_eol_info; /* 267 */ + u8 device_life_time_est_typ_a; /* 268 */ + u8 device_life_time_est_typ_b; /* 269 */ unsigned int feature_support; #define 
MMC_DISCARD_FEATURE BIT(0) /* CMD38 feature */ diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index 15f2c4a0a62c..2c6b1d45626e 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -330,6 +330,9 @@ struct _mmc_csd { #define EXT_CSD_CACHE_SIZE 249 /* RO, 4 bytes */ #define EXT_CSD_PWR_CL_DDR_200_360 253 /* RO */ #define EXT_CSD_FIRMWARE_VERSION 254 /* RO, 8 bytes */ +#define EXT_CSD_PRE_EOL_INFO 267 /* RO */ +#define EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_A 268 /* RO */ +#define EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_B 269 /* RO */ #define EXT_CSD_SUPPORTED_MODE 493 /* RO */ #define EXT_CSD_TAG_UNIT_SIZE 498 /* RO */ #define EXT_CSD_DATA_TAG_SUPPORT 499 /* RO */ From 659da09c3d7af03ea57dbb6320e7899d89a7e403 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Tue, 21 Mar 2017 11:43:02 -0700 Subject: [PATCH 0713/3220] ANDROID: mmc: core: export emmc revision Change-Id: If23bc838327ef751ee65baadc429e218eaa2a848 Signed-off-by: Jin Qian --- drivers/mmc/core/mmc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 0e3d7152c8af..83b9bf880834 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -727,6 +727,7 @@ MMC_DEV_ATTR(manfid, "0x%06x\n", card->cid.manfid); MMC_DEV_ATTR(name, "%s\n", card->cid.prod_name); MMC_DEV_ATTR(oemid, "0x%04x\n", card->cid.oemid); MMC_DEV_ATTR(prv, "0x%x\n", card->cid.prv); +MMC_DEV_ATTR(rev, "0x%x\n", card->ext_csd.rev); MMC_DEV_ATTR(pre_eol_info, "%02x\n", card->ext_csd.pre_eol_info); MMC_DEV_ATTR(life_time, "0x%02x 0x%02x\n", card->ext_csd.device_life_time_est_typ_a, @@ -767,6 +768,7 @@ static struct attribute *mmc_std_attrs[] = { &dev_attr_name.attr, &dev_attr_oemid.attr, &dev_attr_prv.attr, + &dev_attr_rev.attr, &dev_attr_pre_eol_info.attr, &dev_attr_life_time.attr, &dev_attr_serial.attr, From e953f89b8563efe0b92f70033d237874c43d4a3d Mon Sep 17 00:00:00 2001 From: Joel Scherpelz Date: Wed, 22 Mar 2017 18:19:04 +0900 Subject: [PATCH 0714/3220] net: ipv6: Add sysctl for minimum prefix len acceptable in RIOs. This commit adds a new sysctl accept_ra_rt_info_min_plen that defines the minimum acceptable prefix length of Route Information Options. The new sysctl is intended to be used together with accept_ra_rt_info_max_plen to configure a range of acceptable prefix lengths. It is useful to prevent misconfigurations from unintentionally blackholing too much of the IPv6 address space (e.g., home routers announcing RIOs for fc00::/7, which is incorrect). [backport of net-next bbea124bc99df968011e76eba105fe964a4eceab] Bug: 33333670 Test: net_test passes Signed-off-by: Joel Scherpelz Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 13 +++++++++++-- include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 10 ++++++++++ include/uapi/linux/sysctl.h | 1 + net/ipv6/addrconf.c | 10 ++++++++++ net/ipv6/ndisc.c | 2 ++ 6 files changed, 35 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 2042261408b9..5f1ea84ed72b 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1413,11 +1413,20 @@ accept_ra_pinfo - BOOLEAN Functional default: enabled if accept_ra is enabled. disabled if accept_ra is disabled. +accept_ra_rt_info_min_plen - INTEGER + Minimum prefix length of Route Information in RA. + + Route Information w/ prefix smaller than this variable shall + be ignored. + + Functional default: 0 if accept_ra_rtr_pref is enabled. 
+ -1 if accept_ra_rtr_pref is disabled. + accept_ra_rt_info_max_plen - INTEGER Maximum prefix length of Route Information in RA. - Route Information w/ prefix larger than or equal to this - variable shall be ignored. + Route Information w/ prefix larger than this variable shall + be ignored. Functional default: 0 if accept_ra_rtr_pref is enabled. -1 if accept_ra_rtr_pref is disabled. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index ce777260e9ea..1182f0e21697 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -36,6 +36,7 @@ struct ipv6_devconf { __s32 accept_ra_rtr_pref; __s32 rtr_probe_interval; #ifdef CONFIG_IPV6_ROUTE_INFO + __s32 accept_ra_rt_info_min_plen; __s32 accept_ra_rt_info_max_plen; #endif #endif diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 2b1533859749..c462f1dc175e 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -175,6 +175,16 @@ enum { DEVCONF_USE_OIF_ADDRS_ONLY, DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT, DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, + DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, + DEVCONF_DROP_UNSOLICITED_NA, + DEVCONF_KEEP_ADDR_ON_DOWN, + DEVCONF_RTR_SOLICIT_MAX_INTERVAL, + DEVCONF_SEG6_ENABLED, + DEVCONF_SEG6_REQUIRE_HMAC, + DEVCONF_ENHANCED_DAD, + DEVCONF_ADDR_GEN_MODE, + DEVCONF_DISABLE_POLICY, + DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN, DEVCONF_MAX }; diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 0956373b56db..d18980e74534 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -570,6 +570,7 @@ enum { NET_IPV6_PROXY_NDP=23, NET_IPV6_ACCEPT_SOURCE_ROUTE=25, NET_IPV6_ACCEPT_RA_FROM_LOCAL=26, + NET_IPV6_ACCEPT_RA_RT_INFO_MIN_PLEN=27, __NET_IPV6_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 498a664b8dc9..860ffbe778cf 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -202,6 +202,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_ra_rtr_pref = 1, .rtr_probe_interval = 60 * HZ, #ifdef CONFIG_IPV6_ROUTE_INFO + .accept_ra_rt_info_min_plen = 0, .accept_ra_rt_info_max_plen = 0, #endif #endif @@ -247,6 +248,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_ra_rtr_pref = 1, .rtr_probe_interval = 60 * HZ, #ifdef CONFIG_IPV6_ROUTE_INFO + .accept_ra_rt_info_min_plen = 0, .accept_ra_rt_info_max_plen = 0, #endif #endif @@ -4689,6 +4691,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_RTR_PROBE_INTERVAL] = jiffies_to_msecs(cnf->rtr_probe_interval); #ifdef CONFIG_IPV6_ROUTE_INFO + array[DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN] = cnf->accept_ra_rt_info_min_plen; array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen; #endif #endif @@ -5648,6 +5651,13 @@ static struct addrconf_sysctl_table .proc_handler = proc_dointvec_jiffies, }, #ifdef CONFIG_IPV6_ROUTE_INFO + { + .procname = "accept_ra_rt_info_min_plen", + .data = &ipv6_devconf.accept_ra_rt_info_min_plen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "accept_ra_rt_info_max_plen", .data = &ipv6_devconf.accept_ra_rt_info_max_plen, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 84afb9a77278..3452f9037ad4 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1358,6 +1358,8 @@ skip_linkparms: if (ri->prefix_len == 0 && !in6_dev->cnf.accept_ra_defrtr) continue; + if (ri->prefix_len < in6_dev->cnf.accept_ra_rt_info_min_plen) + continue; if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) continue; rt6_route_rcv(skb->dev, (u8 *)p, 
(p->nd_opt_len) << 3,

From f0faedd6b468777f3bb5834f97100794d562c8b7 Mon Sep 17 00:00:00 2001
From: Chenbo Feng
Date: Thu, 23 Mar 2017 13:51:24 -0700
Subject: [PATCH 0715/3220] fix the deadlock in xt_qtaguid when enabling DDEBUG

When DDEBUG is enabled, the prdebug_full_state() function tries to
recursively acquire the spinlock of sock_tag_list, causing a deadlock.
A check is added before acquiring the spinlock, to differentiate the
behavior depending on the caller of the function.

Bug: 36559739
Test: Compile and run test under system/extra/test/iptables/
Change-Id: Ie3397fbaa207e14fe214d47aaf5e8ca1f4a712ee
Signed-off-by: Chenbo Feng
---
 net/netfilter/xt_qtaguid.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c
index 3bf0c59dab2f..0f5628a59917 100644
--- a/net/netfilter/xt_qtaguid.c
+++ b/net/netfilter/xt_qtaguid.c
@@ -1814,8 +1814,11 @@ ret_res:
 }
 
 #ifdef DDEBUG
-/* This function is not in xt_qtaguid_print.c because of locks visibility */
-static void prdebug_full_state(int indent_level, const char *fmt, ...)
+/*
+ * This function is not in xt_qtaguid_print.c because of locks visibility.
+ * The lock of sock_tag_list must be acquired before calling this function
+ */
+static void prdebug_full_state_locked(int indent_level, const char *fmt, ...)
 {
 	va_list args;
 	char *fmt_buff;
@@ -1836,16 +1839,12 @@ static void prdebug_full_state(int indent_level, const char *fmt, ...)
 	kfree(buff);
 	va_end(args);
 
-	spin_lock_bh(&sock_tag_list_lock);
 	prdebug_sock_tag_tree(indent_level, &sock_tag_tree);
-	spin_unlock_bh(&sock_tag_list_lock);
 
-	spin_lock_bh(&sock_tag_list_lock);
 	spin_lock_bh(&uid_tag_data_tree_lock);
 	prdebug_uid_tag_data_tree(indent_level, &uid_tag_data_tree);
 	prdebug_proc_qtu_data_tree(indent_level, &proc_qtu_data_tree);
 	spin_unlock_bh(&uid_tag_data_tree_lock);
-	spin_unlock_bh(&sock_tag_list_lock);
 
 	spin_lock_bh(&iface_stat_list_lock);
 	prdebug_iface_stat_list(indent_level, &iface_stat_list);
@@ -1854,7 +1853,7 @@ static void prdebug_full_state(int indent_level, const char *fmt, ...)
 	pr_debug("qtaguid: %s(): }\n", __func__);
 }
 #else
-static void prdebug_full_state(int indent_level, const char *fmt, ...) {}
+static void prdebug_full_state_locked(int indent_level, const char *fmt, ...) {}
 #endif
 
 struct proc_ctrl_print_info {
@@ -1977,8 +1976,11 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v)
 			   (u64)atomic64_read(&qtu_events.match_no_sk),
 			   (u64)atomic64_read(&qtu_events.match_no_sk_file));
 
-		/* Count the following as part of the last item_index */
-		prdebug_full_state(0, "proc ctrl");
+		/* Count the following as part of the last item_index.
No need + * to lock the sock_tag_list here since it is already locked when + * starting the seq_file operation + */ + prdebug_full_state_locked(0, "proc ctrl"); } return 0; @@ -2887,8 +2889,10 @@ static int qtudev_release(struct inode *inode, struct file *file) sock_tag_tree_erase(&st_to_free_tree); - prdebug_full_state(0, "%s(): pid=%u tgid=%u", __func__, + spin_lock_bh(&sock_tag_list_lock); + prdebug_full_state_locked(0, "%s(): pid=%u tgid=%u", __func__, current->pid, current->tgid); + spin_unlock_bh(&sock_tag_list_lock); return 0; } From 93a520cc3cc2106bc8cb3354d68d75f5f9d26817 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 21 Mar 2017 16:28:27 -0700 Subject: [PATCH 0716/3220] ANDROID: sdcardfs: correct order of descriptors Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: Ia6d16b19c8c911f41231d2a12be0740057edfacf --- fs/sdcardfs/packagelist.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index e72fe83f7837..5398ebf8a34b 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -48,12 +48,14 @@ static struct kmem_cache *hashtable_entry_cachep; static unsigned int full_name_case_hash(const unsigned char *name, unsigned int len) { unsigned long hash = init_name_hash(); + while (len--) hash = partial_name_hash(tolower(*name++), hash); return end_name_hash(hash); } -static void inline qstr_init(struct qstr *q, const char *name) { +static inline void qstr_init(struct qstr *q, const char *name) +{ q->name = name; q->len = strlen(q->name); q->hash = full_name_case_hash(q->name, q->len); From dfdaa9584471358a3d3865ca7de972875dd18bdd Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 16 Mar 2017 17:42:58 -0700 Subject: [PATCH 0717/3220] ANDROID: sdcardfs: Fix formatting This fixes various spacing and bracket related issues pointed out by checkpatch. 
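For illustration only (not part of this patch): most of the churn below
is mechanical. The dominant change is the kernel coding-style rule that
case labels sit at the same indentation level as the switch keyword,
which scripts/checkpatch.pl reports as "switch and case should be at the
same indent". For example:

/* checkpatch-clean form: case aligns with switch */
switch (parent_info->perm) {
case PERM_INHERIT:
	/* Already inherited above */
	break;
default:
	break;
}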
Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: I6e248833a7a04e3899f3ae9462d765cfcaa70c96 --- fs/sdcardfs/dentry.c | 13 +- fs/sdcardfs/derived_perm.c | 242 +++++++++++++++++++------------------ fs/sdcardfs/file.c | 5 +- fs/sdcardfs/inode.c | 50 ++++---- fs/sdcardfs/lookup.c | 32 ++--- fs/sdcardfs/main.c | 34 ++++-- fs/sdcardfs/packagelist.c | 43 ++++--- fs/sdcardfs/sdcardfs.h | 106 ++++++++-------- fs/sdcardfs/super.c | 29 +++-- 9 files changed, 300 insertions(+), 254 deletions(-) diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index 64494a50e250..aaa15dff6f51 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -46,7 +46,8 @@ static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags) spin_unlock(&dentry->d_lock); /* check uninitialized obb_dentry and - * whether the base obbpath has been changed or not */ + * whether the base obbpath has been changed or not + */ if (is_obbpath_invalid(dentry)) { d_drop(dentry); return 0; @@ -106,12 +107,10 @@ out: static void sdcardfs_d_release(struct dentry *dentry) { /* release and reset the lower paths */ - if(has_graft_path(dentry)) { + if (has_graft_path(dentry)) sdcardfs_put_reset_orig_path(dentry); - } sdcardfs_put_reset_lower_path(dentry); free_dentry_private_data(dentry); - return; } static int sdcardfs_hash_ci(const struct dentry *dentry, @@ -168,14 +167,16 @@ static int sdcardfs_cmp_ci(const struct dentry *parent, return 1; } -static void sdcardfs_canonical_path(const struct path *path, struct path *actual_path) { +static void sdcardfs_canonical_path(const struct path *path, + struct path *actual_path) +{ sdcardfs_get_real_lower(path->dentry, actual_path); } const struct dentry_operations sdcardfs_ci_dops = { .d_revalidate = sdcardfs_d_revalidate, .d_release = sdcardfs_d_release, - .d_hash = sdcardfs_hash_ci, + .d_hash = sdcardfs_hash_ci, .d_compare = sdcardfs_cmp_ci, .d_canonical_path = sdcardfs_canonical_path, }; diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 28e9b8d42f9e..51fa565742bd 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -37,7 +37,8 @@ static void inherit_derived_state(struct inode *parent, struct inode *child) /* helper function for derived state */ void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, - uid_t uid, bool under_android, struct inode *top) + uid_t uid, bool under_android, + struct inode *top) { struct sdcardfs_inode_info *info = SDCARDFS_I(inode); @@ -50,11 +51,14 @@ void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, set_top(info, top); } -/* While renaming, there is a point where we want the path from dentry, but the name from newdentry */ -void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const struct qstr *name) +/* While renaming, there is a point where we want the path from dentry, + * but the name from newdentry + */ +void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, + const struct qstr *name) { struct sdcardfs_inode_info *info = SDCARDFS_I(d_inode(dentry)); - struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); + struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent)); appid_t appid; struct qstr q_Android = QSTR_LITERAL("Android"); struct qstr q_data = QSTR_LITERAL("data"); @@ -77,58 +81,57 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, co return; /* Derive custom permissions based on parent and current node */ switch (parent_info->perm) { - 
case PERM_INHERIT: - case PERM_ANDROID_PACKAGE_CACHE: - /* Already inherited above */ - break; - case PERM_PRE_ROOT: - /* Legacy internal layout places users at top level */ - info->perm = PERM_ROOT; - info->userid = simple_strtoul(name->name, NULL, 10); + case PERM_INHERIT: + case PERM_ANDROID_PACKAGE_CACHE: + /* Already inherited above */ + break; + case PERM_PRE_ROOT: + /* Legacy internal layout places users at top level */ + info->perm = PERM_ROOT; + info->userid = simple_strtoul(name->name, NULL, 10); + set_top(info, &info->vfs_inode); + break; + case PERM_ROOT: + /* Assume masked off by default. */ + if (qstr_case_eq(name, &q_Android)) { + /* App-specific directories inside; let anyone traverse */ + info->perm = PERM_ANDROID; + info->under_android = true; set_top(info, &info->vfs_inode); - break; - case PERM_ROOT: - /* Assume masked off by default. */ - if (qstr_case_eq(name, &q_Android)) { - /* App-specific directories inside; let anyone traverse */ - info->perm = PERM_ANDROID; - info->under_android = true; - set_top(info, &info->vfs_inode); - } - break; - case PERM_ANDROID: - if (qstr_case_eq(name, &q_data)) { - /* App-specific directories inside; let anyone traverse */ - info->perm = PERM_ANDROID_DATA; - set_top(info, &info->vfs_inode); - } else if (qstr_case_eq(name, &q_obb)) { - /* App-specific directories inside; let anyone traverse */ - info->perm = PERM_ANDROID_OBB; - info->under_obb = true; - set_top(info, &info->vfs_inode); - /* Single OBB directory is always shared */ - } else if (qstr_case_eq(name, &q_media)) { - /* App-specific directories inside; let anyone traverse */ - info->perm = PERM_ANDROID_MEDIA; - set_top(info, &info->vfs_inode); - } - break; - case PERM_ANDROID_OBB: - case PERM_ANDROID_DATA: - case PERM_ANDROID_MEDIA: - info->perm = PERM_ANDROID_PACKAGE; - appid = get_appid(name->name); - if (appid != 0 && !is_excluded(name->name, parent_info->userid)) { - info->d_uid = multiuser_get_uid(parent_info->userid, appid); - } + } + break; + case PERM_ANDROID: + if (qstr_case_eq(name, &q_data)) { + /* App-specific directories inside; let anyone traverse */ + info->perm = PERM_ANDROID_DATA; set_top(info, &info->vfs_inode); - break; - case PERM_ANDROID_PACKAGE: - if (qstr_case_eq(name, &q_cache)) { - info->perm = PERM_ANDROID_PACKAGE_CACHE; - info->under_cache = true; - } - break; + } else if (qstr_case_eq(name, &q_obb)) { + /* App-specific directories inside; let anyone traverse */ + info->perm = PERM_ANDROID_OBB; + info->under_obb = true; + set_top(info, &info->vfs_inode); + /* Single OBB directory is always shared */ + } else if (qstr_case_eq(name, &q_media)) { + /* App-specific directories inside; let anyone traverse */ + info->perm = PERM_ANDROID_MEDIA; + set_top(info, &info->vfs_inode); + } + break; + case PERM_ANDROID_OBB: + case PERM_ANDROID_DATA: + case PERM_ANDROID_MEDIA: + info->perm = PERM_ANDROID_PACKAGE; + appid = get_appid(name->name); + if (appid != 0 && !is_excluded(name->name, parent_info->userid)) + info->d_uid = multiuser_get_uid(parent_info->userid, appid); + set_top(info, &info->vfs_inode); + break; + case PERM_ANDROID_PACKAGE: + if (qstr_case_eq(name, &q_cache)) { + info->perm = PERM_ANDROID_PACKAGE_CACHE; + info->under_cache = true; + } + break; } } @@ -137,7 +140,8 @@ void get_derived_permission(struct dentry *parent, struct dentry *dentry) get_derived_permission_new(parent, dentry, &dentry->d_name); } -static appid_t get_type(const char *name) { +static appid_t get_type(const char *name) +{ const char *ext = strrchr(name, '.'); appid_t id; @@ 
-149,7 +153,8 @@ static appid_t get_type(const char *name) { return AID_MEDIA_RW; } -void fixup_lower_ownership(struct dentry* dentry, const char *name) { +void fixup_lower_ownership(struct dentry *dentry, const char *name) +{ struct path path; struct inode *inode; struct inode *delegated_inode = NULL; @@ -175,49 +180,49 @@ void fixup_lower_ownership(struct dentry* dentry, const char *name) { } switch (perm) { - case PERM_ROOT: - case PERM_ANDROID: - case PERM_ANDROID_DATA: - case PERM_ANDROID_MEDIA: - case PERM_ANDROID_PACKAGE: - case PERM_ANDROID_PACKAGE_CACHE: - uid = multiuser_get_uid(info->userid, uid); - break; - case PERM_ANDROID_OBB: - uid = AID_MEDIA_OBB; - break; - case PERM_PRE_ROOT: - default: - break; + case PERM_ROOT: + case PERM_ANDROID: + case PERM_ANDROID_DATA: + case PERM_ANDROID_MEDIA: + case PERM_ANDROID_PACKAGE: + case PERM_ANDROID_PACKAGE_CACHE: + uid = multiuser_get_uid(info->userid, uid); + break; + case PERM_ANDROID_OBB: + uid = AID_MEDIA_OBB; + break; + case PERM_PRE_ROOT: + default: + break; } switch (perm) { - case PERM_ROOT: - case PERM_ANDROID: - case PERM_ANDROID_DATA: - case PERM_ANDROID_MEDIA: - if (S_ISDIR(d_inode(dentry)->i_mode)) - gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); - else - gid = multiuser_get_uid(info->userid, get_type(name)); - break; - case PERM_ANDROID_OBB: - gid = AID_MEDIA_OBB; - break; - case PERM_ANDROID_PACKAGE: - if (info->d_uid != 0) - gid = multiuser_get_ext_gid(info->d_uid); - else - gid = multiuser_get_uid(info->userid, uid); - break; - case PERM_ANDROID_PACKAGE_CACHE: - if (info->d_uid != 0) - gid = multiuser_get_cache_gid(info->d_uid); - else - gid = multiuser_get_uid(info->userid, uid); - break; - case PERM_PRE_ROOT: - default: - break; + case PERM_ROOT: + case PERM_ANDROID: + case PERM_ANDROID_DATA: + case PERM_ANDROID_MEDIA: + if (S_ISDIR(d_inode(dentry)->i_mode)) + gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); + else + gid = multiuser_get_uid(info->userid, get_type(name)); + break; + case PERM_ANDROID_OBB: + gid = AID_MEDIA_OBB; + break; + case PERM_ANDROID_PACKAGE: + if (info->d_uid != 0) + gid = multiuser_get_ext_gid(info->d_uid); + else + gid = multiuser_get_uid(info->userid, uid); + break; + case PERM_ANDROID_PACKAGE_CACHE: + if (info->d_uid != 0) + gid = multiuser_get_cache_gid(info->d_uid); + else + gid = multiuser_get_uid(info->userid, uid); + break; + case PERM_PRE_ROOT: + default: + break; } sdcardfs_get_lower_path(dentry, &path); @@ -246,7 +251,8 @@ retry_deleg: sdcardfs_put_lower_path(dentry, &path); } -static int descendant_may_need_fixup(struct sdcardfs_inode_info *info, struct limit_search *limit) { +static int descendant_may_need_fixup(struct sdcardfs_inode_info *info, struct limit_search *limit) +{ if (info->perm == PERM_ROOT) return (limit->flags & BY_USERID)?info->userid == limit->userid:1; if (info->perm == PERM_PRE_ROOT || info->perm == PERM_ANDROID) @@ -254,7 +260,8 @@ static int descendant_may_need_fixup(struct sdcardfs_inode_info *info, struct li return 0; } -static int needs_fixup(perm_t perm) { +static int needs_fixup(perm_t perm) +{ if (perm == PERM_ANDROID_DATA || perm == PERM_ANDROID_OBB || perm == PERM_ANDROID_MEDIA) return 1; @@ -292,9 +299,9 @@ static void __fixup_perms_recursive(struct dentry *dentry, struct limit_search * } spin_unlock(&child->d_lock); } - } else if (descendant_may_need_fixup(info, limit)) { + } else if (descendant_may_need_fixup(info, limit)) { list_for_each_entry(child, &dentry->d_subdirs, d_child) { - __fixup_perms_recursive(child, limit, depth + 1); + 
__fixup_perms_recursive(child, limit, depth + 1); } } spin_unlock(&dentry->d_lock); @@ -310,7 +317,7 @@ inline void update_derived_permission_lock(struct dentry *dentry) { struct dentry *parent; - if(!dentry || !d_inode(dentry)) { + if (!dentry || !d_inode(dentry)) { printk(KERN_ERR "sdcardfs: %s: invalid dentry\n", __func__); return; } @@ -318,11 +325,11 @@ inline void update_derived_permission_lock(struct dentry *dentry) * 1. need to check whether the dentry is updated or not * 2. remove the root dentry update */ - if(IS_ROOT(dentry)) { + if (IS_ROOT(dentry)) { //setup_default_pre_root_state(d_inode(dentry)); } else { parent = dget_parent(dentry); - if(parent) { + if (parent) { get_derived_permission(parent, dentry); dput(parent); } @@ -334,15 +341,15 @@ int need_graft_path(struct dentry *dentry) { int ret = 0; struct dentry *parent = dget_parent(dentry); - struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); + struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent)); struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); struct qstr obb = QSTR_LITERAL("obb"); - if(parent_info->perm == PERM_ANDROID && + if (parent_info->perm == PERM_ANDROID && qstr_case_eq(&dentry->d_name, &obb)) { /* /Android/obb is the base obbpath of DERIVED_UNIFIED */ - if(!(sbi->options.multiuser == false + if (!(sbi->options.multiuser == false && parent_info->userid == 0)) { ret = 1; } @@ -362,17 +369,18 @@ int is_obbpath_invalid(struct dentry *dent) /* check the base obbpath has been changed. * this routine can check an uninitialized obb dentry as well. - * regarding the uninitialized obb, refer to the sdcardfs_mkdir() */ + * regarding the uninitialized obb, refer to the sdcardfs_mkdir() + */ spin_lock(&di->lock); - if(di->orig_path.dentry) { - if(!di->lower_path.dentry) { + if (di->orig_path.dentry) { + if (!di->lower_path.dentry) { ret = 1; } else { path_get(&di->lower_path); //lower_parent = lock_parent(lower_path->dentry); path_buf = kmalloc(PATH_MAX, GFP_ATOMIC); - if(!path_buf) { + if (!path_buf) { ret = 1; printk(KERN_ERR "sdcardfs: fail to allocate path_buf in %s.\n", __func__); } else { @@ -399,13 +407,13 @@ int is_base_obbpath(struct dentry *dentry) { int ret = 0; struct dentry *parent = dget_parent(dentry); - struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); + struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent)); struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); struct qstr q_obb = QSTR_LITERAL("obb"); spin_lock(&SDCARDFS_D(dentry)->lock); if (sbi->options.multiuser) { - if(parent_info->perm == PERM_PRE_ROOT && + if (parent_info->perm == PERM_PRE_ROOT && qstr_case_eq(&dentry->d_name, &q_obb)) { ret = 1; } @@ -420,7 +428,8 @@ int is_base_obbpath(struct dentry *dentry) /* The lower_path will be stored to the dentry's orig_path * and the base obbpath will be copyed to the lower_path variable. * if an error returned, there's no change in the lower_path - * returns: -ERRNO if error (0: no error) */ + * returns: -ERRNO if error (0: no error) + */ int setup_obb_dentry(struct dentry *dentry, struct path *lower_path) { int err = 0; @@ -435,7 +444,7 @@ int setup_obb_dentry(struct dentry *dentry, struct path *lower_path) err = kern_path(sbi->obbpath_s, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &obbpath); - if(!err) { + if (!err) { /* the obbpath base has been found */ pathcpy(lower_path, &obbpath); } else { @@ -443,7 +452,8 @@ int setup_obb_dentry(struct dentry *dentry, struct path *lower_path) * setup the lower_path with its orig_path. 
* but, the current implementation just returns an error * because the sdcard daemon also regards this case as - * a lookup fail. */ + * a lookup fail. + */ printk(KERN_INFO "sdcardfs: the sbi->obbpath is not available\n"); } return err; diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 0592facef704..4c9726606c0f 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -216,7 +216,7 @@ static int sdcardfs_open(struct inode *inode, struct file *file) goto out_err; } - if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { + if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { err = -EACCES; goto out_err; } @@ -248,9 +248,8 @@ static int sdcardfs_open(struct inode *inode, struct file *file) if (err) kfree(SDCARDFS_F(file)); - else { + else sdcardfs_copy_and_fix_attrs(inode, sdcardfs_lower_inode(inode)); - } out_revert_cred: REVERT_CRED(saved_cred); diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index f19f11ea19fc..ce187d865f29 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -23,10 +23,10 @@ #include /* Do not directly use this function. Use OVERRIDE_CRED() instead. */ -const struct cred * override_fsids(struct sdcardfs_sb_info* sbi, struct sdcardfs_inode_info *info) +const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, struct sdcardfs_inode_info *info) { - struct cred * cred; - const struct cred * old_cred; + struct cred *cred; + const struct cred *old_cred; uid_t uid; cred = prepare_creds(); @@ -46,9 +46,9 @@ const struct cred * override_fsids(struct sdcardfs_sb_info* sbi, struct sdcardfs } /* Do not directly use this function, use REVERT_CRED() instead. */ -void revert_fsids(const struct cred * old_cred) +void revert_fsids(const struct cred *old_cred) { - const struct cred * cur_cred; + const struct cred *cur_cred; cur_cred = current->cred; revert_creds(old_cred); @@ -67,7 +67,7 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, struct fs_struct *saved_fs; struct fs_struct *copied_fs; - if(!check_caller_access_to_name(dir, &dentry->d_name)) { + if (!check_caller_access_to_name(dir, &dentry->d_name)) { err = -EACCES; goto out_eacces; } @@ -166,7 +166,7 @@ static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) struct path lower_path; const struct cred *saved_cred = NULL; - if(!check_caller_access_to_name(dir, &dentry->d_name)) { + if (!check_caller_access_to_name(dir, &dentry->d_name)) { err = -EACCES; goto out_eacces; } @@ -240,13 +240,14 @@ out: } #endif -static int touch(char *abs_path, mode_t mode) { +static int touch(char *abs_path, mode_t mode) +{ struct file *filp = filp_open(abs_path, O_RDWR|O_CREAT|O_EXCL|O_NOFOLLOW, mode); + if (IS_ERR(filp)) { if (PTR_ERR(filp) == -EEXIST) { return 0; - } - else { + } else { printk(KERN_ERR "sdcardfs: failed to open(%s): %ld\n", abs_path, PTR_ERR(filp)); return PTR_ERR(filp); @@ -273,7 +274,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct qstr q_obb = QSTR_LITERAL("obb"); struct qstr q_data = QSTR_LITERAL("data"); - if(!check_caller_access_to_name(dir, &dentry->d_name)) { + if (!check_caller_access_to_name(dir, &dentry->d_name)) { err = -EACCES; goto out_eacces; } @@ -315,19 +316,21 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode } /* if it is a local obb dentry, setup it with the base obbpath */ - if(need_graft_path(dentry)) { + if (need_graft_path(dentry)) { err = setup_obb_dentry(dentry, &lower_path); - if(err) { + if (err) { /* if the sbi->obbpath is not 
available, the lower_path won't be * changed by setup_obb_dentry() but the lower path is saved to * its orig_path. this dentry will be revalidated later. - * but now, the lower_path should be NULL */ + * but now, the lower_path should be NULL + */ sdcardfs_put_reset_lower_path(dentry); /* the newly created lower path which saved to its orig_path or * the lower_path is the base obbpath. - * therefore, an additional path_get is required */ + * therefore, an additional path_get is required + */ path_get(&lower_path); } else make_nomedia_in_obb = 1; @@ -382,7 +385,7 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) struct path lower_path; const struct cred *saved_cred = NULL; - if(!check_caller_access_to_name(dir, &dentry->d_name)) { + if (!check_caller_access_to_name(dir, &dentry->d_name)) { err = -EACCES; goto out_eacces; } @@ -391,7 +394,8 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); /* sdcardfs_get_real_lower(): in case of remove an user's obb dentry - * the dentry on the original path should be deleted. */ + * the dentry on the original path should be deleted. + */ sdcardfs_get_real_lower(dentry, &lower_path); lower_dentry = lower_path.dentry; @@ -467,7 +471,7 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct path lower_old_path, lower_new_path; const struct cred *saved_cred = NULL; - if(!check_caller_access_to_name(old_dir, &old_dentry->d_name) || + if (!check_caller_access_to_name(old_dir, &old_dentry->d_name) || !check_caller_access_to_name(new_dir, &new_dentry->d_name)) { err = -EACCES; goto out_eacces; @@ -656,6 +660,7 @@ static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int ma * we check it with AID_MEDIA_RW permission */ struct inode *lower_inode; + OVERRIDE_CRED(SDCARDFS_SB(inode->sb)); lower_inode = sdcardfs_lower_inode(inode); @@ -724,7 +729,8 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct /* prepare our own lower struct iattr (with the lower file) */ memcpy(&lower_ia, ia, sizeof(lower_ia)); /* Allow touch updating timestamps. A previous permission check ensures - * we have write access. Changes to mode, owner, and group are ignored*/ + * we have write access. 
Changes to mode, owner, and group are ignored + */ ia->ia_valid |= ATTR_FORCE; err = inode_change_ok(&tmp, ia); @@ -810,10 +816,12 @@ out_err: return err; } -static int sdcardfs_fillattr(struct vfsmount *mnt, struct inode *inode, struct kstat *stat) +static int sdcardfs_fillattr(struct vfsmount *mnt, + struct inode *inode, struct kstat *stat) { struct sdcardfs_inode_info *info = SDCARDFS_I(inode); struct inode *top = grab_top(info); + if (!top) return -EINVAL; @@ -843,7 +851,7 @@ static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, int err; parent = dget_parent(dentry); - if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { + if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { dput(parent); return -EACCES; } diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index fffb94c923c4..fab9fd9d0105 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -73,6 +73,7 @@ static int sdcardfs_inode_test(struct inode *inode, void *candidate_data/*void * { struct inode *current_lower_inode = sdcardfs_lower_inode(inode); userid_t current_userid = SDCARDFS_I(inode)->userid; + if (current_lower_inode == ((struct inode_data *)candidate_data)->lower_inode && current_userid == ((struct inode_data *)candidate_data)->id) return 1; /* found a match */ @@ -102,7 +103,7 @@ struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode, u * instead. */ lower_inode->i_ino, /* hashval */ - sdcardfs_inode_test, /* inode comparison function */ + sdcardfs_inode_test, /* inode comparison function */ sdcardfs_inode_set, /* inode init function */ &data); /* data passed to test+set fxns */ if (!inode) { @@ -213,8 +214,8 @@ struct sdcardfs_name_data { bool found; }; -static int sdcardfs_name_match(struct dir_context *ctx, const char *name, int namelen, - loff_t offset, u64 ino, unsigned int d_type) +static int sdcardfs_name_match(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct sdcardfs_name_data *buf = container_of(ctx, struct sdcardfs_name_data, ctx); struct qstr candidate = QSTR_INIT(name, namelen); @@ -303,23 +304,26 @@ put_name: if (!err) { /* check if the dentry is an obb dentry * if true, the lower_inode must be replaced with - * the inode of the graft path */ + * the inode of the graft path + */ - if(need_graft_path(dentry)) { + if (need_graft_path(dentry)) { /* setup_obb_dentry() - * The lower_path will be stored to the dentry's orig_path + * The lower_path will be stored to the dentry's orig_path * and the base obbpath will be copyed to the lower_path variable. * if an error returned, there's no change in the lower_path - * returns: -ERRNO if error (0: no error) */ + * returns: -ERRNO if error (0: no error) + */ err = setup_obb_dentry(dentry, &lower_path); - if(err) { + if (err) { /* if the sbi->obbpath is not available, we can optionally * setup the lower_path with its orig_path. * but, the current implementation just returns an error * because the sdcard daemon also regards this case as - * a lookup fail. */ + * a lookup fail. + */ printk(KERN_INFO "sdcardfs: base obbpath is not available\n"); sdcardfs_put_reset_orig_path(dentry); goto out; @@ -374,9 +378,9 @@ out: /* * On success: - * fills dentry object appropriate values and returns NULL. + * fills dentry object appropriate values and returns NULL. * On fail (== error) - * returns error ptr + * returns error ptr * * @dir : Parent inode. It is locked (dir->i_mutex) * @dentry : Target dentry to lookup. 
we should set each of fields. @@ -393,10 +397,10 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, parent = dget_parent(dentry); - if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { + if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { ret = ERR_PTR(-EACCES); goto out_err; - } + } /* save current_cred and override it */ OVERRIDE_CRED_PTR(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); @@ -412,9 +416,7 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, ret = __sdcardfs_lookup(dentry, flags, &lower_parent_path, SDCARDFS_I(dir)->userid); if (IS_ERR(ret)) - { goto out; - } if (ret) dentry = ret; if (d_inode(dentry)) { diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 4e2aded8d1d9..c249bd73b121 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -72,6 +72,7 @@ static int parse_options(struct super_block *sb, char *options, int silent, while ((p = strsep(&options, ",")) != NULL) { int token; + if (!*p) continue; @@ -148,6 +149,7 @@ int parse_options_remount(struct super_block *sb, char *options, int silent, while ((p = strsep(&options, ",")) != NULL) { int token; + if (!*p) continue; @@ -223,8 +225,8 @@ static struct dentry *sdcardfs_d_alloc_root(struct super_block *sb) #endif DEFINE_MUTEX(sdcardfs_super_list_lock); -LIST_HEAD(sdcardfs_super_list); EXPORT_SYMBOL_GPL(sdcardfs_super_list_lock); +LIST_HEAD(sdcardfs_super_list); EXPORT_SYMBOL_GPL(sdcardfs_super_list); /* @@ -328,14 +330,15 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, /* setup permission policy */ sb_info->obbpath_s = kzalloc(PATH_MAX, GFP_KERNEL); mutex_lock(&sdcardfs_super_list_lock); - if(sb_info->options.multiuser) { - setup_derived_state(d_inode(sb->s_root), PERM_PRE_ROOT, sb_info->options.fs_user_id, AID_ROOT, false, d_inode(sb->s_root)); + if (sb_info->options.multiuser) { + setup_derived_state(d_inode(sb->s_root), PERM_PRE_ROOT, + sb_info->options.fs_user_id, AID_ROOT, + false, d_inode(sb->s_root)); snprintf(sb_info->obbpath_s, PATH_MAX, "%s/obb", dev_name); - /*err = prepare_dir(sb_info->obbpath_s, - sb_info->options.fs_low_uid, - sb_info->options.fs_low_gid, 00755);*/ } else { - setup_derived_state(d_inode(sb->s_root), PERM_ROOT, sb_info->options.fs_user_id, AID_ROOT, false, d_inode(sb->s_root)); + setup_derived_state(d_inode(sb->s_root), PERM_ROOT, + sb_info->options.fs_user_id, AID_ROOT, + false, d_inode(sb->s_root)); snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name); } fixup_tmp_permissions(d_inode(sb->s_root)); @@ -368,8 +371,10 @@ out: /* A feature which supports mount_nodev() with options */ static struct dentry *mount_nodev_with_options(struct vfsmount *mnt, - struct file_system_type *fs_type, int flags, const char *dev_name, void *data, - int (*fill_super)(struct vfsmount *, struct super_block *, const char *, void *, int)) + struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, + int (*fill_super)(struct vfsmount *, struct super_block *, + const char *, void *, int)) { int error; @@ -401,19 +406,22 @@ static struct dentry *sdcardfs_mount(struct vfsmount *mnt, raw_data, sdcardfs_read_super); } -static struct dentry *sdcardfs_mount_wrn(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data) +static struct dentry *sdcardfs_mount_wrn(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) { WARN(1, "sdcardfs does not support mount. 
Use mount2.\n"); return ERR_PTR(-EINVAL); } -void *sdcardfs_alloc_mnt_data(void) { +void *sdcardfs_alloc_mnt_data(void) +{ return kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL); } -void sdcardfs_kill_sb(struct super_block *sb) { +void sdcardfs_kill_sb(struct super_block *sb) +{ struct sdcardfs_sb_info *sbi; + if (sb->s_magic == SDCARDFS_SUPER_MAGIC) { sbi = SDCARDFS_SB(sb); mutex_lock(&sdcardfs_super_list_lock); diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 5398ebf8a34b..58218fed4083 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -61,7 +61,8 @@ static inline void qstr_init(struct qstr *q, const char *name) q->hash = full_name_case_hash(q->name, q->len); } -static inline int qstr_copy(const struct qstr *src, struct qstr *dest) { +static inline int qstr_copy(const struct qstr *src, struct qstr *dest) +{ dest->name = kstrdup(src->name, GFP_KERNEL); dest->hash_len = src->hash_len; return !!dest->name; @@ -89,6 +90,7 @@ static appid_t __get_appid(const struct qstr *key) appid_t get_appid(const char *key) { struct qstr q; + qstr_init(&q, key); return __get_appid(&q); } @@ -114,6 +116,7 @@ static appid_t __get_ext_gid(const struct qstr *key) appid_t get_ext_gid(const char *key) { struct qstr q; + qstr_init(&q, key); return __get_ext_gid(&q); } @@ -144,8 +147,10 @@ appid_t is_excluded(const char *key, userid_t user) /* Kernel has already enforced everything we returned through * derive_permissions_locked(), so this is used to lock down access - * even further, such as enforcing that apps hold sdcard_rw. */ -int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name) { + * even further, such as enforcing that apps hold sdcard_rw. + */ +int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name) +{ struct qstr q_autorun = QSTR_LITERAL("autorun.inf"); struct qstr q__android_secure = QSTR_LITERAL(".android_secure"); struct qstr q_android_secure = QSTR_LITERAL("android_secure"); @@ -160,26 +165,26 @@ int check_caller_access_to_name(struct inode *parent_node, const struct qstr *na } /* Root always has access; access for any other UIDs should always - * be controlled through packages.list. */ - if (from_kuid(&init_user_ns, current_fsuid()) == 0) { + * be controlled through packages.list. + */ + if (from_kuid(&init_user_ns, current_fsuid()) == 0) return 1; - } /* No extra permissions to enforce */ return 1; } /* This function is used when file opening. 
The open flags must be - * checked before calling check_caller_access_to_name() */ -int open_flags_to_access_mode(int open_flags) { - if((open_flags & O_ACCMODE) == O_RDONLY) { + * checked before calling check_caller_access_to_name() + */ +int open_flags_to_access_mode(int open_flags) +{ + if ((open_flags & O_ACCMODE) == O_RDONLY) return 0; /* R_OK */ - } else if ((open_flags & O_ACCMODE) == O_WRONLY) { + if ((open_flags & O_ACCMODE) == O_WRONLY) return 1; /* W_OK */ - } else { - /* Probably O_RDRW, but treat as default to be safe */ + /* Probably O_RDRW, but treat as default to be safe */ return 1; /* R_OK | W_OK */ - } } static struct hashtable_entry *alloc_hashtable_entry(const struct qstr *key, @@ -371,7 +376,6 @@ static void remove_packagelist_entry(const struct qstr *key) remove_packagelist_entry_locked(key); fixup_all_perms_name(key); mutex_unlock(&sdcardfs_super_list_lock); - return; } static void remove_ext_gid_entry_locked(const struct qstr *key, gid_t group) @@ -394,7 +398,6 @@ static void remove_ext_gid_entry(const struct qstr *key, gid_t group) mutex_lock(&sdcardfs_super_list_lock); remove_ext_gid_entry_locked(key, group); mutex_unlock(&sdcardfs_super_list_lock); - return; } static void remove_userid_all_entry_locked(userid_t userid) @@ -422,7 +425,6 @@ static void remove_userid_all_entry(userid_t userid) remove_userid_all_entry_locked(userid); fixup_all_perms_userid(userid); mutex_unlock(&sdcardfs_super_list_lock); - return; } static void remove_userid_exclude_entry_locked(const struct qstr *key, userid_t userid) @@ -447,7 +449,6 @@ static void remove_userid_exclude_entry(const struct qstr *key, userid_t userid) remove_userid_exclude_entry_locked(key, userid); fixup_all_perms_name_userid(key, userid); mutex_unlock(&sdcardfs_super_list_lock); - return; } static void packagelist_destroy(void) @@ -456,6 +457,7 @@ static void packagelist_destroy(void) struct hlist_node *h_t; HLIST_HEAD(free_list); int i; + mutex_lock(&sdcardfs_super_list_lock); hash_for_each_rcu(package_to_appid, i, hash_cur, hlist) { hash_del_rcu(&hash_cur->hlist); @@ -603,7 +605,7 @@ static struct configfs_attribute *package_details_attrs[] = { }; static struct configfs_item_operations package_details_item_ops = { - .release = package_details_release, + .release = package_details_release, }; static struct config_item_type package_appid_type = { @@ -659,6 +661,7 @@ static struct config_item *extension_details_make_item(struct config_group *grou struct extension_details *extension_details = kzalloc(sizeof(struct extension_details), GFP_KERNEL); const char *tmp; int ret; + if (!extension_details) return ERR_PTR(-ENOMEM); @@ -713,6 +716,7 @@ static struct config_group *extensions_make_group(struct config_group *group, co static void extensions_drop_group(struct config_group *group, struct config_item *item) { struct extensions_value *value = to_extensions_value(item); + printk(KERN_INFO "sdcardfs: No longer mapping any files to gid %d\n", value->num); kfree(value); } @@ -847,6 +851,7 @@ static int configfs_sdcardfs_init(void) { int ret, i; struct configfs_subsystem *subsys = &sdcardfs_packages; + for (i = 0; sd_default_groups[i]; i++) { config_group_init(sd_default_groups[i]); } @@ -877,7 +882,7 @@ int packagelist_init(void) } configfs_sdcardfs_init(); - return 0; + return 0; } void packagelist_exit(void) diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index e28a7dd47ebb..d1b86fff1b70 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -87,11 +87,11 @@ } while (0) /* OVERRIDE_CRED() and 
REVERT_CRED() - * OVERRID_CRED() - * backup original task->cred - * and modifies task->cred->fsuid/fsgid to specified value. + * OVERRIDE_CRED() + * backup original task->cred + * and modifies task->cred->fsuid/fsgid to specified value. * REVERT_CRED() - * restore original task->cred->fsuid/fsgid. + * restore original task->cred->fsuid/fsgid. * These two macro should be used in pair, and OVERRIDE_CRED() should be * placed at the beginning of a function, right after variable declaration. */ @@ -114,27 +114,29 @@ /* Android 5.0 support */ /* Permission mode for a specific node. Controls how file permissions - * are derived for children nodes. */ + * are derived for children nodes. + */ typedef enum { - /* Nothing special; this node should just inherit from its parent. */ - PERM_INHERIT, - /* This node is one level above a normal root; used for legacy layouts - * which use the first level to represent user_id. */ - PERM_PRE_ROOT, - /* This node is "/" */ - PERM_ROOT, - /* This node is "/Android" */ - PERM_ANDROID, - /* This node is "/Android/data" */ - PERM_ANDROID_DATA, - /* This node is "/Android/obb" */ - PERM_ANDROID_OBB, - /* This node is "/Android/media" */ - PERM_ANDROID_MEDIA, - /* This node is "/Android/[data|media|obb]/[package]" */ - PERM_ANDROID_PACKAGE, - /* This node is "/Android/[data|media|obb]/[package]/cache" */ - PERM_ANDROID_PACKAGE_CACHE, + /* Nothing special; this node should just inherit from its parent. */ + PERM_INHERIT, + /* This node is one level above a normal root; used for legacy layouts + * which use the first level to represent user_id. + */ + PERM_PRE_ROOT, + /* This node is "/" */ + PERM_ROOT, + /* This node is "/Android" */ + PERM_ANDROID, + /* This node is "/Android/data" */ + PERM_ANDROID_DATA, + /* This node is "/Android/obb" */ + PERM_ANDROID_OBB, + /* This node is "/Android/media" */ + PERM_ANDROID_MEDIA, + /* This node is "/Android/[data|media|obb]/[package]" */ + PERM_ANDROID_PACKAGE, + /* This node is "/Android/[data|media|obb]/[package]/cache" */ + PERM_ANDROID_PACKAGE_CACHE, } perm_t; struct sdcardfs_sb_info; @@ -142,9 +144,9 @@ struct sdcardfs_mount_options; struct sdcardfs_inode_info; /* Do not directly use this function. Use OVERRIDE_CRED() instead. */ -const struct cred * override_fsids(struct sdcardfs_sb_info* sbi, struct sdcardfs_inode_info *info); +const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, struct sdcardfs_inode_info *info); /* Do not directly use this function, use REVERT_CRED() instead. 
*/ -void revert_fsids(const struct cred * old_cred); +void revert_fsids(const struct cred *old_cred); /* operations vectors defined in specific files */ extern const struct file_operations sdcardfs_main_fops; @@ -221,7 +223,8 @@ struct sdcardfs_sb_info { struct super_block *sb; struct super_block *lower_sb; /* derived perm policy : some of options have been added - * to sdcardfs_mount_options (Android 4.4 support) */ + * to sdcardfs_mount_options (Android 4.4 support) + */ struct sdcardfs_mount_options options; spinlock_t lock; /* protects obbpath */ char *obbpath_s; @@ -332,7 +335,7 @@ static inline void sdcardfs_put_reset_##pname(const struct dentry *dent) \ { \ struct path pname; \ spin_lock(&SDCARDFS_D(dent)->lock); \ - if(SDCARDFS_D(dent)->pname.dentry) { \ + if (SDCARDFS_D(dent)->pname.dentry) { \ pathcpy(&pname, &SDCARDFS_D(dent)->pname); \ SDCARDFS_D(dent)->pname.dentry = NULL; \ SDCARDFS_D(dent)->pname.mnt = NULL; \ @@ -348,17 +351,17 @@ SDCARDFS_DENT_FUNC(orig_path) static inline bool sbinfo_has_sdcard_magic(struct sdcardfs_sb_info *sbinfo) { - return sbinfo && sbinfo->sb && sbinfo->sb->s_magic == SDCARDFS_SUPER_MAGIC; + return sbinfo && sbinfo->sb && sbinfo->sb->s_magic == SDCARDFS_SUPER_MAGIC; } /* grab a refererence if we aren't linking to ourself */ static inline void set_top(struct sdcardfs_inode_info *info, struct inode *top) { struct inode *old_top = NULL; + BUG_ON(IS_ERR_OR_NULL(top)); - if (info->top && info->top != &info->vfs_inode) { + if (info->top && info->top != &info->vfs_inode) old_top = info->top; - } if (top != &info->vfs_inode) igrab(top); info->top = top; @@ -368,11 +371,11 @@ static inline void set_top(struct sdcardfs_inode_info *info, struct inode *top) static inline struct inode *grab_top(struct sdcardfs_inode_info *info) { struct inode *top = info->top; - if (top) { + + if (top) return igrab(top); - } else { + else return NULL; - } } static inline void release_top(struct sdcardfs_inode_info *info) @@ -380,21 +383,24 @@ static inline void release_top(struct sdcardfs_inode_info *info) iput(info->top); } -static inline int get_gid(struct vfsmount *mnt, struct sdcardfs_inode_info *info) { +static inline int get_gid(struct vfsmount *mnt, struct sdcardfs_inode_info *info) +{ struct sdcardfs_vfsmount_options *opts = mnt->data; - if (opts->gid == AID_SDCARD_RW) { + if (opts->gid == AID_SDCARD_RW) /* As an optimization, certain trusted system components only run * as owner but operate across all users. Since we're now handing * out the sdcard_rw GID only to trusted apps, we're okay relaxing * the user boundary enforcement for the default view. The UIDs - * assigned to app directories are still multiuser aware. */ + * assigned to app directories are still multiuser aware. + */ return AID_SDCARD_RW; - } else { + else return multiuser_get_uid(info->userid, opts->gid); - } } -static inline int get_mode(struct vfsmount *mnt, struct sdcardfs_inode_info *info) { + +static inline int get_mode(struct vfsmount *mnt, struct sdcardfs_inode_info *info) +{ int owner_mode; int filtered_mode; struct sdcardfs_vfsmount_options *opts = mnt->data; @@ -403,17 +409,18 @@ static inline int get_mode(struct vfsmount *mnt, struct sdcardfs_inode_info *inf if (info->perm == PERM_PRE_ROOT) { /* Top of multi-user view should always be visible to ensure - * secondary users can traverse inside. */ + * secondary users can traverse inside. 
+ */ visible_mode = 0711; } else if (info->under_android) { /* Block "other" access to Android directories, since only apps * belonging to a specific user should be in there; we still - * leave +x open for the default view. */ - if (opts->gid == AID_SDCARD_RW) { + * leave +x open for the default view. + */ + if (opts->gid == AID_SDCARD_RW) visible_mode = visible_mode & ~0006; - } else { + else visible_mode = visible_mode & ~0007; - } } owner_mode = info->lower_inode->i_mode & 0700; filtered_mode = visible_mode & (owner_mode | (owner_mode >> 3) | (owner_mode >> 6)); @@ -438,7 +445,7 @@ static inline void sdcardfs_get_real_lower(const struct dentry *dent, /* in case of a local obb dentry * the orig_path should be returned */ - if(has_graft_path(dent)) + if (has_graft_path(dent)) sdcardfs_get_orig_path(dent, real_lower); else sdcardfs_get_lower_path(dent, real_lower); @@ -447,7 +454,7 @@ static inline void sdcardfs_get_real_lower(const struct dentry *dent, static inline void sdcardfs_put_real_lower(const struct dentry *dent, struct path *real_lower) { - if(has_graft_path(dent)) + if (has_graft_path(dent)) sdcardfs_put_orig_path(dent, real_lower); else sdcardfs_put_lower_path(dent, real_lower); @@ -460,7 +467,7 @@ extern struct list_head sdcardfs_super_list; extern appid_t get_appid(const char *app_name); extern appid_t get_ext_gid(const char *app_name); extern appid_t is_excluded(const char *app_name, userid_t userid); -extern int check_caller_access_to_name(struct inode *parent_node, const struct qstr* name); +extern int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name); extern int open_flags_to_access_mode(int open_flags); extern int packagelist_init(void); extern void packagelist_exit(void); @@ -481,7 +488,7 @@ extern void get_derived_permission_new(struct dentry *parent, struct dentry *den extern void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit); extern void update_derived_permission_lock(struct dentry *dentry); -void fixup_lower_ownership(struct dentry* dentry, const char *name); +void fixup_lower_ownership(struct dentry *dentry, const char *name); extern int need_graft_path(struct dentry *dentry); extern int is_base_obbpath(struct dentry *dentry); extern int is_obbpath_invalid(struct dentry *dentry); @@ -491,6 +498,7 @@ extern int setup_obb_dentry(struct dentry *dentry, struct path *lower_path); static inline struct dentry *lock_parent(struct dentry *dentry) { struct dentry *dir = dget_parent(dentry); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); return dir; } diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index edda32b68dc0..a4629f20812e 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -36,7 +36,7 @@ static void sdcardfs_put_super(struct super_block *sb) if (!spd) return; - if(spd->obbpath_s) { + if (spd->obbpath_s) { kfree(spd->obbpath_s); path_put(&spd->obbpath); } @@ -125,29 +125,33 @@ static int sdcardfs_remount_fs2(struct vfsmount *mnt, struct super_block *sb, * SILENT, but anything else left over is an error. 
*/ if ((*flags & ~(MS_RDONLY | MS_MANDLOCK | MS_SILENT | MS_REMOUNT)) != 0) { - printk(KERN_ERR - "sdcardfs: remount flags 0x%x unsupported\n", *flags); + pr_err("sdcardfs: remount flags 0x%x unsupported\n", *flags); err = -EINVAL; } - printk(KERN_INFO "Remount options were %s for vfsmnt %p.\n", options, mnt); + pr_info("Remount options were %s for vfsmnt %p.\n", options, mnt); err = parse_options_remount(sb, options, *flags & ~MS_SILENT, mnt->data); return err; } -static void* sdcardfs_clone_mnt_data(void *data) { - struct sdcardfs_vfsmount_options* opt = kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL); - struct sdcardfs_vfsmount_options* old = data; - if(!opt) return NULL; +static void *sdcardfs_clone_mnt_data(void *data) +{ + struct sdcardfs_vfsmount_options *opt = kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL); + struct sdcardfs_vfsmount_options *old = data; + + if (!opt) + return NULL; opt->gid = old->gid; opt->mask = old->mask; return opt; } -static void sdcardfs_copy_mnt_data(void *data, void *newdata) { - struct sdcardfs_vfsmount_options* old = data; - struct sdcardfs_vfsmount_options* new = newdata; +static void sdcardfs_copy_mnt_data(void *data, void *newdata) +{ + struct sdcardfs_vfsmount_options *old = data; + struct sdcardfs_vfsmount_options *new = newdata; + old->gid = new->gid; old->mask = new->mask; } @@ -235,7 +239,8 @@ static void sdcardfs_umount_begin(struct super_block *sb) lower_sb->s_op->umount_begin(lower_sb); } -static int sdcardfs_show_options(struct vfsmount *mnt, struct seq_file *m, struct dentry *root) +static int sdcardfs_show_options(struct vfsmount *mnt, struct seq_file *m, + struct dentry *root) { struct sdcardfs_sb_info *sbi = SDCARDFS_SB(root->d_sb); struct sdcardfs_mount_options *opts = &sbi->options; From 9eb0b8ba1c9248d3e76f170265be0162026ea5c2 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 16 Mar 2017 19:33:35 -0700 Subject: [PATCH 0718/3220] ANDROID: sdcardfs: Fix style issues with comments Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: I8791ef7eac527645ecb9407908e7e5ece35b8f80 --- fs/sdcardfs/dentry.c | 16 +--------------- fs/sdcardfs/derived_perm.c | 9 +++------ fs/sdcardfs/main.c | 2 +- 3 files changed, 5 insertions(+), 22 deletions(-) diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index aaa15dff6f51..8e31d1a80f0c 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -127,12 +127,10 @@ static int sdcardfs_hash_ci(const struct dentry *dentry, unsigned long hash; name = qstr->name; - //len = vfat_striptail_len(qstr); len = qstr->len; hash = init_name_hash(); while (len--) - //hash = partial_name_hash(nls_tolower(t, *name++), hash); hash = partial_name_hash(tolower(*name++), hash); qstr->hash = end_name_hash(hash); @@ -146,20 +144,8 @@ static int sdcardfs_cmp_ci(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - /* This function is copy of vfat_cmpi */ - // FIXME Should we support national language? - //struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io; - //unsigned int alen, blen; + /* FIXME Should we support national language? */ - /* A filename cannot end in '.' 
or we treat it like it has none */ - /* - alen = vfat_striptail_len(name); - blen = __vfat_striptail_len(len, str); - if (alen == blen) { - if (nls_strnicmp(t, name->name, str, alen) == 0) - return 0; - } - */ if (name->len == len) { if (str_n_case_eq(name->name, str, len)) return 0; diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 51fa565742bd..748eb3741b3b 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -325,9 +325,7 @@ inline void update_derived_permission_lock(struct dentry *dentry) * 1. need to check whether the dentry is updated or not * 2. remove the root dentry update */ - if (IS_ROOT(dentry)) { - //setup_default_pre_root_state(d_inode(dentry)); - } else { + if (!IS_ROOT(dentry)) { parent = dget_parent(dentry); if (parent) { get_derived_permission(parent, dentry); @@ -377,7 +375,6 @@ int is_obbpath_invalid(struct dentry *dent) ret = 1; } else { path_get(&di->lower_path); - //lower_parent = lock_parent(lower_path->dentry); path_buf = kmalloc(PATH_MAX, GFP_ATOMIC); if (!path_buf) { @@ -392,7 +389,6 @@ int is_obbpath_invalid(struct dentry *dent) kfree(path_buf); } - //unlock_dir(lower_parent); pathcpy(&lower_path, &di->lower_path); need_put = 1; } @@ -438,7 +434,8 @@ int setup_obb_dentry(struct dentry *dentry, struct path *lower_path) /* A local obb dentry must have its own orig_path to support rmdir * and mkdir of itself. Usually, we expect that the sbi->obbpath - * is avaiable on this stage. */ + * is avaiable on this stage. + */ sdcardfs_set_orig_path(dentry, lower_path); err = kern_path(sbi->obbpath_s, diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index c249bd73b121..95cf03379019 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -29,7 +29,7 @@ enum { Opt_gid, Opt_debug, Opt_mask, - Opt_multiuser, // May need? + Opt_multiuser, Opt_userid, Opt_reserved_mb, Opt_err, From 1ed9910bedd662fe74d213b6e4333d1f08dfdb90 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 21 Mar 2017 16:29:13 -0700 Subject: [PATCH 0719/3220] ANDROID: sdcardfs: remove unneeded null check As pointed out by checkpatch, these functions already handle null inputs, so the checks are not needed. 
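(Editor's aside, illustrative only and not part of this commit: the contract relied on here is that kmem_cache_destroy(), like kfree(), is a no-op when passed NULL, so paired init/exit code can call it unconditionally. A minimal sketch, with a hypothetical cache and object name:)

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>

struct example_obj { int val; };

static struct kmem_cache *example_cachep;	/* hypothetical cache */

static int __init example_init(void)
{
	example_cachep = kmem_cache_create("example_obj",
					   sizeof(struct example_obj),
					   0, 0, NULL);
	return example_cachep ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
	/* No NULL guard needed: kmem_cache_destroy() handles it. */
	kmem_cache_destroy(example_cachep);
}

With that contract, an "if (cachep)" guard around the destroy call is pure noise, which is exactly what checkpatch flags and what the hunks below delete.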
Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: I189342f032dfcefee36b27648bb512488ad61d20 --- fs/sdcardfs/lookup.c | 3 +-- fs/sdcardfs/packagelist.c | 3 +-- fs/sdcardfs/super.c | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index fab9fd9d0105..51a904e59f46 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -36,8 +36,7 @@ int sdcardfs_init_dentry_cache(void) void sdcardfs_destroy_dentry_cache(void) { - if (sdcardfs_dentry_cachep) - kmem_cache_destroy(sdcardfs_dentry_cachep); + kmem_cache_destroy(sdcardfs_dentry_cachep); } void free_dentry_private_data(struct dentry *dentry) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 58218fed4083..3c016cb0d8bb 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -889,6 +889,5 @@ void packagelist_exit(void) { configfs_sdcardfs_exit(); packagelist_destroy(); - if (hashtable_entry_cachep) - kmem_cache_destroy(hashtable_entry_cachep); + kmem_cache_destroy(hashtable_entry_cachep); } diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index a4629f20812e..7d3c331379f5 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -222,8 +222,7 @@ int sdcardfs_init_inode_cache(void) /* sdcardfs inode cache destructor */ void sdcardfs_destroy_inode_cache(void) { - if (sdcardfs_inode_cachep) - kmem_cache_destroy(sdcardfs_inode_cachep); + kmem_cache_destroy(sdcardfs_inode_cachep); } /* From b7dbda19b8cebde338a9034d4706519c93c25a39 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 16 Mar 2017 17:46:13 -0700 Subject: [PATCH 0720/3220] ANDROID: sdcardfs: Use pr_[...] instead of printk Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: Ibc635ec865750530d32b87067779f681fe58a003 --- fs/sdcardfs/derived_perm.c | 6 ++--- fs/sdcardfs/file.c | 9 ++++---- fs/sdcardfs/inode.c | 8 +++---- fs/sdcardfs/lookup.c | 2 +- fs/sdcardfs/main.c | 45 +++++++++++++++++--------------------- fs/sdcardfs/packagelist.c | 16 +++++++------- fs/sdcardfs/sdcardfs.h | 2 +- fs/sdcardfs/super.c | 5 ++--- 8 files changed, 43 insertions(+), 50 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 748eb3741b3b..eabdbfd30054 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -318,7 +318,7 @@ inline void update_derived_permission_lock(struct dentry *dentry) struct dentry *parent; if (!dentry || !d_inode(dentry)) { - printk(KERN_ERR "sdcardfs: %s: invalid dentry\n", __func__); + pr_err("sdcardfs: %s: invalid dentry\n", __func__); return; } /* FIXME: @@ -379,7 +379,7 @@ int is_obbpath_invalid(struct dentry *dent) path_buf = kmalloc(PATH_MAX, GFP_ATOMIC); if (!path_buf) { ret = 1; - printk(KERN_ERR "sdcardfs: fail to allocate path_buf in %s.\n", __func__); + pr_err("sdcardfs: fail to allocate path_buf in %s.\n", __func__); } else { obbpath_s = d_path(&di->lower_path, path_buf, PATH_MAX); if (d_unhashed(di->lower_path.dentry) || @@ -451,7 +451,7 @@ int setup_obb_dentry(struct dentry *dentry, struct path *lower_path) * because the sdcard daemon also regards this case as * a lookup fail. 
*/ - printk(KERN_INFO "sdcardfs: the sbi->obbpath is not available\n"); + pr_info("sdcardfs: the sbi->obbpath is not available\n"); } return err; } diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 4c9726606c0f..eee4eb5896e5 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -65,7 +65,7 @@ static ssize_t sdcardfs_write(struct file *file, const char __user *buf, /* check disk space */ if (!check_min_free_space(dentry, count, 0)) { - printk(KERN_INFO "No minimum free space.\n"); + pr_err("No minimum free space.\n"); return -ENOSPC; } @@ -160,8 +160,7 @@ static int sdcardfs_mmap(struct file *file, struct vm_area_struct *vma) lower_file = sdcardfs_lower_file(file); if (willwrite && !lower_file->f_mapping->a_ops->writepage) { err = -EINVAL; - printk(KERN_ERR "sdcardfs: lower file system does not " - "support writeable mmap\n"); + pr_err("sdcardfs: lower file system does not support writeable mmap\n"); goto out; } @@ -173,14 +172,14 @@ static int sdcardfs_mmap(struct file *file, struct vm_area_struct *vma) if (!SDCARDFS_F(file)->lower_vm_ops) { err = lower_file->f_op->mmap(lower_file, vma); if (err) { - printk(KERN_ERR "sdcardfs: lower mmap failed %d\n", err); + pr_err("sdcardfs: lower mmap failed %d\n", err); goto out; } saved_vm_ops = vma->vm_ops; /* save: came from lower ->mmap */ err = do_munmap(current->mm, vma->vm_start, vma->vm_end - vma->vm_start); if (err) { - printk(KERN_ERR "sdcardfs: do_munmap failed %d\n", err); + pr_err("sdcardfs: do_munmap failed %d\n", err); goto out; } } diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index ce187d865f29..f15cb11ca8fd 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -248,7 +248,7 @@ static int touch(char *abs_path, mode_t mode) if (PTR_ERR(filp) == -EEXIST) { return 0; } else { - printk(KERN_ERR "sdcardfs: failed to open(%s): %ld\n", + pr_err("sdcardfs: failed to open(%s): %ld\n", abs_path, PTR_ERR(filp)); return PTR_ERR(filp); } @@ -284,7 +284,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode /* check disk space */ if (!check_min_free_space(dentry, 0, 1)) { - printk(KERN_INFO "sdcardfs: No minimum free space.\n"); + pr_err("sdcardfs: No minimum free space.\n"); err = -ENOSPC; goto out_revert; } @@ -360,7 +360,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode set_fs_pwd(current->fs, &lower_path); touch_err = touch(".nomedia", 0664); if (touch_err) { - printk(KERN_ERR "sdcardfs: failed to create .nomedia in %s: %d\n", + pr_err("sdcardfs: failed to create .nomedia in %s: %d\n", lower_path.dentry->d_name.name, touch_err); goto out; } @@ -642,7 +642,7 @@ static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int ma release_top(SDCARDFS_I(inode)); tmp.i_sb = inode->i_sb; if (IS_POSIXACL(inode)) - printk(KERN_WARNING "%s: This may be undefined behavior... \n", __func__); + pr_warn("%s: This may be undefined behavior...\n", __func__); err = generic_permission(&tmp, mask); /* XXX * Original sdcardfs code calls inode_permission(lower_inode,.. ) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 51a904e59f46..cfa2b12b6411 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -323,7 +323,7 @@ put_name: * because the sdcard daemon also regards this case as * a lookup fail. 
*/ - printk(KERN_INFO "sdcardfs: base obbpath is not available\n"); + pr_info("sdcardfs: base obbpath is not available\n"); sdcardfs_put_reset_orig_path(dentry); goto out; } diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 95cf03379019..7344635522cd 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -117,19 +117,17 @@ static int parse_options(struct super_block *sb, char *options, int silent, break; /* unknown option */ default: - if (!silent) { - printk( KERN_ERR "Unrecognized mount option \"%s\" " - "or missing value", p); - } + if (!silent) + pr_err("Unrecognized mount option \"%s\" or missing value", p); return -EINVAL; } } if (*debug) { - printk( KERN_INFO "sdcardfs : options - debug:%d\n", *debug); - printk( KERN_INFO "sdcardfs : options - uid:%d\n", + pr_info("sdcardfs : options - debug:%d\n", *debug); + pr_info("sdcardfs : options - uid:%d\n", opts->fs_low_uid); - printk( KERN_INFO "sdcardfs : options - gid:%d\n", + pr_info("sdcardfs : options - gid:%d\n", opts->fs_low_gid); } @@ -175,22 +173,20 @@ int parse_options_remount(struct super_block *sb, char *options, int silent, case Opt_fsuid: case Opt_fsgid: case Opt_reserved_mb: - printk( KERN_WARNING "Option \"%s\" can't be changed during remount\n", p); + pr_warn("Option \"%s\" can't be changed during remount\n", p); break; /* unknown option */ default: - if (!silent) { - printk( KERN_ERR "Unrecognized mount option \"%s\" " - "or missing value", p); - } + if (!silent) + pr_err("Unrecognized mount option \"%s\" or missing value", p); return -EINVAL; } } if (debug) { - printk( KERN_INFO "sdcardfs : options - debug:%d\n", debug); - printk( KERN_INFO "sdcardfs : options - gid:%d\n", vfsopts->gid); - printk( KERN_INFO "sdcardfs : options - mask:%d\n", vfsopts->mask); + pr_info("sdcardfs : options - debug:%d\n", debug); + pr_info("sdcardfs : options - gid:%d\n", vfsopts->gid); + pr_info("sdcardfs : options - mask:%d\n", vfsopts->mask); } return 0; @@ -244,31 +240,30 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, struct sdcardfs_vfsmount_options *mnt_opt = mnt->data; struct inode *inode; - printk(KERN_INFO "sdcardfs version 2.0\n"); + pr_info("sdcardfs version 2.0\n"); if (!dev_name) { - printk(KERN_ERR - "sdcardfs: read_super: missing dev_name argument\n"); + pr_err("sdcardfs: read_super: missing dev_name argument\n"); err = -EINVAL; goto out; } - printk(KERN_INFO "sdcardfs: dev_name -> %s\n", dev_name); - printk(KERN_INFO "sdcardfs: options -> %s\n", (char *)raw_data); - printk(KERN_INFO "sdcardfs: mnt -> %p\n", mnt); + pr_info("sdcardfs: dev_name -> %s\n", dev_name); + pr_info("sdcardfs: options -> %s\n", (char *)raw_data); + pr_info("sdcardfs: mnt -> %p\n", mnt); /* parse lower path */ err = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &lower_path); if (err) { - printk(KERN_ERR "sdcardfs: error accessing lower directory '%s'\n", dev_name); + pr_err("sdcardfs: error accessing lower directory '%s'\n", dev_name); goto out; } /* allocate superblock private data */ sb->s_fs_info = kzalloc(sizeof(struct sdcardfs_sb_info), GFP_KERNEL); if (!SDCARDFS_SB(sb)) { - printk(KERN_CRIT "sdcardfs: read_super: out of memory\n"); + pr_crit("sdcardfs: read_super: out of memory\n"); err = -ENOMEM; goto out_free; } @@ -277,7 +272,7 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, /* parse options */ err = parse_options(sb, raw_data, silent, &debug, mnt_opt, &sb_info->options); if (err) { - printk(KERN_ERR "sdcardfs: invalid options\n"); + pr_err("sdcardfs: invalid 
options\n"); goto out_freesbi; } @@ -347,7 +342,7 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, mutex_unlock(&sdcardfs_super_list_lock); if (!silent) - printk(KERN_INFO "sdcardfs: mounted on top of %s type %s\n", + pr_info("sdcardfs: mounted on top of %s type %s\n", dev_name, lower_sb->s_type->name); goto out; /* all is well */ diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 3c016cb0d8bb..89196e31073e 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -471,7 +471,7 @@ static void packagelist_destroy(void) hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) free_hashtable_entry(hash_cur); mutex_unlock(&sdcardfs_super_list_lock); - printk(KERN_INFO "sdcardfs: destroyed packagelist pkgld\n"); + pr_info("sdcardfs: destroyed packagelist pkgld\n"); } #define SDCARDFS_CONFIGFS_ATTR(_pfx, _name) \ @@ -587,7 +587,8 @@ static ssize_t package_details_clear_userid_store(struct config_item *item, static void package_details_release(struct config_item *item) { struct package_details *package_details = to_package_details(item); - printk(KERN_INFO "sdcardfs: removing %s\n", package_details->name.name); + + pr_info("sdcardfs: removing %s\n", package_details->name.name); remove_packagelist_entry(&package_details->name); kfree(package_details->name.name); kfree(package_details); @@ -639,7 +640,7 @@ static void extension_details_release(struct config_item *item) { struct extension_details *extension_details = to_extension_details(item); - printk(KERN_INFO "sdcardfs: No longer mapping %s files to gid %d\n", + pr_info("sdcardfs: No longer mapping %s files to gid %d\n", extension_details->name.name, extension_details->num); remove_ext_gid_entry(&extension_details->name, extension_details->num); kfree(extension_details->name.name); @@ -717,7 +718,7 @@ static void extensions_drop_group(struct config_group *group, struct config_item { struct extensions_value *value = to_extensions_value(item); - printk(KERN_INFO "sdcardfs: No longer mapping any files to gid %d\n", value->num); + pr_info("sdcardfs: No longer mapping any files to gid %d\n", value->num); kfree(value); } @@ -852,14 +853,13 @@ static int configfs_sdcardfs_init(void) int ret, i; struct configfs_subsystem *subsys = &sdcardfs_packages; - for (i = 0; sd_default_groups[i]; i++) { + for (i = 0; sd_default_groups[i]; i++) config_group_init(sd_default_groups[i]); - } config_group_init(&subsys->su_group); mutex_init(&subsys->su_mutex); ret = configfs_register_subsystem(subsys); if (ret) { - printk(KERN_ERR "Error %d while registering subsystem %s\n", + pr_err("Error %d while registering subsystem %s\n", ret, subsys->su_group.cg_item.ci_namebuf); } @@ -877,7 +877,7 @@ int packagelist_init(void) kmem_cache_create("packagelist_hashtable_entry", sizeof(struct hashtable_entry), 0, 0, NULL); if (!hashtable_entry_cachep) { - printk(KERN_ERR "sdcardfs: failed creating pkgl_hashtable entry slab cache\n"); + pr_err("sdcardfs: failed creating pkgl_hashtable entry slab cache\n"); return -ENOMEM; } diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index d1b86fff1b70..a9c5a8345d09 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -53,7 +53,7 @@ #define SDCARDFS_ROOT_INO 1 /* useful for tracking code reachability */ -#define UDBG printk(KERN_DEFAULT "DBG:%s:%s:%d\n", __FILE__, __func__, __LINE__) +#define UDBG pr_default("DBG:%s:%s:%d\n", __FILE__, __func__, __LINE__) #define SDCARDFS_DIRENT_SIZE 256 diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index 
7d3c331379f5..67b4ae24337a 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -64,7 +64,7 @@ static int sdcardfs_statfs(struct dentry *dentry, struct kstatfs *buf) if (sbi->options.reserved_mb) { /* Invalid statfs informations. */ if (buf->f_bsize == 0) { - printk(KERN_ERR "Returned block size is zero.\n"); + pr_err("Returned block size is zero.\n"); return -EINVAL; } @@ -100,8 +100,7 @@ static int sdcardfs_remount_fs(struct super_block *sb, int *flags, char *options * SILENT, but anything else left over is an error. */ if ((*flags & ~(MS_RDONLY | MS_MANDLOCK | MS_SILENT)) != 0) { - printk(KERN_ERR - "sdcardfs: remount flags 0x%x unsupported\n", *flags); + pr_err("sdcardfs: remount flags 0x%x unsupported\n", *flags); err = -EINVAL; } From a9a3f48222dbc841998989c8b363e86d21b1379b Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 16 Mar 2017 19:32:59 -0700 Subject: [PATCH 0721/3220] ANDROID: sdcardfs: Use kstrtoul Switch from the deprecated simple_strtoul to kstrtoul Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: If18bd133b4d2877f71e58b58fc31371ff6613ed5 --- fs/sdcardfs/derived_perm.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index eabdbfd30054..f82fedd29007 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -60,6 +60,8 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, struct sdcardfs_inode_info *info = SDCARDFS_I(d_inode(dentry)); struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent)); appid_t appid; + unsigned long user_num; + int err; struct qstr q_Android = QSTR_LITERAL("Android"); struct qstr q_data = QSTR_LITERAL("data"); struct qstr q_obb = QSTR_LITERAL("obb"); @@ -88,7 +90,11 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, case PERM_PRE_ROOT: /* Legacy internal layout places users at top level */ info->perm = PERM_ROOT; - info->userid = simple_strtoul(name->name, NULL, 10); + err = kstrtoul(name->name, 10, &user_num); + if (err) + info->userid = 0; + else + info->userid = user_num; set_top(info, &info->vfs_inode); break; case PERM_ROOT: From 2b37dac9abaa183633ee933517c8c6e9d8cc3477 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 21 Mar 2017 17:27:40 -0700 Subject: [PATCH 0722/3220] ANDROID: sdcardfs: Use seq_puts over seq_printf Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: I3795ec61ce61e324738815b1ce3b0e09b25d723f --- fs/sdcardfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index 67b4ae24337a..a3393e959c63 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -251,7 +251,7 @@ static int sdcardfs_show_options(struct vfsmount *mnt, struct seq_file *m, if (vfsopts->gid != 0) seq_printf(m, ",gid=%u", vfsopts->gid); if (opts->multiuser) - seq_printf(m, ",multiuser"); + seq_puts(m, ",multiuser"); if (vfsopts->mask) seq_printf(m, ",mask=%u", vfsopts->mask); if (opts->fs_user_id) From ff9fa56a43ea617f6d0e028a41cc79760bd4dcff Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 21 Mar 2017 19:11:38 -0700 Subject: [PATCH 0723/3220] ANDROID: sdcardfs: Fix style issues in macros Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: I89c4035029dc2236081a7685c55cac595d9e7ebf --- fs/sdcardfs/sdcardfs.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index a9c5a8345d09..2b67b9a8ef9f 100644 ---
a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -96,21 +96,21 @@ * placed at the beginning of a function, right after variable declaration. */ #define OVERRIDE_CRED(sdcardfs_sbi, saved_cred, info) \ + do { \ saved_cred = override_fsids(sdcardfs_sbi, info); \ - if (!saved_cred) { return -ENOMEM; } + if (!saved_cred) \ + return -ENOMEM; \ + } while (0) #define OVERRIDE_CRED_PTR(sdcardfs_sbi, saved_cred, info) \ + do { \ saved_cred = override_fsids(sdcardfs_sbi, info); \ - if (!saved_cred) { return ERR_PTR(-ENOMEM); } + if (!saved_cred) \ + return ERR_PTR(-ENOMEM); \ + } while (0) #define REVERT_CRED(saved_cred) revert_fsids(saved_cred) -#define DEBUG_CRED() \ - printk("KAKJAGI: %s:%d fsuid %d fsgid %d\n", \ - __FUNCTION__, __LINE__, \ - (int)current->cred->fsuid, \ - (int)current->cred->fsgid); - /* Android 5.0 support */ /* Permission mode for a specific node. Controls how file permissions From e1feee06592832d3b9f51e0649b18d34f0a5e16e Mon Sep 17 00:00:00 2001 From: Prasanna Karthik Date: Tue, 8 Dec 2015 17:30:25 +0100 Subject: [PATCH 0724/3220] UPSTREAM: ARM: 8476/1: VDSO: use PTR_ERR_OR_ZERO for vma check (cherry pick from commit 38fc2f6c98262913388de338d5b0cda67e3f78cd) Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Signed-off-by: Prasanna Karthik Signed-off-by: Nathan Lynch Signed-off-by: Russell King Bug: 20045882 Bug: 19198045 Change-Id: Id4838948c19f031a66af1654a4c47668288a0037 --- arch/arm/kernel/vdso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 54a5aeab988d..994e971a8538 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -224,7 +224,7 @@ static int install_vvar(struct mm_struct *mm, unsigned long addr) VM_READ | VM_MAYREAD, &vdso_data_mapping); - return IS_ERR(vma) ? 
PTR_ERR(vma) : 0; + return PTR_ERR_OR_ZERO(vma); } /* assumes mmap_sem is write-locked */ From 510b819279cb6d4b803059ef66ec1bc05e187a04 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 14 Jan 2016 15:18:33 -0800 Subject: [PATCH 0725/3220] UPSTREAM: mm: add PHYS_PFN, use it in __phys_to_pfn() (cherry pick from commit 8f235d1a3eb7198affe7cadf676a10afb8a46a1a) __phys_to_pfn and __pfn_to_phys are symmetric, PHYS_PFN and PFN_PHYS are symmetric: - y = (phys_addr_t)x << PAGE_SHIFT - y >> PAGE_SHIFT = (phys_addr_t)x - (unsigned long)(y >> PAGE_SHIFT) = x [akpm@linux-foundation.org: use macro arg name `x'] [arnd@arndb.de: include linux/pfn.h for PHYS_PFN definition] Signed-off-by: Chen Gang Cc: Oleg Nesterov Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Bug: 20045882 Bug: 19198045 Change-Id: If968d2246b381b9e5d6446e9d6d9fa45bb718e91 --- include/asm-generic/memory_model.h | 4 +++- include/linux/pfn.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index 4b4b056a6eb0..5148150cc80b 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -1,6 +1,8 @@ #ifndef __ASM_MEMORY_MODEL_H #define __ASM_MEMORY_MODEL_H +#include <linux/pfn.h> + #ifndef __ASSEMBLY__ #if defined(CONFIG_FLATMEM) @@ -72,7 +74,7 @@ /* * Convert a physical address to a Page Frame Number and back */ -#define __phys_to_pfn(paddr) ((unsigned long)((paddr) >> PAGE_SHIFT)) +#define __phys_to_pfn(paddr) PHYS_PFN(paddr) #define __pfn_to_phys(pfn) PFN_PHYS(pfn) #define page_to_pfn __page_to_pfn diff --git a/include/linux/pfn.h b/include/linux/pfn.h index 7646637221f3..97f3e88aead4 100644 --- a/include/linux/pfn.h +++ b/include/linux/pfn.h @@ -9,5 +9,6 @@ #define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) #define PFN_DOWN(x) ((x) >> PAGE_SHIFT) #define PFN_PHYS(x) ((phys_addr_t)(x) << PAGE_SHIFT) +#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT)) #endif From f186947ce6fb40a19ac180d819b7c5e733a3a229 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 13 Mar 2016 09:13:55 +0900 Subject: [PATCH 0726/3220] UPSTREAM: kbuild: drop FORCE from PHONY targets (cherry pick from commit 2e8d696b79e9c68d3005a9b09a8c72625d141ea6) These targets are marked as PHONY. No need to add FORCE to their dependency. Signed-off-by: Masahiro Yamada Signed-off-by: Michal Marek Bug: 20045882 Bug: 19198045 Change-Id: I718d46c339c99418d543cc64cabc851307e5abb7 --- Makefile | 8 ++++---- arch/arm/vdso/Makefile | 2 +- arch/ia64/Makefile | 4 ++-- arch/x86/entry/vdso/Makefile | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 3efe2ea99e2d..6450e18ebd7a 100644 --- a/Makefile +++ b/Makefile @@ -146,7 +146,7 @@ PHONY += $(MAKECMDGOALS) sub-make $(filter-out _all sub-make $(CURDIR)/Makefile, $(MAKECMDGOALS)) _all: sub-make @: -sub-make: FORCE +sub-make: $(Q)$(MAKE) -C $(KBUILD_OUTPUT) KBUILD_SRC=$(CURDIR) \ -f $(CURDIR)/Makefile $(filter-out _all sub-make,$(MAKECMDGOALS)) @@ -1000,7 +1000,7 @@ prepare1: prepare2 $(version_h) include/generated/utsrelease.h \ archprepare: archheaders archscripts prepare1 scripts_basic -prepare0: archprepare FORCE +prepare0: archprepare $(Q)$(MAKE) $(build)=. # All the preparing..
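# Editor's aside, illustrative only and not part of this patch: a target
# accumulated into PHONY is always considered out of date, so make runs
# its recipe on every invocation; listing the empty FORCE target as a
# prerequisite on top of that is redundant, which is all this cleanup
# removes. A minimal sketch with hypothetical target names:
#
#   PHONY += install
#   install:                  # no FORCE needed; PHONY already forces it
#   	$(Q)cp prog $(DESTDIR)/bin/prog
#
# FORCE remains the right tool for real-file targets whose recipes must
# be re-evaluated on every run, e.g. those guarded by if_changed:
#
#   prog: FORCE
#   	$(call if_changed,link)
#   FORCE: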
@@ -1045,7 +1045,7 @@ INSTALL_FW_PATH=$(INSTALL_MOD_PATH)/lib/firmware export INSTALL_FW_PATH PHONY += firmware_install -firmware_install: FORCE +firmware_install: @mkdir -p $(objtree)/firmware $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_install @@ -1065,7 +1065,7 @@ PHONY += archscripts archscripts: PHONY += __headers -__headers: $(version_h) scripts_basic asm-generic archheaders archscripts FORCE +__headers: $(version_h) scripts_basic asm-generic archheaders archscripts $(Q)$(MAKE) $(build)=scripts build_unifdef PHONY += headers_install_all diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index 1160434eece0..59a8fa7b8a3b 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -74,5 +74,5 @@ $(MODLIB)/vdso: FORCE @mkdir -p $(MODLIB)/vdso PHONY += vdso_install -vdso_install: $(obj)/vdso.so.dbg $(MODLIB)/vdso FORCE +vdso_install: $(obj)/vdso.so.dbg $(MODLIB)/vdso $(call cmd,vdso_install) diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile index 970d0bd99621..648f1cef33fa 100644 --- a/arch/ia64/Makefile +++ b/arch/ia64/Makefile @@ -95,8 +95,8 @@ define archhelp echo '* unwcheck - Check vmlinux for invalid unwind info' endef -archprepare: make_nr_irqs_h FORCE +archprepare: make_nr_irqs_h PHONY += make_nr_irqs_h FORCE -make_nr_irqs_h: FORCE +make_nr_irqs_h: $(Q)$(MAKE) $(build)=arch/ia64/kernel include/generated/nr-irqs.h diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 265c0ed68118..7af017a8958f 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -187,10 +187,10 @@ vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%) $(MODLIB)/vdso: FORCE @mkdir -p $(MODLIB)/vdso -$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE +$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso $(call cmd,vdso_install) PHONY += vdso_install $(vdso_img_insttargets) -vdso_install: $(vdso_img_insttargets) FORCE +vdso_install: $(vdso_img_insttargets) clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so* From fb47dee8008c8424ab54543bb92f95505faca66d Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 12 May 2016 17:39:15 +0100 Subject: [PATCH 0727/3220] UPSTREAM: arm64: fix vdso-offsets.h dependency (cherry pick from commit a66649dab35033bd67988670fa60c268b0444cda) arm64/kernel/{vdso,signal}.c include vdso-offsets.h, as well as any file that includes asm/vdso.h. Therefore, vdso-offsets.h must be generated before these files are compiled. The current rules in arm64/kernel/Makefile do not actually enforce this, because even though $(obj)/vdso is listed as a prerequisite for vdso-offsets.h, this does not result in the intended effect of building the vdso subdirectory (before all the other objects). As a consequence, depending on the order in which the rules are followed, vdso-offsets.h is updated or not before arm64/kernel/{vdso,signal}.o are built. The current rules also impose an unnecessary dependency on vdso-offsets.h for all arm64/kernel/*.o, resulting in unnecessary rebuilds. This is made obvious when using make -j: touch arch/arm64/kernel/vdso/gettimeofday.S && make -j$NCPUS arch/arm64/kernel will sometimes result in none of arm64/kernel/*.o being rebuilt, sometimes all of them, or even just some of them. It is quite difficult to ensure that a header is generated before it is used with recursive Makefiles by using normal rules. 
Instead, arch-specific generated headers are normally built in the archprepare recipe in the arch Makefile (see for instance arch/ia64/Makefile). Unfortunately, asm-offsets.h is included in gettimeofday.S, and must therefore be generated before vdso-offsets.h, which is not the case if archprepare is used. For this reason, a rule run after archprepare has to be used. This commit adds rules in arm64/Makefile to build vdso-offsets.h during the prepare step, ensuring that vdso-offsets.h is generated before building anything. It also removes the now-unnecessary dependencies on vdso-offsets.h in arm64/kernel/Makefile. Finally, it removes the duplication of asm-offsets.h between arm64/kernel/vdso/ and include/generated/ and makes include/generated/vdso-offsets.h a target in arm64/kernel/vdso/Makefile. Cc: Will Deacon Cc: Michal Marek Signed-off-by: Kevin Brodsky Signed-off-by: Catalin Marinas Bug: 20045882 Bug: 19198045 Change-Id: Iaaf1c6d3c98287617fc7f63a736752f6597bc8d0 --- arch/arm64/Makefile | 10 ++++++++++ arch/arm64/kernel/Makefile | 4 ---- arch/arm64/kernel/vdso/Makefile | 7 +++---- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 67074e612db2..5820cff5f843 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -128,6 +128,16 @@ archclean: $(Q)$(MAKE) $(clean)=$(boot) $(Q)$(MAKE) $(clean)=$(boot)/dts +# We need to generate vdso-offsets.h before compiling certain files in kernel/. +# In order to do that, we should use the archprepare target, but we can't since +# asm-offsets.h is included in some files used to generate vdso-offsets.h, and +# asm-offsets.h is built in prepare0, for which archprepare is a dependency. +# Therefore we need to generate the header after prepare0 has been made, hence +# this hack. 
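# Editor's aside, illustrative only and not part of this patch: the two
# rules below are the generic make idiom for "run B after A but before
# C", since prerequisites always complete before their target's recipe:
#
#   C: B              # C is a hook make already runs (here: prepare)
#   B: A              # depending on A guarantees A finishes first
#   	<recipe for B>
#
# Mapped onto this hack: A = prepare0 (produces asm-offsets.h), B =
# vdso_prepare (produces vdso-offsets.h), C = prepare.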
+prepare: vdso_prepare +vdso_prepare: prepare0 + $(Q)$(MAKE) $(build)=arch/arm64/kernel/vdso include/generated/vdso-offsets.h + define archhelp echo '* Image.gz - Compressed kernel image (arch/$(ARCH)/boot/Image.gz)' echo ' Image - Uncompressed kernel image (arch/$(ARCH)/boot/Image)' diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 49a2430b0786..b570e9e36812 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -49,7 +49,3 @@ obj-y += $(arm64-obj-y) vdso/ obj-m += $(arm64-obj-m) head-y := head.o extra-y += $(head-y) vmlinux.lds - -# vDSO - this must be built first to generate the symbol offsets -$(call objectify,$(arm64-obj-y)): $(obj)/vdso/vdso-offsets.h -$(obj)/vdso/vdso-offsets.h: $(obj)/vdso diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index b467fd0a384b..62c84f7cb01b 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -23,7 +23,7 @@ GCOV_PROFILE := n ccflags-y += -Wl,-shared obj-y += vdso.o -extra-y += vdso.lds vdso-offsets.h +extra-y += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) # Force dependency (incbin is bad) @@ -42,11 +42,10 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh quiet_cmd_vdsosym = VDSOSYM $@ define cmd_vdsosym - $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ && \ - cp $@ include/generated/ + $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ endef -$(obj)/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE +include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE $(call if_changed,vdsosym) # Assembly rules for the .S files From 9d2622be9a417c2de47bfe38aa617d1db51de518 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 12 Jul 2016 11:23:59 +0100 Subject: [PATCH 0728/3220] UPSTREAM: arm64: Refactor vDSO time functions (cherry pick from commit b33f491f5a9aaf171b7de0f905362eb0314af478) Time functions are directly implemented in assembly in arm64, and it is desirable to keep it this way for performance reasons (everything fits in registers, so that the stack is not used at all). However, the current implementation is quite difficult to read and understand (even considering it's assembly). Additionally, due to the structure of __kernel_clock_gettime, which heavily uses conditional branches to share code between the different clocks, it is difficult to support a new clock without making the branches even harder to follow. This commit completely refactors the structure of clock_gettime (and gettimeofday along the way) while keeping exactly the same algorithms. We no longer try to share code; instead, macros provide common operations. This new approach comes with a number of advantages: - In clock_gettime, clock implementations are no longer interspersed, making them much more readable. Additionally, macros only use registers passed as arguments or reserved with .req, this way it is easy to make sure that registers are properly allocated. To avoid a large number of branches in a given execution path, a jump table is used; a normal execution uses 3 unconditional branches. - __do_get_tspec has been replaced with 2 macros (get_ts_clock_mono, get_clock_shifted_nsec) and explicit loading of data from the vDSO page. Consequently, clock_gettime and gettimeofday are now leaf functions, and saving x30 (lr) is no longer necessary. - Variables protected by tb_seq_count are now loaded all at once, allowing to merge the seqcnt_read macro into seqcnt_check. 
- For CLOCK_REALTIME_COARSE, removed an unused load of the wall to monotonic timespec. - For CLOCK_MONOTONIC_COARSE, removed a few shift instructions. Obviously, the downside of sharing less code is an increase in code size. However since the vDSO has its own code page, this does not really matter, as long as the size of the DSO remains below 4 kB. For now this should be all right: Before After vdso.so size (B) 2776 3000 Signed-off-by: Kevin Brodsky Reviewed-by: Dave Martin Acked-by: Will Deacon Signed-off-by: Catalin Marinas Bug: 20045882 Bug: 19198045 Change-Id: I0a0036c54419ee561efe3aad97489e7748ddc59e --- arch/arm64/kernel/vdso/gettimeofday.S | 298 +++++++++++++++----------- 1 file changed, 172 insertions(+), 126 deletions(-) diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S index efa79e8d4196..c06919ee29e1 100644 --- a/arch/arm64/kernel/vdso/gettimeofday.S +++ b/arch/arm64/kernel/vdso/gettimeofday.S @@ -26,24 +26,100 @@ #define NSEC_PER_SEC_HI16 0x3b9a vdso_data .req x6 -use_syscall .req w7 -seqcnt .req w8 +seqcnt .req w7 +w_tmp .req w8 +x_tmp .req x8 + +/* + * Conventions for macro arguments: + * - An argument is write-only if its name starts with "res". + * - All other arguments are read-only, unless otherwise specified. + */ .macro seqcnt_acquire 9999: ldr seqcnt, [vdso_data, #VDSO_TB_SEQ_COUNT] tbnz seqcnt, #0, 9999b dmb ishld - ldr use_syscall, [vdso_data, #VDSO_USE_SYSCALL] .endm - .macro seqcnt_read, cnt + .macro seqcnt_check fail dmb ishld - ldr \cnt, [vdso_data, #VDSO_TB_SEQ_COUNT] + ldr w_tmp, [vdso_data, #VDSO_TB_SEQ_COUNT] + cmp w_tmp, seqcnt + b.ne \fail .endm - .macro seqcnt_check, cnt, fail - cmp \cnt, seqcnt - b.ne \fail + .macro syscall_check fail + ldr w_tmp, [vdso_data, #VDSO_USE_SYSCALL] + cbnz w_tmp, \fail + .endm + + .macro get_nsec_per_sec res + mov \res, #NSEC_PER_SEC_LO16 + movk \res, #NSEC_PER_SEC_HI16, lsl #16 + .endm + + /* + * Returns the clock delta, in nanoseconds left-shifted by the clock + * shift. + */ + .macro get_clock_shifted_nsec res, cycle_last, mult + /* Read the virtual counter. */ + isb + mrs x_tmp, cntvct_el0 + /* Calculate cycle delta and convert to ns. */ + sub \res, x_tmp, \cycle_last + /* We can only guarantee 56 bits of precision. */ + movn x_tmp, #0xff00, lsl #48 + and \res, x_tmp, \res + mul \res, \res, \mult + .endm + + /* + * Returns in res_{sec,nsec} the REALTIME timespec, based on the + * "wall time" (xtime) and the clock_mono delta. + */ + .macro get_ts_realtime res_sec, res_nsec, \ + clock_nsec, xtime_sec, xtime_nsec, nsec_to_sec + add \res_nsec, \clock_nsec, \xtime_nsec + udiv x_tmp, \res_nsec, \nsec_to_sec + add \res_sec, \xtime_sec, x_tmp + msub \res_nsec, x_tmp, \nsec_to_sec, \res_nsec + .endm + + /* sec and nsec are modified in place. */ + .macro add_ts sec, nsec, ts_sec, ts_nsec, nsec_to_sec + /* Add timespec. */ + add \sec, \sec, \ts_sec + add \nsec, \nsec, \ts_nsec + + /* Normalise the new timespec. */ + cmp \nsec, \nsec_to_sec + b.lt 9999f + sub \nsec, \nsec, \nsec_to_sec + add \sec, \sec, #1 +9999: + cmp \nsec, #0 + b.ge 9998f + add \nsec, \nsec, \nsec_to_sec + sub \sec, \sec, #1 +9998: + .endm + + .macro clock_gettime_return, shift=0 + .if \shift == 1 + lsr x11, x11, x12 + .endif + stp x10, x11, [x1, #TSPEC_TV_SEC] + mov x0, xzr + ret + .endm + + .macro jump_slot jumptable, index, label + .if (. 
- \jumptable) != 4 * (\index) + .error "Jump slot index mismatch" + .endif + b \label .endm .text @@ -51,18 +127,24 @@ seqcnt .req w8 /* int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz); */ ENTRY(__kernel_gettimeofday) .cfi_startproc - mov x2, x30 - .cfi_register x30, x2 - - /* Acquire the sequence counter and get the timespec. */ adr vdso_data, _vdso_data -1: seqcnt_acquire - cbnz use_syscall, 4f - /* If tv is NULL, skip to the timezone code. */ cbz x0, 2f - bl __do_get_tspec - seqcnt_check w9, 1b + + /* Compute the time of day. */ +1: seqcnt_acquire + syscall_check fail=4f + ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] + ldp w11, w12, [vdso_data, #VDSO_CS_MULT] + ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] + seqcnt_check fail=1b + + get_nsec_per_sec res=x9 + lsl x9, x9, x12 + + get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + get_ts_realtime res_sec=x10, res_nsec=x11, \ + clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 /* Convert ns to us. */ mov x13, #1000 @@ -76,95 +158,102 @@ ENTRY(__kernel_gettimeofday) stp w4, w5, [x1, #TZ_MINWEST] 3: mov x0, xzr - ret x2 + ret 4: /* Syscall fallback. */ mov x8, #__NR_gettimeofday svc #0 - ret x2 + ret .cfi_endproc ENDPROC(__kernel_gettimeofday) +#define JUMPSLOT_MAX CLOCK_MONOTONIC_COARSE + /* int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); */ ENTRY(__kernel_clock_gettime) .cfi_startproc - cmp w0, #CLOCK_REALTIME - ccmp w0, #CLOCK_MONOTONIC, #0x4, ne - b.ne 2f - - mov x2, x30 - .cfi_register x30, x2 - - /* Get kernel timespec. */ + cmp w0, #JUMPSLOT_MAX + b.hi syscall adr vdso_data, _vdso_data -1: seqcnt_acquire - cbnz use_syscall, 7f + adr x_tmp, jumptable + add x_tmp, x_tmp, w0, uxtw #2 + br x_tmp - bl __do_get_tspec - seqcnt_check w9, 1b + ALIGN +jumptable: + jump_slot jumptable, CLOCK_REALTIME, realtime + jump_slot jumptable, CLOCK_MONOTONIC, monotonic + b syscall + b syscall + b syscall + jump_slot jumptable, CLOCK_REALTIME_COARSE, realtime_coarse + jump_slot jumptable, CLOCK_MONOTONIC_COARSE, monotonic_coarse - mov x30, x2 + .if (. - jumptable) != 4 * (JUMPSLOT_MAX + 1) + .error "Wrong jumptable size" + .endif - cmp w0, #CLOCK_MONOTONIC - b.ne 6f + ALIGN +realtime: + seqcnt_acquire + syscall_check fail=syscall + ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] + ldp w11, w12, [vdso_data, #VDSO_CS_MULT] + ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] + seqcnt_check fail=realtime - /* Get wtm timespec. */ - ldp x13, x14, [vdso_data, #VDSO_WTM_CLK_SEC] + /* All computations are done with left-shifted nsecs. */ + get_nsec_per_sec res=x9 + lsl x9, x9, x12 - /* Check the sequence counter. */ - seqcnt_read w9 - seqcnt_check w9, 1b - b 4f -2: - cmp w0, #CLOCK_REALTIME_COARSE - ccmp w0, #CLOCK_MONOTONIC_COARSE, #0x4, ne - b.ne 8f + get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + get_ts_realtime res_sec=x10, res_nsec=x11, \ + clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 + clock_gettime_return, shift=1 - /* xtime_coarse_nsec is already right-shifted */ - mov x12, #0 + ALIGN +monotonic: + seqcnt_acquire + syscall_check fail=syscall + ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] + ldp w11, w12, [vdso_data, #VDSO_CS_MULT] + ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] + ldp x3, x4, [vdso_data, #VDSO_WTM_CLK_SEC] + seqcnt_check fail=monotonic - /* Get coarse timespec. */ - adr vdso_data, _vdso_data -3: seqcnt_acquire + /* All computations are done with left-shifted nsecs. 
*/ + lsl x4, x4, x12 + get_nsec_per_sec res=x9 + lsl x9, x9, x12 + + get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + get_ts_realtime res_sec=x10, res_nsec=x11, \ + clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 + + add_ts sec=x10, nsec=x11, ts_sec=x3, ts_nsec=x4, nsec_to_sec=x9 + clock_gettime_return, shift=1 + + ALIGN +realtime_coarse: + seqcnt_acquire ldp x10, x11, [vdso_data, #VDSO_XTIME_CRS_SEC] + seqcnt_check fail=realtime_coarse + clock_gettime_return - /* Get wtm timespec. */ + ALIGN +monotonic_coarse: + seqcnt_acquire + ldp x10, x11, [vdso_data, #VDSO_XTIME_CRS_SEC] ldp x13, x14, [vdso_data, #VDSO_WTM_CLK_SEC] + seqcnt_check fail=monotonic_coarse - /* Check the sequence counter. */ - seqcnt_read w9 - seqcnt_check w9, 3b + /* Computations are done in (non-shifted) nsecs. */ + get_nsec_per_sec res=x9 + add_ts sec=x10, nsec=x11, ts_sec=x13, ts_nsec=x14, nsec_to_sec=x9 + clock_gettime_return - cmp w0, #CLOCK_MONOTONIC_COARSE - b.ne 6f -4: - /* Add on wtm timespec. */ - add x10, x10, x13 - lsl x14, x14, x12 - add x11, x11, x14 - - /* Normalise the new timespec. */ - mov x15, #NSEC_PER_SEC_LO16 - movk x15, #NSEC_PER_SEC_HI16, lsl #16 - lsl x15, x15, x12 - cmp x11, x15 - b.lt 5f - sub x11, x11, x15 - add x10, x10, #1 -5: - cmp x11, #0 - b.ge 6f - add x11, x11, x15 - sub x10, x10, #1 - -6: /* Store to the user timespec. */ - lsr x11, x11, x12 - stp x10, x11, [x1, #TSPEC_TV_SEC] - mov x0, xzr - ret -7: - mov x30, x2 -8: /* Syscall fallback. */ + ALIGN +syscall: /* Syscall fallback. */ mov x8, #__NR_clock_gettime svc #0 ret @@ -203,46 +292,3 @@ ENTRY(__kernel_clock_getres) .quad CLOCK_COARSE_RES .cfi_endproc ENDPROC(__kernel_clock_getres) - -/* - * Read the current time from the architected counter. - * Expects vdso_data to be initialised. - * Clobbers the temporary registers (x9 - x15). - * Returns: - * - w9 = vDSO sequence counter - * - (x10, x11) = (ts->tv_sec, shifted ts->tv_nsec) - * - w12 = cs_shift - */ -ENTRY(__do_get_tspec) - .cfi_startproc - - /* Read from the vDSO data page. */ - ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] - ldp w11, w12, [vdso_data, #VDSO_CS_MULT] - seqcnt_read w9 - - /* Read the virtual counter. */ - isb - mrs x15, cntvct_el0 - - /* Calculate cycle delta and convert to ns. */ - sub x10, x15, x10 - /* We can only guarantee 56 bits of precision. */ - movn x15, #0xff00, lsl #48 - and x10, x15, x10 - mul x10, x10, x11 - - /* Use the kernel time to calculate the new timespec. */ - mov x11, #NSEC_PER_SEC_LO16 - movk x11, #NSEC_PER_SEC_HI16, lsl #16 - lsl x11, x11, x12 - add x15, x10, x14 - udiv x14, x15, x11 - add x10, x13, x14 - mul x13, x14, x11 - sub x11, x15, x13 - - ret - .cfi_endproc -ENDPROC(__do_get_tspec) From c2e0fb3550184ee2afeb98c7e8fd481ab750679a Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 12 Jul 2016 11:24:00 +0100 Subject: [PATCH 0729/3220] UPSTREAM: arm64: Add support for CLOCK_MONOTONIC_RAW in clock_gettime() vDSO (cherry pick from commit 49eea433b326a0ac5c7c941a011b2c65990bd19b) So far the arm64 clock_gettime() vDSO implementation only supported the following clocks, falling back to the syscall for the others: - CLOCK_REALTIME{,_COARSE} - CLOCK_MONOTONIC{,_COARSE} This patch adds support for the CLOCK_MONOTONIC_RAW clock, taking advantage of the recent refactoring of the vDSO time functions. 
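In C terms, the computation added for CLOCK_MONOTONIC_RAW is roughly the following (an illustrative sketch only, not the implementation: the tb_seq_count retry loop and the syscall fallback are elided, field names mirror struct vdso_data with vd pointing at the vDSO data page and ts at the output timespec, and cntvct() stands in for the isb + mrs cntvct_el0 pair):

	u64 shifted_sec = (u64)NSEC_PER_SEC << vd->cs_shift;
	/* 56-bit cycle delta, converted to left-shifted nanoseconds */
	u64 cycles = (cntvct() - vd->cs_cycle_last) & ((1ULL << 56) - 1);
	u64 nsec = cycles * vd->cs_raw_mult;
	u64 sec = nsec / shifted_sec;

	nsec -= sec * shifted_sec;
	sec += vd->raw_time_sec;
	nsec += (u64)vd->raw_time_nsec << vd->cs_shift;
	if (nsec >= shifted_sec) {		/* normalise */
		nsec -= shifted_sec;
		sec++;
	}
	ts->tv_sec = sec;
	ts->tv_nsec = nsec >> vd->cs_shift;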
Like the non-_COARSE clocks, this only works when the "arch_sys_counter" clocksource is in use (allowing us to read the current time from the virtual counter register), otherwise we also have to fall back to the syscall. Most of the data is shared with CLOCK_MONOTONIC, and the algorithm is similar. The reference implementation in kernel/time/timekeeping.c shows that: - CLOCK_MONOTONIC = tk->wall_to_monotonic + tk->xtime_sec + timekeeping_get_ns(&tk->tkr_mono) - CLOCK_MONOTONIC_RAW = tk->raw_time + timekeeping_get_ns(&tk->tkr_raw) - tkr_mono and tkr_raw are identical (in particular, same clocksource), except these members: * mult (only mono's multiplier is NTP-adjusted) * xtime_nsec (always 0 for raw) Therefore, tk->raw_time and tkr_raw->mult are now also stored in the vDSO data page. Cc: Ali Saidi Signed-off-by: Kevin Brodsky Reviewed-by: Dave Martin Acked-by: Will Deacon Signed-off-by: Catalin Marinas Bug: 20045882 Bug: 19198045 Change-Id: I854adcc1192757a1ac40662e85d466ca709d0682 --- arch/arm64/include/asm/vdso_datapage.h | 8 +++- arch/arm64/kernel/asm-offsets.c | 6 ++- arch/arm64/kernel/vdso.c | 8 +++- arch/arm64/kernel/vdso/gettimeofday.S | 57 +++++++++++++++++++++----- 4 files changed, 64 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/vdso_datapage.h b/arch/arm64/include/asm/vdso_datapage.h index de66199673d7..2b9a63771eda 100644 --- a/arch/arm64/include/asm/vdso_datapage.h +++ b/arch/arm64/include/asm/vdso_datapage.h @@ -22,6 +22,8 @@ struct vdso_data { __u64 cs_cycle_last; /* Timebase at clocksource init */ + __u64 raw_time_sec; /* Raw time */ + __u64 raw_time_nsec; __u64 xtime_clock_sec; /* Kernel time */ __u64 xtime_clock_nsec; __u64 xtime_coarse_sec; /* Coarse time */ @@ -29,8 +31,10 @@ struct vdso_data { __u64 wtm_clock_sec; /* Wall to monotonic time */ __u64 wtm_clock_nsec; __u32 tb_seq_count; /* Timebase sequence counter */ - __u32 cs_mult; /* Clocksource multiplier */ - __u32 cs_shift; /* Clocksource shift */ + /* cs_* members must be adjacent and in this order (ldp accesses) */ + __u32 cs_mono_mult; /* NTP-adjusted clocksource multiplier */ + __u32 cs_shift; /* Clocksource shift (mono = raw) */ + __u32 cs_raw_mult; /* Raw clocksource multiplier */ __u32 tz_minuteswest; /* Whacky timezone stuff */ __u32 tz_dsttime; __u32 use_syscall; diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 40ef661bd3a5..b84d8e85d19d 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -79,6 +79,7 @@ int main(void) BLANK(); DEFINE(CLOCK_REALTIME, CLOCK_REALTIME); DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC); + DEFINE(CLOCK_MONOTONIC_RAW, CLOCK_MONOTONIC_RAW); DEFINE(CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC); DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE); DEFINE(CLOCK_MONOTONIC_COARSE,CLOCK_MONOTONIC_COARSE); @@ -86,6 +87,8 @@ int main(void) DEFINE(NSEC_PER_SEC, NSEC_PER_SEC); BLANK(); DEFINE(VDSO_CS_CYCLE_LAST, offsetof(struct vdso_data, cs_cycle_last)); + DEFINE(VDSO_RAW_TIME_SEC, offsetof(struct vdso_data, raw_time_sec)); + DEFINE(VDSO_RAW_TIME_NSEC, offsetof(struct vdso_data, raw_time_nsec)); DEFINE(VDSO_XTIME_CLK_SEC, offsetof(struct vdso_data, xtime_clock_sec)); DEFINE(VDSO_XTIME_CLK_NSEC, offsetof(struct vdso_data, xtime_clock_nsec)); DEFINE(VDSO_XTIME_CRS_SEC, offsetof(struct vdso_data, xtime_coarse_sec)); @@ -93,7 +96,8 @@ int main(void) DEFINE(VDSO_WTM_CLK_SEC, offsetof(struct vdso_data, wtm_clock_sec)); DEFINE(VDSO_WTM_CLK_NSEC, offsetof(struct vdso_data, wtm_clock_nsec)); DEFINE(VDSO_TB_SEQ_COUNT, 
offsetof(struct vdso_data, tb_seq_count)); - DEFINE(VDSO_CS_MULT, offsetof(struct vdso_data, cs_mult)); + DEFINE(VDSO_CS_MONO_MULT, offsetof(struct vdso_data, cs_mono_mult)); + DEFINE(VDSO_CS_RAW_MULT, offsetof(struct vdso_data, cs_raw_mult)); DEFINE(VDSO_CS_SHIFT, offsetof(struct vdso_data, cs_shift)); DEFINE(VDSO_TZ_MINWEST, offsetof(struct vdso_data, tz_minuteswest)); DEFINE(VDSO_TZ_DSTTIME, offsetof(struct vdso_data, tz_dsttime)); diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 97bc68f4c689..54f7b327fd18 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -212,10 +212,16 @@ void update_vsyscall(struct timekeeper *tk) vdso_data->wtm_clock_nsec = tk->wall_to_monotonic.tv_nsec; if (!use_syscall) { + /* tkr_mono.cycle_last == tkr_raw.cycle_last */ vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last; + vdso_data->raw_time_sec = tk->raw_time.tv_sec; + vdso_data->raw_time_nsec = tk->raw_time.tv_nsec; vdso_data->xtime_clock_sec = tk->xtime_sec; vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; - vdso_data->cs_mult = tk->tkr_mono.mult; + /* tkr_raw.xtime_nsec == 0 */ + vdso_data->cs_mono_mult = tk->tkr_mono.mult; + vdso_data->cs_raw_mult = tk->tkr_raw.mult; + /* tkr_mono.shift == tkr_raw.shift */ vdso_data->cs_shift = tk->tkr_mono.shift; } diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S index c06919ee29e1..e00b4671bd7c 100644 --- a/arch/arm64/kernel/vdso/gettimeofday.S +++ b/arch/arm64/kernel/vdso/gettimeofday.S @@ -87,6 +87,15 @@ x_tmp .req x8 msub \res_nsec, x_tmp, \nsec_to_sec, \res_nsec .endm + /* + * Returns in res_{sec,nsec} the timespec based on the clock_raw delta, + * used for CLOCK_MONOTONIC_RAW. + */ + .macro get_ts_clock_raw res_sec, res_nsec, clock_nsec, nsec_to_sec + udiv \res_sec, \clock_nsec, \nsec_to_sec + msub \res_nsec, \res_sec, \nsec_to_sec, \clock_nsec + .endm + /* sec and nsec are modified in place. */ .macro add_ts sec, nsec, ts_sec, ts_nsec, nsec_to_sec /* Add timespec. 
*/ @@ -135,7 +144,8 @@ ENTRY(__kernel_gettimeofday) 1: seqcnt_acquire syscall_check fail=4f ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - ldp w11, w12, [vdso_data, #VDSO_CS_MULT] + /* w11 = cs_mono_mult, w12 = cs_shift */ + ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] seqcnt_check fail=1b @@ -172,20 +182,20 @@ ENDPROC(__kernel_gettimeofday) /* int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); */ ENTRY(__kernel_clock_gettime) .cfi_startproc - cmp w0, #JUMPSLOT_MAX - b.hi syscall + cmp w0, #JUMPSLOT_MAX + b.hi syscall adr vdso_data, _vdso_data - adr x_tmp, jumptable - add x_tmp, x_tmp, w0, uxtw #2 - br x_tmp + adr x_tmp, jumptable + add x_tmp, x_tmp, w0, uxtw #2 + br x_tmp ALIGN jumptable: jump_slot jumptable, CLOCK_REALTIME, realtime jump_slot jumptable, CLOCK_MONOTONIC, monotonic - b syscall - b syscall - b syscall + b syscall + b syscall + jump_slot jumptable, CLOCK_MONOTONIC_RAW, monotonic_raw jump_slot jumptable, CLOCK_REALTIME_COARSE, realtime_coarse jump_slot jumptable, CLOCK_MONOTONIC_COARSE, monotonic_coarse @@ -198,7 +208,8 @@ realtime: seqcnt_acquire syscall_check fail=syscall ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - ldp w11, w12, [vdso_data, #VDSO_CS_MULT] + /* w11 = cs_mono_mult, w12 = cs_shift */ + ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] seqcnt_check fail=realtime @@ -216,7 +227,8 @@ monotonic: seqcnt_acquire syscall_check fail=syscall ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - ldp w11, w12, [vdso_data, #VDSO_CS_MULT] + /* w11 = cs_mono_mult, w12 = cs_shift */ + ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] ldp x3, x4, [vdso_data, #VDSO_WTM_CLK_SEC] seqcnt_check fail=monotonic @@ -233,6 +245,28 @@ monotonic: add_ts sec=x10, nsec=x11, ts_sec=x3, ts_nsec=x4, nsec_to_sec=x9 clock_gettime_return, shift=1 + ALIGN +monotonic_raw: + seqcnt_acquire + syscall_check fail=syscall + ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] + /* w11 = cs_raw_mult, w12 = cs_shift */ + ldp w12, w11, [vdso_data, #VDSO_CS_SHIFT] + ldp x13, x14, [vdso_data, #VDSO_RAW_TIME_SEC] + seqcnt_check fail=monotonic_raw + + /* All computations are done with left-shifted nsecs. */ + lsl x14, x14, x12 + get_nsec_per_sec res=x9 + lsl x9, x9, x12 + + get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + get_ts_clock_raw res_sec=x10, res_nsec=x11, \ + clock_nsec=x15, nsec_to_sec=x9 + + add_ts sec=x10, nsec=x11, ts_sec=x13, ts_nsec=x14, nsec_to_sec=x9 + clock_gettime_return, shift=1 + ALIGN realtime_coarse: seqcnt_acquire @@ -265,6 +299,7 @@ ENTRY(__kernel_clock_getres) .cfi_startproc cmp w0, #CLOCK_REALTIME ccmp w0, #CLOCK_MONOTONIC, #0x4, ne + ccmp w0, #CLOCK_MONOTONIC_RAW, #0x4, ne b.ne 1f ldr x2, 5f From b4b7a2466892929e5f72104089505f7b5bb7bfd6 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 15 Aug 2016 09:20:21 +0100 Subject: [PATCH 0730/3220] UPSTREAM: ARM: 8597/1: VDSO: put RO and RO after init objects into proper sections (cherry pick from commit 92bb8d5d55f7fe1a7a1201c42120c1611840807c) vdso_data_mapping is never modified, so mark it as const. vdso_total_pages, vdso_data_page, vdso_text_mapping and cntvct_ok are initialized by vdso_init(), thereafter are read only. The fact that they are read only after init makes them candidates for __ro_after_init declarations. 
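As a minimal sketch of the pattern (illustrative, not part of the patch): anything placed in the .data..ro_after_init section stays writable during boot and is write-protected once init completes on kernels that enforce read-only data, so a variable assigned exactly once by vdso_init() can trade __read_mostly for __ro_after_init:

	#include <linux/cache.h>	/* __ro_after_init */

	static bool cntvct_ok __ro_after_init;

	static int __init vdso_init(void)
	{
		cntvct_ok = cntvct_functional();	/* written once, at boot */
		return 0;
	}

	/* any store to cntvct_ok after init would now fault */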
Signed-off-by: Jisheng Zhang Reviewed-by: Kees Cook Acked-by: Nathan Lynch Signed-off-by: Russell King Bug: 20045882 Bug: 19198045 Change-Id: I0b2f9807d3cd66c5ef7457ac6fc180a41cbe05c6 --- arch/arm/kernel/vdso.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 994e971a8538..bbbffe946122 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -17,6 +17,7 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/cache.h> #include #include #include @@ -39,7 +40,7 @@ static struct page **vdso_text_pagelist; /* Total number of pages needed for the data and text portions of the VDSO. */ -unsigned int vdso_total_pages __read_mostly; +unsigned int vdso_total_pages __ro_after_init; /* * The VDSO data page. */ @@ -47,13 +48,13 @@ unsigned int vdso_total_pages __read_mostly; static union vdso_data_store vdso_data_store __page_aligned_data; static struct vdso_data *vdso_data = &vdso_data_store.data; -static struct page *vdso_data_page; -static struct vm_special_mapping vdso_data_mapping = { +static struct page *vdso_data_page __ro_after_init; +static const struct vm_special_mapping vdso_data_mapping = { .name = "[vvar]", .pages = &vdso_data_page, }; -static struct vm_special_mapping vdso_text_mapping = { +static struct vm_special_mapping vdso_text_mapping __ro_after_init = { .name = "[vdso]", }; @@ -67,7 +68,7 @@ struct elfinfo { /* Cached result of boot-time check for whether the arch timer exists, * and if so, whether the virtual counter is useable. */ -static bool cntvct_ok __read_mostly; +static bool cntvct_ok __ro_after_init; static bool __init cntvct_functional(void) { From 30e1bf349c52dd1dbb599562559b60dc7c702f3a Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 15 Aug 2016 14:45:44 +0800 Subject: [PATCH 0731/3220] UPSTREAM: arm64: vdso: add __init section marker to alloc_vectors_page (cherry pick from commit 1aed28f94ce6c1f6c24bcbbd5fcd749b55f65e9e) It is not needed after booting, so this patch moves the alloc_vectors_page function to the __init section. Acked-by: Mark Rutland Signed-off-by: Jisheng Zhang Signed-off-by: Will Deacon Bug: 20045882 Bug: 19198045 Change-Id: I1d7b89f9e20890b18e77c8752f1eef33d721ea81 --- arch/arm64/kernel/vdso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 54f7b327fd18..0949b4812887 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -55,7 +55,7 @@ struct vdso_data *vdso_data = &vdso_data_store.data; */ static struct page *vectors_page[1]; -static int alloc_vectors_page(void) +static int __init alloc_vectors_page(void) { extern char __kuser_helper_start[], __kuser_helper_end[]; extern char __aarch32_sigret_code_start[], __aarch32_sigret_code_end[]; From 0c61bc75ee662c79fc22f94979ca2cb6b5de0e54 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 15 Aug 2016 14:45:45 +0800 Subject: [PATCH 0732/3220] UPSTREAM: arm64: vdso: constify vm_special_mapping used for aarch32 vectors page (cherry pick from commit b6d081bddf397026575a437b603b118dff2606ff) The vm_special_mapping spec which is used for the aarch32 vectors page is never modified, so mark it as const.
Acked-by: Mark Rutland Signed-off-by: Jisheng Zhang Signed-off-by: Will Deacon Bug: 20045882 Bug: 19198045 Change-Id: If1afd11bfeefee45a2b96634b18143937a773a14 --- arch/arm64/kernel/vdso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 0949b4812887..3b8acfae7797 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -88,7 +88,7 @@ int aarch32_setup_vectors_page(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr = AARCH32_VECTORS_BASE; - static struct vm_special_mapping spec = { + static const struct vm_special_mapping spec = { .name = "[vectors]", .pages = vectors_page, From a08cafa7e09cb5450a2596e6ea15bdea976d8d59 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Tue, 28 Mar 2017 13:30:18 -0700 Subject: [PATCH 0733/3220] ANDROID: ARM64: Allow to choose appended kernel image By default appended kernel image is Image.gz-dtb. New config option BUILD_ARM64_APPENDED_KERNEL_IMAGE_NAME allows to choose between Image.gz-dtb and Image-dtb. Change-Id: I1c71b85136f1beeb61782e4646820718c1ccd7e4 Signed-off-by: Dmitry Shmidt --- arch/arm64/Kconfig | 20 ++++++++++++++++++++ arch/arm64/Makefile | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 77bea9751a31..35be8566140e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -930,6 +930,26 @@ config BUILD_ARM64_APPENDED_DTB_IMAGE DTBs to be built by default (instead of a standalone Image.gz.) The image will built in arch/arm64/boot/Image.gz-dtb +choice + prompt "Appended DTB Kernel Image name" + depends on BUILD_ARM64_APPENDED_DTB_IMAGE + help + Enabling this option will cause a specific kernel image Image or + Image.gz to be used for final image creation. + The image will built in arch/arm64/boot/IMAGE-NAME-dtb + + config IMG_GZ_DTB + bool "Image.gz-dtb" + config IMG_DTB + bool "Image-dtb" +endchoice + +config BUILD_ARM64_APPENDED_KERNEL_IMAGE_NAME + string + depends on BUILD_ARM64_APPENDED_DTB_IMAGE + default "Image.gz-dtb" if IMG_GZ_DTB + default "Image-dtb" if IMG_DTB + config BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES string "Default dtb names" depends on BUILD_ARM64_APPENDED_DTB_IMAGE diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 5820cff5f843..ba55698f71e8 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -85,7 +85,7 @@ core-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a # Default target when executing plain make ifeq ($(CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE),y) -KBUILD_IMAGE := Image.gz-dtb +KBUILD_IMAGE := $(subst $\",,$(CONFIG_BUILD_ARM64_APPENDED_KERNEL_IMAGE_NAME)) else KBUILD_IMAGE := Image.gz endif From 55c20761453ed9b2d22f5c5101c3833dc1a7159a Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 11 Oct 2016 13:54:38 -0700 Subject: [PATCH 0734/3220] config: android: move device mapper options to recommended CONFIG_MD is in recommended, but other dependent options like DM_CRYPT and DM_VERITY options are in base. The result is the options in base don't get enabled when applying both base and recommended fragments. Move all the options to recommended. 
Link: http://lkml.kernel.org/r/20160908185934.18098-1-robh@kernel.org Signed-off-by: Rob Herring Acked-by: John Stultz Cc: Amit Pundir Cc: Dmitry Shmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit f023a3956f273859ed36f624f75a66c272124b16) Signed-off-by: Greg Kroah-Hartman --- android/configs/android-base.cfg | 4 ---- android/configs/android-recommended.cfg | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index d1f9628e4377..59d4908b3343 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -13,7 +13,6 @@ CONFIG_ANDROID_LOW_MEMORY_KILLER=y CONFIG_ARMV8_DEPRECATED=y CONFIG_ASHMEM=y CONFIG_AUDIT=y -CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_INITRD=y CONFIG_CGROUPS=y CONFIG_CGROUP_CPUACCT=y @@ -21,9 +20,6 @@ CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_CP15_BARRIER_EMULATION=y -CONFIG_DM_CRYPT=y -CONFIG_DM_VERITY=y -CONFIG_DM_VERITY_FEC=y CONFIG_EMBEDDED=y CONFIG_FB=y CONFIG_HARDENED_USERCOPY=y diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index 28610303db60..e346b378dd64 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -10,6 +10,7 @@ CONFIG_ANDROID_TIMED_GPIO=y CONFIG_ARM_KERNMEM_PERMS=y CONFIG_ARM64_SW_TTBR0_PAN=y CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 @@ -17,7 +18,10 @@ CONFIG_CC_STACKPROTECTOR_STRONG=y CONFIG_COMPACTION=y CONFIG_CPU_SW_DOMAIN_PAN=y CONFIG_DEBUG_RODATA=y +CONFIG_DM_CRYPT=y CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y CONFIG_DRAGONRISE_FF=y CONFIG_ENABLE_DEFAULT_TRACERS=y CONFIG_EXT4_FS=y From 312b1f89aadbca0cc46f4cabad16f6b00f27c402 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 11 Oct 2016 13:54:41 -0700 Subject: [PATCH 0735/3220] UPSTREAM: config: android: set SELinux as default security mode Android won't boot without SELinux enabled, so make it the default. 
Link: http://lkml.kernel.org/r/20160908185934.18098-2-robh@kernel.org Signed-off-by: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit d90ae51a3e7556c9f50431db43cd8190934ccf94) Signed-off-by: Greg Kroah-Hartman --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 59d4908b3343..3672bf339006 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -20,6 +20,7 @@ CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_CP15_BARRIER_EMULATION=y +CONFIG_DEFAULT_SECURITY_SELINUX=y CONFIG_EMBEDDED=y CONFIG_FB=y CONFIG_HARDENED_USERCOPY=y From b6cf5c77e06e10cb93e768f96893007cf54a77ae Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 11 Oct 2016 13:54:36 -0700 Subject: [PATCH 0736/3220] UPSTREAM: config/android: Remove CONFIG_IPV6_PRIVACY Option is long gone, see commit 5d9efa7ee99e ("ipv6: Remove privacy config option.") Link: http://lkml.kernel.org/r/20160811170340.9859-1-bp@alien8.de Signed-off-by: Borislav Petkov Cc: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit a2c6a235dbf4318fc7f7981932478e6c47f093ab) Signed-off-by: Greg Kroah-Hartman --- android/configs/android-base.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 3672bf339006..1ff32af855c3 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -41,7 +41,6 @@ CONFIG_IPV6=y CONFIG_IPV6_MIP6=y CONFIG_IPV6_MULTIPLE_TABLES=y CONFIG_IPV6_OPTIMISTIC_DAD=y -CONFIG_IPV6_PRIVACY=y CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y CONFIG_IP_ADVANCED_ROUTER=y From 635194062b5eb5248e7bf98ddbacf63c2ec67119 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 1 Apr 2017 19:08:12 +0200 Subject: [PATCH 0737/3220] ANDROID: sort android-recommended.cfg It got out-of-order, so resort it to make it easier to sync with other trees. Signed-off-by: Greg Kroah-Hartman --- android/configs/android-recommended.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index e346b378dd64..eecf8d80453a 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -7,8 +7,8 @@ # CONFIG_PM_WAKELOCKS_GC is not set # CONFIG_VT is not set CONFIG_ANDROID_TIMED_GPIO=y -CONFIG_ARM_KERNMEM_PERMS=y CONFIG_ARM64_SW_TTBR0_PAN=y +CONFIG_ARM_KERNMEM_PERMS=y CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_LOOP=y @@ -97,6 +97,7 @@ CONFIG_LOGIRUMBLEPAD2_FF=y CONFIG_LOGITECH_FF=y CONFIG_MD=y CONFIG_MEDIA_SUPPORT=y +CONFIG_MEMORY_STATE_TIME=y CONFIG_MSDOS_FS=y CONFIG_PANIC_TIMEOUT=5 CONFIG_PANTHERLORD_FF=y @@ -126,7 +127,6 @@ CONFIG_TIMER_STATS=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_UHID=y -CONFIG_MEMORY_STATE_TIME=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_HIDDEV=y From 025b221d5309fe93280e765cf16ad4c8d2ce76c4 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 31 Mar 2017 09:33:33 -0700 Subject: [PATCH 0738/3220] ANDROID: binder: add hwbinder,vndbinder to BINDER_DEVICES. These will be required going forward. 
Change-Id: Idf0593461cef88051564ae0df495c156e31048c4 Signed-off-by: Martijn Coenen --- android/configs/android-base.cfg | 1 + drivers/android/Kconfig | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 1ff32af855c3..ece06816e046 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -9,6 +9,7 @@ # CONFIG_USELIB is not set CONFIG_ANDROID=y CONFIG_ANDROID_BINDER_IPC=y +CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder CONFIG_ANDROID_LOW_MEMORY_KILLER=y CONFIG_ARMV8_DEPRECATED=y CONFIG_ASHMEM=y diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index a82fc022d34b..4d4cdc1a6e25 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -22,7 +22,7 @@ config ANDROID_BINDER_IPC config ANDROID_BINDER_DEVICES string "Android Binder devices" depends on ANDROID_BINDER_IPC - default "binder" + default "binder,hwbinder,vndbinder" ---help--- Default value for the binder.devices parameter. From 21afcf6ff8c745ab9c551f113eeaacff3f41b515 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 4 Apr 2017 19:50:20 +0200 Subject: [PATCH 0739/3220] ANDROID: android-base.cfg: properly sort the file It somehow got out of alphabetical order, fix it to make merges and testing easier. Signed-off-by: Greg Kroah-Hartman --- android/configs/android-base.cfg | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index ece06816e046..a1969d0d7926 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -8,8 +8,8 @@ # CONFIG_SYSVIPC is not set # CONFIG_USELIB is not set CONFIG_ANDROID=y -CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder +CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_LOW_MEMORY_KILLER=y CONFIG_ARMV8_DEPRECATED=y CONFIG_ASHMEM=y @@ -140,9 +140,9 @@ CONFIG_PREEMPT=y CONFIG_PROFILING=y CONFIG_QFMT_V2=y CONFIG_QUOTA=y +CONFIG_QUOTACTL=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_QUOTA_TREE=y -CONFIG_QUOTACTL=y CONFIG_RANDOMIZE_BASE=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y @@ -158,14 +158,14 @@ CONFIG_SYNC=y CONFIG_TUN=y CONFIG_UID_SYS_STATS=y CONFIG_UNIX=y -CONFIG_USB_GADGET=y CONFIG_USB_CONFIGFS=y -CONFIG_USB_CONFIGFS_F_FS=y -CONFIG_USB_CONFIGFS_F_MTP=y -CONFIG_USB_CONFIGFS_F_PTP=y CONFIG_USB_CONFIGFS_F_ACC=y CONFIG_USB_CONFIGFS_F_AUDIO_SRC=y -CONFIG_USB_CONFIGFS_UEVENT=y +CONFIG_USB_CONFIGFS_F_FS=y CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_CONFIGFS_F_MTP=y +CONFIG_USB_CONFIGFS_F_PTP=y +CONFIG_USB_CONFIGFS_UEVENT=y +CONFIG_USB_GADGET=y CONFIG_USB_OTG_WAKELOCK=y CONFIG_XFRM_USER=y From 8588d88f1a37342dc3e9c0000cdc45792e237f3e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 4 Apr 2017 19:45:38 +0200 Subject: [PATCH 0740/3220] ANDROID: android-base.cfg: add CONFIG_IKCONFIG option This adds CONFIG_IKCONFIG and CONFIG_IKCONFIG_PROC options, which are a requirement for the O release. 
Bug: 35803310 Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 7d9280f579ff0731facb1e10f32e4a88a07f33f8) --- android/configs/android-base.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index a1969d0d7926..7d4ce82f5b8e 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -26,6 +26,8 @@ CONFIG_EMBEDDED=y CONFIG_FB=y CONFIG_HARDENED_USERCOPY=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y CONFIG_INET6_AH=y CONFIG_INET6_ESP=y CONFIG_INET6_IPCOMP=y From 10610ac9af3a9645ad1c7ea876f8859dbb650251 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 4 Apr 2017 19:46:59 +0200 Subject: [PATCH 0741/3220] ANDROID: android-base.cfg: add CONFIG_MODULES option This adds CONFIG_MODULES, CONFIG_MODULE_UNLOAD, and CONFIG_MODVERSIONS which are required by the O release. Bug: 35803310 Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 56f22e654a311f3c2492b8b3609916265fe34e20) --- android/configs/android-base.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 7d4ce82f5b8e..59c4f3a0df19 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -65,6 +65,9 @@ CONFIG_IP_NF_TARGET_MASQUERADE=y CONFIG_IP_NF_TARGET_NETMAP=y CONFIG_IP_NF_TARGET_REDIRECT=y CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODVERSIONS=y CONFIG_NET=y CONFIG_NETDEVICES=y CONFIG_NETFILTER=y From 31f9ce371384a31cac09e3b659d119716e1632e6 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Mon, 16 Jan 2017 17:35:52 +0900 Subject: [PATCH 0742/3220] android: base-cfg: enable CONFIG_INET_DIAG_DESTROY As of Android N, this is required to close sockets when a network disconnects. Change-Id: I9fe81c5fc5224c17bfd8d9e236ea9e436b5971cb (cherry picked from commit 4a15cee4bdaf764756e98cd8f03784f330459ab1) Signed-off-by: Greg Kroah-Hartman --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 59c4f3a0df19..30e01074f64a 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -29,6 +29,7 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_INET6_AH=y +CONFIG_INET6_DIAG_DESTROY=y CONFIG_INET6_ESP=y CONFIG_INET6_IPCOMP=y CONFIG_INET=y From e246a2f11fcca7702dbf526da235d37c828742ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 27 Sep 2016 23:57:58 -0700 Subject: [PATCH 0743/3220] UPSTREAM: ipv6 addrconf: implement RFC7559 router solicitation backoff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This implements: https://tools.ietf.org/html/rfc7559 Backoff is performed according to RFC3315 section 14: https://tools.ietf.org/html/rfc3315#section-14 We allow setting /proc/sys/net/ipv6/conf/*/router_solicitations to a negative value meaning an unlimited number of retransmits, and we make this the new default (inline with the RFC). We also add a new setting: /proc/sys/net/ipv6/conf/*/router_solicitation_max_interval defaulting to 1 hour (per RFC recommendation). Signed-off-by: Maciej Żenczykowski Acked-by: Erik Kline Signed-off-by: David S. 
Miller (cherry picked from commit bd11f0741fa5a2c296629898ad07759dd12b35bb in DaveM's net-next/master, should make Linus' tree in 4.9-rc1) Change-Id: Ia32cdc5c61481893ef8040734e014bf2229fc39e --- include/linux/ipv6.h | 1 + include/net/addrconf.h | 3 ++- include/net/if_inet6.h | 1 + net/ipv6/addrconf.c | 51 ++++++++++++++++++++++++++++++++++++------ 4 files changed, 48 insertions(+), 8 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 1182f0e21697..a0fc3cf932af 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -18,6 +18,7 @@ struct ipv6_devconf { __s32 dad_transmits; __s32 rtr_solicits; __s32 rtr_solicit_interval; + __s32 rtr_solicit_max_interval; __s32 rtr_solicit_delay; __s32 force_mld_version; __s32 mldv1_unsolicited_report_interval; diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 3275ddf9f00d..d540657819ef 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -1,8 +1,9 @@ #ifndef _ADDRCONF_H #define _ADDRCONF_H -#define MAX_RTR_SOLICITATIONS 3 +#define MAX_RTR_SOLICITATIONS -1 /* unlimited */ #define RTR_SOLICITATION_INTERVAL (4*HZ) +#define RTR_SOLICITATION_MAX_INTERVAL (3600*HZ) /* 1 hour */ #define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 1c8b6820b694..515352c6280a 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -201,6 +201,7 @@ struct inet6_dev { struct ipv6_devstat stats; struct timer_list rs_timer; + __s32 rs_interval; /* in jiffies */ __u8 rs_probes; __u8 addr_gen_mode; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 860ffbe778cf..a942a18b7943 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -112,6 +112,27 @@ static inline u32 cstamp_delta(unsigned long cstamp) return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; } +static inline s32 rfc3315_s14_backoff_init(s32 irt) +{ + /* multiply 'initial retransmission time' by 0.9 .. 1.1 */ + u64 tmp = (900000 + prandom_u32() % 200001) * (u64)irt; + do_div(tmp, 1000000); + return (s32)tmp; +} + +static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt) +{ + /* multiply 'retransmission timeout' by 1.9 .. 2.1 */ + u64 tmp = (1900000 + prandom_u32() % 200001) * (u64)rt; + do_div(tmp, 1000000); + if ((s32)tmp > mrt) { + /* multiply 'maximum retransmission time' by 0.9 .. 
1.1 */ + tmp = (900000 + prandom_u32() % 200001) * (u64)mrt; + do_div(tmp, 1000000); + } + return (s32)tmp; +} + #ifdef CONFIG_SYSCTL static int addrconf_sysctl_register(struct inet6_dev *idev); static void addrconf_sysctl_unregister(struct inet6_dev *idev); @@ -187,6 +208,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL, .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, @@ -233,6 +255,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL, .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, @@ -3487,7 +3510,7 @@ static void addrconf_rs_timer(unsigned long data) if (idev->if_flags & IF_RA_RCVD) goto out; - if (idev->rs_probes++ < idev->cnf.rtr_solicits) { + if (idev->rs_probes++ < idev->cnf.rtr_solicits || idev->cnf.rtr_solicits < 0) { write_unlock(&idev->lock); if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE)) ndisc_send_rs(dev, &lladdr, @@ -3496,11 +3519,13 @@ static void addrconf_rs_timer(unsigned long data) goto put; write_lock(&idev->lock); + idev->rs_interval = rfc3315_s14_backoff_update( + idev->rs_interval, idev->cnf.rtr_solicit_max_interval); /* The wait after the last probe can be shorter */ addrconf_mod_rs_timer(idev, (idev->rs_probes == idev->cnf.rtr_solicits) ? idev->cnf.rtr_solicit_delay : - idev->cnf.rtr_solicit_interval); + idev->rs_interval); } else { /* * Note: we do not support deprecated "all on-link" @@ -3728,7 +3753,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp); send_rs = send_mld && ipv6_accept_ra(ifp->idev) && - ifp->idev->cnf.rtr_solicits > 0 && + ifp->idev->cnf.rtr_solicits != 0 && (dev->flags&IFF_LOOPBACK) == 0; read_unlock_bh(&ifp->idev->lock); @@ -3750,10 +3775,11 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) write_lock_bh(&ifp->idev->lock); spin_lock(&ifp->lock); + ifp->idev->rs_interval = rfc3315_s14_backoff_init( + ifp->idev->cnf.rtr_solicit_interval); ifp->idev->rs_probes = 1; ifp->idev->if_flags |= IF_RS_SENT; - addrconf_mod_rs_timer(ifp->idev, - ifp->idev->cnf.rtr_solicit_interval); + addrconf_mod_rs_timer(ifp->idev, ifp->idev->rs_interval); spin_unlock(&ifp->lock); write_unlock_bh(&ifp->idev->lock); } @@ -4670,6 +4696,8 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits; array[DEVCONF_RTR_SOLICIT_INTERVAL] = jiffies_to_msecs(cnf->rtr_solicit_interval); + array[DEVCONF_RTR_SOLICIT_MAX_INTERVAL] = + jiffies_to_msecs(cnf->rtr_solicit_max_interval); array[DEVCONF_RTR_SOLICIT_DELAY] = jiffies_to_msecs(cnf->rtr_solicit_delay); array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; @@ -4879,7 +4907,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token) return -EINVAL; if (!ipv6_accept_ra(idev)) return -EINVAL; - if (idev->cnf.rtr_solicits <= 0) + if (idev->cnf.rtr_solicits == 0) return -EINVAL; write_lock_bh(&idev->lock); @@ -4904,8 +4932,10 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token) if (update_rs) { idev->if_flags |= IF_RS_SENT; + idev->rs_interval = 
rfc3315_s14_backoff_init( + idev->cnf.rtr_solicit_interval); idev->rs_probes = 1; - addrconf_mod_rs_timer(idev, idev->cnf.rtr_solicit_interval); + addrconf_mod_rs_timer(idev, idev->rs_interval); } /* Well, that's kinda nasty ... */ @@ -5542,6 +5572,13 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "router_solicitation_max_interval", + .data = &ipv6_devconf.rtr_solicit_max_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, { .procname = "router_solicitation_delay", .data = &ipv6_devconf.rtr_solicit_delay, From 3a75d7a94709a4cf17a059d669d27bc82bb6bf9b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 31 Mar 2017 17:34:55 +0200 Subject: [PATCH 0744/3220] Merge 4.4.59 into android-4.4 Changes in 4.4.59: xfrm: policy: init locks early xfrm_user: validate XFRM_MSG_NEWAE XFRMA_REPLAY_ESN_VAL replay_window xfrm_user: validate XFRM_MSG_NEWAE incoming ESN size harder virtio_balloon: init 1st buffer in stats vq pinctrl: qcom: Don't clear status bit on irq_unmask c6x/ptrace: Remove useless PTRACE_SETREGSET implementation h8300/ptrace: Fix incorrect register transfer count mips/ptrace: Preserve previous registers for short regset write sparc/ptrace: Preserve previous registers for short regset write metag/ptrace: Preserve previous registers for short regset write metag/ptrace: Provide default TXSTATUS for short NT_PRSTATUS metag/ptrace: Reject partial NT_METAG_RPIPE writes fscrypt: remove broken support for detecting keyring key revocation sched/rt: Add a missing rescheduling point Linux 4.4.59 Change-Id: Ifa35307b133cbf29d0a0084bb78a7b0436182b53 Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- arch/c6x/kernel/ptrace.c | 41 ------------------------------ arch/h8300/kernel/ptrace.c | 8 +++--- arch/metag/kernel/ptrace.c | 19 +++++++++++--- arch/mips/kernel/ptrace.c | 3 ++- arch/sparc/kernel/ptrace_64.c | 2 +- drivers/pinctrl/qcom/pinctrl-msm.c | 4 --- drivers/virtio/virtio_balloon.c | 2 ++ fs/ext4/crypto_key.c | 28 +++++--------------- fs/ext4/ext4.h | 14 +--------- fs/ext4/ext4_crypto.h | 1 - fs/f2fs/crypto_key.c | 28 +++++--------------- fs/f2fs/f2fs.h | 14 +--------- fs/f2fs/f2fs_crypto.h | 1 - kernel/sched/deadline.c | 3 +-- kernel/sched/rt.c | 3 +-- net/xfrm/xfrm_policy.c | 10 ++++---- net/xfrm/xfrm_user.c | 9 ++++++- 18 files changed, 57 insertions(+), 135 deletions(-) diff --git a/Makefile b/Makefile index 6450e18ebd7a..fe17b173d773 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 58 +SUBLEVEL = 59 EXTRAVERSION = NAME = Blurry Fish Butt diff --git a/arch/c6x/kernel/ptrace.c b/arch/c6x/kernel/ptrace.c index 3c494e84444d..a511ac16a8e3 100644 --- a/arch/c6x/kernel/ptrace.c +++ b/arch/c6x/kernel/ptrace.c @@ -69,46 +69,6 @@ static int gpr_get(struct task_struct *target, 0, sizeof(*regs)); } -static int gpr_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int ret; - struct pt_regs *regs = task_pt_regs(target); - - /* Don't copyin TSR or CSR */ - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &regs, - 0, PT_TSR * sizeof(long)); - if (ret) - return ret; - - ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, - PT_TSR * sizeof(long), - (PT_TSR + 1) * sizeof(long)); - if (ret) - return ret; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &regs, - (PT_TSR + 1) * sizeof(long), - PT_CSR * sizeof(long)); - if (ret)
- return ret; - - ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, - PT_CSR * sizeof(long), - (PT_CSR + 1) * sizeof(long)); - if (ret) - return ret; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &regs, - (PT_CSR + 1) * sizeof(long), -1); - return ret; -} - enum c6x_regset { REGSET_GPR, }; @@ -120,7 +80,6 @@ static const struct user_regset c6x_regsets[] = { .size = sizeof(u32), .align = sizeof(u32), .get = gpr_get, - .set = gpr_set }, }; diff --git a/arch/h8300/kernel/ptrace.c b/arch/h8300/kernel/ptrace.c index 92075544a19a..0dc1c8f622bc 100644 --- a/arch/h8300/kernel/ptrace.c +++ b/arch/h8300/kernel/ptrace.c @@ -95,7 +95,8 @@ static int regs_get(struct task_struct *target, long *reg = (long *)&regs; /* build user regs in buffer */ - for (r = 0; r < ARRAY_SIZE(register_offset); r++) + BUILD_BUG_ON(sizeof(regs) % sizeof(long) != 0); + for (r = 0; r < sizeof(regs) / sizeof(long); r++) *reg++ = h8300_get_reg(target, r); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, @@ -113,7 +114,8 @@ static int regs_set(struct task_struct *target, long *reg; /* build user regs in buffer */ - for (reg = (long *)&regs, r = 0; r < ARRAY_SIZE(register_offset); r++) + BUILD_BUG_ON(sizeof(regs) % sizeof(long) != 0); + for (reg = (long *)&regs, r = 0; r < sizeof(regs) / sizeof(long); r++) *reg++ = h8300_get_reg(target, r); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, @@ -122,7 +124,7 @@ static int regs_set(struct task_struct *target, return ret; /* write back to pt_regs */ - for (reg = (long *)&regs, r = 0; r < ARRAY_SIZE(register_offset); r++) + for (reg = (long *)&regs, r = 0; r < sizeof(regs) / sizeof(long); r++) h8300_put_reg(target, r, *reg++); return 0; } diff --git a/arch/metag/kernel/ptrace.c b/arch/metag/kernel/ptrace.c index 7563628822bd..5e2dc7defd2c 100644 --- a/arch/metag/kernel/ptrace.c +++ b/arch/metag/kernel/ptrace.c @@ -24,6 +24,16 @@ * user_regset definitions.
*/ +static unsigned long user_txstatus(const struct pt_regs *regs) +{ + unsigned long data = (unsigned long)regs->ctx.Flags; + + if (regs->ctx.SaveMask & TBICTX_CBUF_BIT) + data |= USER_GP_REGS_STATUS_CATCH_BIT; + + return data; +} + int metag_gp_regs_copyout(const struct pt_regs *regs, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) @@ -62,9 +72,7 @@ int metag_gp_regs_copyout(const struct pt_regs *regs, if (ret) goto out; /* TXSTATUS */ - data = (unsigned long)regs->ctx.Flags; - if (regs->ctx.SaveMask & TBICTX_CBUF_BIT) - data |= USER_GP_REGS_STATUS_CATCH_BIT; + data = user_txstatus(regs); ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &data, 4*25, 4*26); if (ret) @@ -119,6 +127,7 @@ int metag_gp_regs_copyin(struct pt_regs *regs, if (ret) goto out; /* TXSTATUS */ + data = user_txstatus(regs); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &data, 4*25, 4*26); if (ret) @@ -244,6 +253,8 @@ int metag_rp_state_copyin(struct pt_regs *regs, unsigned long long *ptr; int ret, i; + if (count < 4*13) + return -EINVAL; /* Read the entire pipeline before making any changes */ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &rp, 0, 4*13); @@ -303,7 +314,7 @@ static int metag_tls_set(struct task_struct *target, const void *kbuf, const void __user *ubuf) { int ret; - void __user *tls; + void __user *tls = target->thread.tls_ptr; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &tls, 0, -1); if (ret) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 74d581569778..c95bf18260f8 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -485,7 +485,8 @@ static int fpr_set(struct task_struct *target, &target->thread.fpu, 0, sizeof(elf_fpregset_t)); - for (i = 0; i < NUM_FPU_REGS; i++) { + BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t)); + for (i = 0; i < NUM_FPU_REGS && count >= sizeof(elf_fpreg_t); i++) { err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpr_val, i * sizeof(elf_fpreg_t), (i + 1) * sizeof(elf_fpreg_t)); diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index 9ddc4928a089..c1566170964f 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -311,7 +311,7 @@ static int genregs64_set(struct task_struct *target, } if (!ret) { - unsigned long y; + unsigned long y = regs->y; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &y, diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c index 146264a41ec8..9736f9be5447 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm.c +++ b/drivers/pinctrl/qcom/pinctrl-msm.c @@ -597,10 +597,6 @@ static void msm_gpio_irq_unmask(struct irq_data *d) spin_lock_irqsave(&pctrl->lock, flags); - val = readl(pctrl->regs + g->intr_status_reg); - val &= ~BIT(g->intr_status_bit); - writel(val, pctrl->regs + g->intr_status_reg); - val = readl(pctrl->regs + g->intr_cfg_reg); val |= BIT(g->intr_enable_bit); writel(val, pctrl->regs + g->intr_cfg_reg); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 56f7e2521202..01d15dca940e 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -416,6 +416,8 @@ static int init_vqs(struct virtio_balloon *vb) * Prime this virtqueue with one buffer so the hypervisor can * use it to signal us later (it can't be broken yet!). 
*/ + update_balloon_stats(vb); + sg_init_one(&sg, vb->stats, sizeof vb->stats); if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL) < 0) diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 98d58c8ee804..22096e31a720 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -173,8 +173,6 @@ void ext4_free_crypt_info(struct ext4_crypt_info *ci) if (!ci) return; - if (ci->ci_keyring_key) - key_put(ci->ci_keyring_key); crypto_free_ablkcipher(ci->ci_ctfm); kmem_cache_free(ext4_crypt_info_cachep, ci); } @@ -196,7 +194,7 @@ void ext4_free_encryption_info(struct inode *inode, ext4_free_crypt_info(ci); } -int _ext4_get_encryption_info(struct inode *inode) +int ext4_get_encryption_info(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_crypt_info *crypt_info; @@ -213,22 +211,15 @@ int _ext4_get_encryption_info(struct inode *inode) char mode; int res; + if (ei->i_crypt_info) + return 0; + if (!ext4_read_workqueue) { res = ext4_init_crypto(); if (res) return res; } -retry: - crypt_info = ACCESS_ONCE(ei->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - ext4_free_encryption_info(inode, crypt_info); - goto retry; - } - res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx)); @@ -251,7 +242,6 @@ retry: crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); if (S_ISREG(inode->i_mode)) @@ -294,7 +284,6 @@ retry: keyring_key = NULL; goto out; } - crypt_info->ci_keyring_key = keyring_key; if (keyring_key->type != &key_type_logon) { printk_once(KERN_WARNING "ext4: key type must be logon\n"); @@ -340,16 +329,13 @@ got_key: ext4_encryption_key_size(mode)); if (res) goto out; - memzero_explicit(raw_key, sizeof(raw_key)); - if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) != NULL) { - ext4_free_crypt_info(crypt_info); - goto retry; - } - return 0; + if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) == NULL) + crypt_info = NULL; out: if (res == -ENOKEY) res = 0; + key_put(keyring_key); ext4_free_crypt_info(crypt_info); memzero_explicit(raw_key, sizeof(raw_key)); return res; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8be0e4d330ab..ac720a31e4ff 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2331,23 +2331,11 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } /* crypto_key.c */ void ext4_free_crypt_info(struct ext4_crypt_info *ci); void ext4_free_encryption_info(struct inode *inode, struct ext4_crypt_info *ci); -int _ext4_get_encryption_info(struct inode *inode); #ifdef CONFIG_EXT4_FS_ENCRYPTION int ext4_has_encryption_key(struct inode *inode); -static inline int ext4_get_encryption_info(struct inode *inode) -{ - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - - if (!ci || - (ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD))))) - return _ext4_get_encryption_info(inode); - return 0; -} +int ext4_get_encryption_info(struct inode *inode); static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) { diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index 5a1684bd083b..e52637d969db 100644 --- a/fs/ext4/ext4_crypto.h +++ 
b/fs/ext4/ext4_crypto.h @@ -80,7 +80,6 @@ struct ext4_crypt_info { char ci_filename_mode; char ci_flags; struct crypto_ablkcipher *ci_ctfm; - struct key *ci_keyring_key; char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE]; }; diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c index 5de2d866a25c..18595d7a0efc 100644 --- a/fs/f2fs/crypto_key.c +++ b/fs/f2fs/crypto_key.c @@ -92,7 +92,6 @@ static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci) if (!ci) return; - key_put(ci->ci_keyring_key); crypto_free_ablkcipher(ci->ci_ctfm); kmem_cache_free(f2fs_crypt_info_cachep, ci); } @@ -113,7 +112,7 @@ void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci) f2fs_free_crypt_info(ci); } -int _f2fs_get_encryption_info(struct inode *inode) +int f2fs_get_encryption_info(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_crypt_info *crypt_info; @@ -129,18 +128,12 @@ int _f2fs_get_encryption_info(struct inode *inode) char mode; int res; + if (fi->i_crypt_info) + return 0; + res = f2fs_crypto_initialize(); if (res) return res; -retry: - crypt_info = ACCESS_ONCE(fi->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - f2fs_free_encryption_info(inode, crypt_info); - goto retry; - } res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, @@ -159,7 +152,6 @@ retry: crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); if (S_ISREG(inode->i_mode)) @@ -197,7 +189,6 @@ retry: keyring_key = NULL; goto out; } - crypt_info->ci_keyring_key = keyring_key; BUG_ON(keyring_key->type != &key_type_logon); ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct f2fs_encryption_key)) { @@ -230,17 +221,12 @@ retry: if (res) goto out; - memzero_explicit(raw_key, sizeof(raw_key)); - if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) != NULL) { - f2fs_free_crypt_info(crypt_info); - goto retry; - } - return 0; - + if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) == NULL) + crypt_info = NULL; out: if (res == -ENOKEY && !S_ISREG(inode->i_mode)) res = 0; - + key_put(keyring_key); f2fs_free_crypt_info(crypt_info); memzero_explicit(raw_key, sizeof(raw_key)); return res; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9db5500d63d9..b1aeca83f4be 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2149,7 +2149,6 @@ void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *, struct bio *); /* crypto_key.c */ void f2fs_free_encryption_info(struct inode *, struct f2fs_crypt_info *); -int _f2fs_get_encryption_info(struct inode *inode); /* crypto_fname.c */ bool f2fs_valid_filenames_enc_mode(uint32_t); @@ -2170,18 +2169,7 @@ void f2fs_exit_crypto(void); int f2fs_has_encryption_key(struct inode *); -static inline int f2fs_get_encryption_info(struct inode *inode) -{ - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - - if (!ci || - (ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD))))) - return _f2fs_get_encryption_info(inode); - return 0; -} +int f2fs_get_encryption_info(struct inode *inode); void f2fs_fname_crypto_free_buffer(struct f2fs_str *); int f2fs_fname_setup_filename(struct inode *, const struct qstr *, diff --git a/fs/f2fs/f2fs_crypto.h 
b/fs/f2fs/f2fs_crypto.h index c2c1c2b63b25..f113f1a1e8c1 100644 --- a/fs/f2fs/f2fs_crypto.h +++ b/fs/f2fs/f2fs_crypto.h @@ -79,7 +79,6 @@ struct f2fs_crypt_info { char ci_filename_mode; char ci_flags; struct crypto_ablkcipher *ci_ctfm; - struct key *ci_keyring_key; char ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE]; }; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9d9eb50d4059..f10b1cb255b2 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1800,12 +1800,11 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) queue_push_tasks(rq); -#else +#endif if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); -#endif } } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8a16cba968c4..541b8494450e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2235,10 +2235,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) queue_push_tasks(rq); -#else +#endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio) resched_curr(rq); -#endif /* CONFIG_SMP */ } } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b5e665b3cfb0..36a50ef9295d 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3030,6 +3030,11 @@ static int __net_init xfrm_net_init(struct net *net) { int rv; + /* Initialize the per-net locks here */ + spin_lock_init(&net->xfrm.xfrm_state_lock); + rwlock_init(&net->xfrm.xfrm_policy_lock); + mutex_init(&net->xfrm.xfrm_cfg_mutex); + rv = xfrm_statistics_init(net); if (rv < 0) goto out_statistics; @@ -3046,11 +3051,6 @@ static int __net_init xfrm_net_init(struct net *net) if (rv < 0) goto out; - /* Initialize the per-net locks here */ - spin_lock_init(&net->xfrm.xfrm_state_lock); - rwlock_init(&net->xfrm.xfrm_policy_lock); - mutex_init(&net->xfrm.xfrm_cfg_mutex); - return 0; out: diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 805681a7d356..7a5a64e70b4d 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -412,7 +412,14 @@ static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_es up = nla_data(rp); ulen = xfrm_replay_state_esn_len(up); - if (nla_len(rp) < ulen || xfrm_replay_state_esn_len(replay_esn) != ulen) + /* Check the overall length and the internal bitmap length to avoid + * potential overflow. */ + if (nla_len(rp) < ulen || + xfrm_replay_state_esn_len(replay_esn) != ulen || + replay_esn->bmp_len != up->bmp_len) + return -EINVAL; + + if (up->replay_window > up->bmp_len * sizeof(__u32) * 8) return -EINVAL; return 0; From 5156f3e9f222f9dd9bce5981b67b1517aa5625c9 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 9 Mar 2017 21:14:45 -0800 Subject: [PATCH 0745/3220] ANDROID: sdcardfs: remove unnecessary call to do_munmap Adapted from wrapfs commit 5be6de9ecf02 ("Wrapfs: use vm_munmap in ->mmap") commit 2c9f6014a8bb ("Wrapfs: remove unnecessary call to vm_unmap in ->mmap") Code is unnecessary and causes deadlocks in newer kernels. 
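For context, a sketch of why the unmap call is both redundant and hazardous
here (the call chain is the generic 4.x mmap path; the deadlock reading
follows the two wrapfs commits cited above and is an inference, not a quote):

    vm_mmap_pgoff()
      down_write(&mm->mmap_sem)         /* held across everything below */
      do_mmap_pgoff() -> mmap_region()
        do_munmap(mm, addr, len)        /* any overlap already unmapped */
        file->f_op->mmap()              /* == sdcardfs_mmap() */
          vm_munmap()/do_munmap()       /* redundant; vm_munmap() would take
                                           mmap_sem again and self-deadlock */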
Signed-off-by: Erez Zadok Signed-off-by: Daniel Rosenberg Bug: 35766959 Change-Id: Ia252d60c60799d7e28fc5f1f0f5b5ec2430a2379 --- fs/sdcardfs/file.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index eee4eb5896e5..7e2c50b30782 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -176,12 +176,6 @@ static int sdcardfs_mmap(struct file *file, struct vm_area_struct *vma) goto out; } saved_vm_ops = vma->vm_ops; /* save: came from lower ->mmap */ - err = do_munmap(current->mm, vma->vm_start, - vma->vm_end - vma->vm_start); - if (err) { - pr_err("sdcardfs: do_munmap failed %d\n", err); - goto out; - } } /* From b2fc288c811eb828012b13f014aafd4d0e2c59b4 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 9 Mar 2017 21:24:58 -0800 Subject: [PATCH 0746/3220] ANDROID: sdcardfs: copy lower inode attributes in ->ioctl Adapted from wrapfs commit fbc9c6f83ea6 ("Wrapfs: copy lower inode attributes in ->ioctl") commit e97d8e26cc9e ("Wrapfs: use file_inode helper") Some ioctls (e.g., EXT2_IOC_SETFLAGS) can change inode attributes, so copy them from lower inode. Signed-off-by: Erez Zadok Signed-off-by: Daniel Rosenberg Bug: 35766959 Change-Id: I0f12684b9dbd4088b4a622c7ea9c03087f40e572 --- fs/sdcardfs/file.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 7e2c50b30782..41a550fec3f2 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -113,6 +113,10 @@ static long sdcardfs_unlocked_ioctl(struct file *file, unsigned int cmd, if (lower_file->f_op->unlocked_ioctl) err = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); + /* some ioctls can change inode attributes (EXT2_IOC_SETFLAGS) */ + if (!err) + sdcardfs_copy_and_fix_attrs(file_inode(file), + file_inode(lower_file)); out: return err; } From c4e9b94bcb00e00843c7907ea3656cc2dc34e385 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 9 Mar 2017 21:42:01 -0800 Subject: [PATCH 0747/3220] ANDROID: sdcardfs: fix ->llseek to update upper and lower offset Adapted from wrapfs commit 1d1d23a47baa ("Wrapfs: fix ->llseek to update upper and lower offsets") Fixes bug: xfstests generic/257. f_pos consistently is required by and only by dir_ops->wrapfs_readdir, main_ops is not affected. Signed-off-by: Erez Zadok Signed-off-by: Mengyang Li Signed-off-by: Daniel Rosenberg Bug: 35766959 Change-Id: I360a1368ac37ea8966910a58972b81504031d437 --- fs/sdcardfs/file.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 41a550fec3f2..f49df7ac370e 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -316,6 +316,29 @@ static int sdcardfs_fasync(int fd, struct file *file, int flag) return err; } +/* + * Sdcardfs cannot use generic_file_llseek as ->llseek, because it would + * only set the offset of the upper file. So we have to implement our + * own method to set both the upper and lower file offsets + * consistently. 
+ */
+static loff_t sdcardfs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+	int err;
+	struct file *lower_file;
+
+	err = generic_file_llseek(file, offset, whence);
+	if (err < 0)
+		goto out;
+
+	lower_file = sdcardfs_lower_file(file);
+	err = generic_file_llseek(lower_file, offset, whence);
+
+out:
+	return err;
+}
+
+
 const struct file_operations sdcardfs_main_fops = {
 	.llseek		= generic_file_llseek,
 	.read		= sdcardfs_read,
@@ -334,7 +357,7 @@ const struct file_operations sdcardfs_main_fops = {
 
 /* trimmed directory options */
 const struct file_operations sdcardfs_dir_fops = {
-	.llseek		= generic_file_llseek,
+	.llseek		= sdcardfs_file_llseek,
 	.read		= generic_read_dir,
 	.iterate	= sdcardfs_readdir,
 	.unlocked_ioctl	= sdcardfs_unlocked_ioctl,

From b2fc288c811eb828012b13f014aafd4d0e2c59b4 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Thu, 9 Mar 2017 21:48:09 -0800
Subject: [PATCH 0748/3220] ANDROID: sdcardfs: add read_iter/write_iter
 operations

Adapted from wrapfs
commit f398bf6a7377 ("Wrapfs: add read_iter/write_iter opeations")

Signed-off-by: Erez Zadok
Signed-off-by: Mengyang Li
Signed-off-by: Daniel Rosenberg
Bug: 35766959
Change-Id: I2b3de59c9682fc705bf21df0de6df81e76fd2e40
---
 fs/sdcardfs/file.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c
index f49df7ac370e..c0146e03fa2e 100644
--- a/fs/sdcardfs/file.c
+++ b/fs/sdcardfs/file.c
@@ -338,6 +338,52 @@ out:
 	return err;
 }
 
+/*
+ * Sdcardfs read_iter, redirect modified iocb to lower read_iter
+ */
+ssize_t sdcardfs_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	int err;
+	struct file *file = iocb->ki_filp, *lower_file;
+
+	lower_file = sdcardfs_lower_file(file);
+	if (!lower_file->f_op->read_iter) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	get_file(lower_file); /* prevent lower_file from being released */
+	iocb->ki_filp = lower_file;
+	err = lower_file->f_op->read_iter(iocb, iter);
+	/* ? wait IO finish to update atime as ecryptfs ? */
+	iocb->ki_filp = file;
+	fput(lower_file);
+out:
+	return err;
+}
+
+/*
+ * Sdcardfs write_iter, redirect modified iocb to lower write_iter
+ */
+ssize_t sdcardfs_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	int err;
+	struct file *file = iocb->ki_filp, *lower_file;
+
+	lower_file = sdcardfs_lower_file(file);
+	if (!lower_file->f_op->write_iter) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	get_file(lower_file); /* prevent lower_file from being released */
+	iocb->ki_filp = lower_file;
+	err = lower_file->f_op->write_iter(iocb, iter);
+	iocb->ki_filp = file;
+	fput(lower_file);
+out:
+	return err;
+}
 
 const struct file_operations sdcardfs_main_fops = {
 	.llseek		= generic_file_llseek,
@@ -353,6 +399,8 @@ const struct file_operations sdcardfs_main_fops = {
 	.release	= sdcardfs_file_release,
 	.fsync		= sdcardfs_fsync,
 	.fasync		= sdcardfs_fasync,
+	.read_iter	= sdcardfs_read_iter,
+	.write_iter	= sdcardfs_write_iter,
 };
 
 /* trimmed directory options */

From fb12388ce4461c3af91174a0c7e67a81f4c0a758 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Thu, 9 Mar 2017 22:11:08 -0800
Subject: [PATCH 0749/3220] ANDROID: sdcardfs: use d_splice_alias

Adapted from wrapfs commit 9671770ff8b9 ("Wrapfs: use d_splice_alias")

Refactor interpose code to allow lookup to use d_splice_alias.
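The practical difference from the previous d_add() is that d_splice_alias()
may hand back a preexisting alias for the inode instead of using the dentry
it was given, so every later step has to operate on whichever dentry is now
live. Condensed from the hunks below, the pattern is:

    ret_dentry = d_splice_alias(inode, dentry); /* NULL, an alias, or ERR_PTR */
    dentry = ret_dentry ?: dentry;              /* continue with the live one */
    update_derived_permission_lock(dentry);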
Signed-off-by: Erez Zadok Signed-off-by: Daniel Rosenberg Bug: 35766959 Change-Id: Icf51db8658202c48456724275b03dc77f73f585b --- fs/sdcardfs/lookup.c | 55 +++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index cfa2b12b6411..f10f7752f514 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -164,27 +164,25 @@ struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode, u } /* - * Connect a sdcardfs inode dentry/inode with several lower ones. This is - * the classic stackable file system "vnode interposition" action. - * - * @dentry: sdcardfs's dentry which interposes on lower one - * @sb: sdcardfs's super_block - * @lower_path: the lower path (caller does path_get/put) + * Helper interpose routine, called directly by ->lookup to handle + * spliced dentries. */ -int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb, - struct path *lower_path, userid_t id) +static struct dentry *__sdcardfs_interpose(struct dentry *dentry, + struct super_block *sb, + struct path *lower_path, + userid_t id) { - int err = 0; struct inode *inode; struct inode *lower_inode; struct super_block *lower_sb; + struct dentry *ret_dentry; lower_inode = d_inode(lower_path->dentry); lower_sb = sdcardfs_lower_super(sb); /* check that the lower file system didn't cross a mount point */ if (lower_inode->i_sb != lower_sb) { - err = -EXDEV; + ret_dentry = ERR_PTR(-EXDEV); goto out; } @@ -196,14 +194,32 @@ int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb, /* inherit lower inode number for sdcardfs's inode */ inode = sdcardfs_iget(sb, lower_inode, id); if (IS_ERR(inode)) { - err = PTR_ERR(inode); + ret_dentry = ERR_CAST(inode); goto out; } - d_add(dentry, inode); + ret_dentry = d_splice_alias(inode, dentry); + dentry = ret_dentry ?: dentry; update_derived_permission_lock(dentry); out: - return err; + return ret_dentry; +} + +/* + * Connect an sdcardfs inode dentry/inode with several lower ones. This is + * the classic stackable file system "vnode interposition" action. 
+ *
+ * @dentry: sdcardfs's dentry which interposes on lower one
+ * @sb: sdcardfs's super_block
+ * @lower_path: the lower path (caller does path_get/put)
+ */
+int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb,
+		       struct path *lower_path, userid_t id)
+{
+	struct dentry *ret_dentry;
+
+	ret_dentry = __sdcardfs_interpose(dentry, sb, lower_path, id);
+	return PTR_ERR(ret_dentry);
 }
 
 struct sdcardfs_name_data {
@@ -244,6 +260,7 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry,
 	const struct qstr *name;
 	struct path lower_path;
 	struct qstr dname;
+	struct dentry *ret_dentry = NULL;
 	struct sdcardfs_sb_info *sbi;
 
 	sbi = SDCARDFS_SB(dentry->d_sb);
@@ -330,9 +347,13 @@ put_name:
 		}
 
 		sdcardfs_set_lower_path(dentry, &lower_path);
-		err = sdcardfs_interpose(dentry, dentry->d_sb, &lower_path, id);
-		if (err) /* path_put underlying path on error */
+		ret_dentry =
+			__sdcardfs_interpose(dentry, dentry->d_sb, &lower_path, id);
+		if (IS_ERR(ret_dentry)) {
+			err = PTR_ERR(ret_dentry);
+			/* path_put underlying path on error */
 			sdcardfs_put_reset_lower_path(dentry);
+		}
 		goto out;
 	}
 
@@ -372,7 +393,9 @@ setup_lower:
 		err = 0;
 
 out:
-	return ERR_PTR(err);
+	if (err)
+		return ERR_PTR(err);
+	return ret_dentry;
 }
 
 /*

From 24c96f78361315e1f39d9bd9836ec2f3f94f19ee Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Thu, 9 Mar 2017 20:56:05 -0800
Subject: [PATCH 0750/3220] ANDROID: sdcardfs: update module info

Signed-off-by: Daniel Rosenberg
Change-Id: I958c7c226d4e9265fea8996803e5b004fb33d8ad
---
 fs/sdcardfs/main.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c
index 7344635522cd..953d2156d2e9 100644
--- a/fs/sdcardfs/main.c
+++ b/fs/sdcardfs/main.c
@@ -471,10 +471,15 @@ static void __exit exit_sdcardfs_fs(void)
 	pr_info("Completed sdcardfs module unload\n");
 }
 
-MODULE_AUTHOR("Erez Zadok, Filesystems and Storage Lab, Stony Brook University"
-	      " (http://www.fsl.cs.sunysb.edu/)");
-MODULE_DESCRIPTION("Wrapfs " SDCARDFS_VERSION
-		   " (http://wrapfs.filesystems.org/)");
+/* Original wrapfs authors */
+MODULE_AUTHOR("Erez Zadok, Filesystems and Storage Lab, Stony Brook University (http://www.fsl.cs.sunysb.edu/)");
+
+/* Original sdcardfs authors */
+MODULE_AUTHOR("Woojoong Lee, Daeho Jeong, Kitae Lee, Yeongjin Gil System Memory Lab., Samsung Electronics");
+
+/* Current maintainer */
+MODULE_AUTHOR("Daniel Rosenberg, Google");
+MODULE_DESCRIPTION("Sdcardfs " SDCARDFS_VERSION);
 MODULE_LICENSE("GPL");
 
 module_init(init_sdcardfs_fs);

From 6c8d409129bbebe36cde9f8e511011756216163a Mon Sep 17 00:00:00 2001
From: zhangshuxiao
Date: Wed, 8 Mar 2017 16:53:24 +0800
Subject: [PATCH 0751/3220] staging: android: ashmem: lseek failed due to no
 FMODE_LSEEK.

vfs_llseek() checks whether the file mode has FMODE_LSEEK and, if not,
returns failure. But ashmem files are seekable, so set FMODE_LSEEK on
the ashmem backing file.
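For context, vfs_llseek() in this era of fs/read_write.c behaves roughly as
follows (a paraphrased sketch, not a verbatim quote); with FMODE_LSEEK clear
it never consults f_op->llseek and fails with -ESPIPE via no_llseek():

    loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
    {
    	loff_t (*fn)(struct file *, loff_t, int);

    	fn = no_llseek;
    	if (file->f_mode & FMODE_LSEEK) {
    		if (file->f_op->llseek)
    			fn = file->f_op->llseek;
    	}
    	return fn(file, offset, whence);
    }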
Change-Id: Ia78ef4c7c96adb89d52e70b63f7c00636fe60d01 Signed-off-by: zhangshuxiao --- drivers/staging/android/ashmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index 3f1133230a1a..e4530ac6d5d4 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c @@ -392,6 +392,7 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma) ret = PTR_ERR(vmfile); goto out; } + vmfile->f_mode |= FMODE_LSEEK; asma->file = vmfile; } get_file(asma->file); From c71ad0f6b384393d99188ad333429c145f5b594a Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 16 Nov 2016 17:31:31 +0000 Subject: [PATCH 0752/3220] BACKPORT: arm64: dts: juno: fix cluster sleep state entry latency on all SoC versions The core and the cluster sleep state entry latencies can't be same as cluster sleep involves more work compared to core level e.g. shared cache maintenance. Experiments have shown on an average about 100us more latency for the cluster sleep state compared to the core level sleep. This patch fixes the entry latency for the cluster sleep state. Fixes: 28e10a8f3a03 ("arm64: dts: juno: Add idle-states to device tree") Cc: Lorenzo Pieralisi Cc: "Jon Medhurst (Tixy)" Reviewed-by: Liviu Dudau Signed-off-by: Sudeep Holla Signed-off-by: Arnd Bergmann Fixes: Change-Id: I7b2d81fa66f8ce8b229457cfefff06e9edd545c7 (arm64: dts: juno: Add idle-states to device tree) (cherry picked from commit 909e481e2467f202b97d42beef246e8829416a85) Signed-off-by: Amit Pundir --- arch/arm64/boot/dts/arm/juno-r1.dts | 2 +- arch/arm64/boot/dts/arm/juno.dts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/arm/juno-r1.dts b/arch/arm64/boot/dts/arm/juno-r1.dts index 8826f834f54f..29315af22147 100644 --- a/arch/arm64/boot/dts/arm/juno-r1.dts +++ b/arch/arm64/boot/dts/arm/juno-r1.dts @@ -76,7 +76,7 @@ compatible = "arm,idle-state"; arm,psci-suspend-param = <0x1010000>; local-timer-stop; - entry-latency-us = <300>; + entry-latency-us = <400>; exit-latency-us = <1200>; min-residency-us = <2500>; }; diff --git a/arch/arm64/boot/dts/arm/juno.dts b/arch/arm64/boot/dts/arm/juno.dts index f113a80519cd..a5004931e2f1 100644 --- a/arch/arm64/boot/dts/arm/juno.dts +++ b/arch/arm64/boot/dts/arm/juno.dts @@ -76,7 +76,7 @@ compatible = "arm,idle-state"; arm,psci-suspend-param = <0x1010000>; local-timer-stop; - entry-latency-us = <300>; + entry-latency-us = <400>; exit-latency-us = <1200>; min-residency-us = <2500>; }; From 6a3b9c4984f9edc5a136720e42f1d2ab387857a4 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 10 Apr 2017 18:35:25 +0000 Subject: [PATCH 0753/3220] Revert "Revert "CHROMIUM: android: binder: Fix potential scheduling-while-atomic"" This reverts commit 13c17d0179f9a055062f37e91a6f6cf00a249ebd. 
Change-Id: I8c3a7eefb72b85c0dd05996c2705636fcbc871f7 --- drivers/android/binder.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 9cf4f9bbc711..22025bcc38e9 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -417,6 +417,7 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) struct files_struct *files = proc->files; unsigned long rlim_cur; unsigned long irqs; + int ret; if (files == NULL) return -ESRCH; @@ -427,7 +428,11 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); unlock_task_sighand(proc->tsk, &irqs); - return __alloc_fd(files, 0, rlim_cur, flags); + preempt_enable_no_resched(); + ret = __alloc_fd(files, 0, rlim_cur, flags); + preempt_disable(); + + return ret; } /* @@ -436,8 +441,11 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) static void task_fd_install( struct binder_proc *proc, unsigned int fd, struct file *file) { - if (proc->files) + if (proc->files) { + preempt_enable_no_resched(); __fd_install(proc->files, fd, file); + preempt_disable(); + } } /* From a2d978c2adcb7f06b24c4491188d122623384690 Mon Sep 17 00:00:00 2001 From: Lianwei Wang Date: Sun, 19 Jun 2016 23:52:27 -0700 Subject: [PATCH 0754/3220] UPSTREAM: PM / sleep: make PM notifiers called symmetrically (cherry picked from commit ea00f4f4f00cc2bc3b63ad512a4e6df3b20832b9) This makes pm notifier PREPARE/POST symmetrical: if PREPARE fails, we will only undo what ever happened on PREPARE. It fixes the unbalanced CPU hotplug enable in CPU PM notifier. Change-Id: I01dce3cc95c5d6b8913b7b6be301f2909258c745 Signed-off-by: Lianwei Wang Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 20 ++++++++++++-------- kernel/power/main.c | 11 +++++++++-- kernel/power/power.h | 2 ++ kernel/power/suspend.c | 10 ++++++---- kernel/power/user.c | 14 ++++++++------ 5 files changed, 37 insertions(+), 20 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 3124cebaec31..797f19e2aaa9 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -647,7 +647,7 @@ static void power_down(void) */ int hibernate(void) { - int error; + int error, nr_calls = 0; if (!hibernation_available()) { pr_debug("PM: Hibernation not available.\n"); @@ -662,9 +662,11 @@ int hibernate(void) } pm_prepare_console(); - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (error) + error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; goto Exit; + } printk(KERN_INFO "PM: Syncing filesystems ... 
"); sys_sync(); @@ -714,7 +716,7 @@ int hibernate(void) /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; Exit: - pm_notifier_call_chain(PM_POST_HIBERNATION); + __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); pm_restore_console(); atomic_inc(&snapshot_device_available); Unlock: @@ -740,7 +742,7 @@ int hibernate(void) */ static int software_resume(void) { - int error; + int error, nr_calls = 0; unsigned int flags; /* @@ -827,9 +829,11 @@ static int software_resume(void) } pm_prepare_console(); - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (error) + error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; goto Close_Finish; + } pr_debug("PM: Preparing processes for restore.\n"); error = freeze_processes(); @@ -855,7 +859,7 @@ static int software_resume(void) unlock_device_hotplug(); thaw_processes(); Finish: - pm_notifier_call_chain(PM_POST_RESTORE); + __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); pm_restore_console(); atomic_inc(&snapshot_device_available); /* For success case, the suspend path will release the lock */ diff --git a/kernel/power/main.c b/kernel/power/main.c index b2dd4d999900..68c0eaae8034 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -38,12 +38,19 @@ int unregister_pm_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_pm_notifier); -int pm_notifier_call_chain(unsigned long val) +int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls) { - int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); + int ret; + + ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL, + nr_to_call, nr_calls); return notifier_to_errno(ret); } +int pm_notifier_call_chain(unsigned long val) +{ + return __pm_notifier_call_chain(val, -1, NULL); +} /* If set, devices may be suspended and resumed asynchronously. 
*/ int pm_async_enabled = 1; diff --git a/kernel/power/power.h b/kernel/power/power.h index caadb566e82b..436da5bc72f4 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -191,6 +191,8 @@ static inline void suspend_test_finish(const char *label) {} #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ +extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call, + int *nr_calls); extern int pm_notifier_call_chain(unsigned long val); #endif diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 024411816ccf..58209d8bfc56 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -268,16 +268,18 @@ static int suspend_test(int level) */ static int suspend_prepare(suspend_state_t state) { - int error; + int error, nr_calls = 0; if (!sleep_state_supported(state)) return -EPERM; pm_prepare_console(); - error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); - if (error) + error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; goto Finish; + } trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); @@ -288,7 +290,7 @@ static int suspend_prepare(suspend_state_t state) suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); Finish: - pm_notifier_call_chain(PM_POST_SUSPEND); + __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL); pm_restore_console(); return error; } diff --git a/kernel/power/user.c b/kernel/power/user.c index 526e8911460a..35310b627388 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -47,7 +47,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1); static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; - int error; + int error, nr_calls = 0; if (!hibernation_available()) return -EPERM; @@ -74,9 +74,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) swap_type_of(swsusp_resume_device, 0, NULL) : -1; data->mode = O_RDONLY; data->free_bitmaps = false; - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); if (error) - pm_notifier_call_chain(PM_POST_HIBERNATION); + __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL); } else { /* * Resuming. We may need to wait for the image device to @@ -86,13 +86,15 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = -1; data->mode = O_WRONLY; - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); if (!error) { error = create_basic_memory_bitmaps(); data->free_bitmaps = !error; - } + } else + nr_calls--; + if (error) - pm_notifier_call_chain(PM_POST_RESTORE); + __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); } if (error) atomic_inc(&snapshot_device_available); From 8eb24ae4dcd3e2b40a9b0f7ca585ff0dab2350c0 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 7 Apr 2017 15:22:19 -0700 Subject: [PATCH 0755/3220] UPSTREAM: checkpatch: special audit for revert commit line Currently checkpatch.pl does not recognize git's default commit revert message and will complain about the hash format. Add special audit for revert commit message line to fix it. 
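For reference, git's default revert message looks like this (the hash is a
made-up example); the new rule whitelists its second line, whose bare
40-character hash would otherwise trip the commit-description check patched
below:

    Revert "some subject line"

    This reverts commit 0123456789abcdef0123456789abcdef01234567.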
Signed-off-by: Wei Wang Acked-by: Joe Perches Bug: 37158168 Test: checkpatch.pl --patch [diff] and no longer see failure Change-Id: I65cf9a46874621dd6d5c349d2d3ca3b862d61ba3 --- scripts/checkpatch.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 2b3c22808c3b..e70147742cce 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2348,6 +2348,7 @@ sub process { # Check for git id commit length and improperly formed commit descriptions if ($in_commit_log && !$commit_log_possible_stack_dump && + $line !~ /^This reverts commit [0-9a-f]{7,40}/ && ($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i || ($line =~ /\b[0-9a-f]{12,40}\b/i && $line !~ /[\<\[][0-9a-f]{12,40}[\>\]]/i && From 2295052f9de4d8288015e7d97ab0363a71604e64 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 10 Apr 2017 20:54:30 -0700 Subject: [PATCH 0756/3220] ANDROID: sdcardfs: Directly pass lower file for mmap Instead of relying on a copy hack, pass the lower file as private data. This lets the kernel find the vma mapping for pages used by the file, allowing pages used by mapping to be reclaimed. This is adapted from following esdfs patches commit 0647e638d: ("esdfs: store lower file in vm_file for mmap") commit 064850866: ("esdfs: keep a counter for mmaped file") Change-Id: I75b74d1e5061db1b8c13be38d184e118c0851a1a Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/file.c | 3 +++ fs/sdcardfs/mmap.c | 57 ++++++++++++++++++---------------------------- 2 files changed, 25 insertions(+), 35 deletions(-) diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index c0146e03fa2e..1f6921e2ffbf 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -192,6 +192,9 @@ static int sdcardfs_mmap(struct file *file, struct vm_area_struct *vma) file->f_mapping->a_ops = &sdcardfs_aops; /* set our aops */ if (!SDCARDFS_F(file)->lower_vm_ops) /* save for our ->fault */ SDCARDFS_F(file)->lower_vm_ops = saved_vm_ops; + vma->vm_private_data = file; + get_file(lower_file); + vma->vm_file = lower_file; out: return err; diff --git a/fs/sdcardfs/mmap.c b/fs/sdcardfs/mmap.c index 0d4089c62c3a..b61f82275e7d 100644 --- a/fs/sdcardfs/mmap.c +++ b/fs/sdcardfs/mmap.c @@ -23,60 +23,45 @@ static int sdcardfs_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int err; - struct file *file, *lower_file; + struct file *file; const struct vm_operations_struct *lower_vm_ops; - struct vm_area_struct lower_vma; - memcpy(&lower_vma, vma, sizeof(struct vm_area_struct)); - file = lower_vma.vm_file; + file = (struct file *)vma->vm_private_data; lower_vm_ops = SDCARDFS_F(file)->lower_vm_ops; BUG_ON(!lower_vm_ops); - lower_file = sdcardfs_lower_file(file); - /* - * XXX: vm_ops->fault may be called in parallel. Because we have to - * resort to temporarily changing the vma->vm_file to point to the - * lower file, a concurrent invocation of sdcardfs_fault could see a - * different value. In this workaround, we keep a different copy of - * the vma structure in our stack, so we never expose a different - * value of the vma->vm_file called to us, even temporarily. A - * better fix would be to change the calling semantics of ->fault to - * take an explicit file pointer. 
- */ - lower_vma.vm_file = lower_file; - err = lower_vm_ops->fault(&lower_vma, vmf); + err = lower_vm_ops->fault(vma, vmf); return err; } +static void sdcardfs_vm_open(struct vm_area_struct *vma) +{ + struct file *file = (struct file *)vma->vm_private_data; + + get_file(file); +} + +static void sdcardfs_vm_close(struct vm_area_struct *vma) +{ + struct file *file = (struct file *)vma->vm_private_data; + + fput(file); +} + static int sdcardfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { int err = 0; - struct file *file, *lower_file; + struct file *file; const struct vm_operations_struct *lower_vm_ops; - struct vm_area_struct lower_vma; - memcpy(&lower_vma, vma, sizeof(struct vm_area_struct)); - file = lower_vma.vm_file; + file = (struct file *)vma->vm_private_data; lower_vm_ops = SDCARDFS_F(file)->lower_vm_ops; BUG_ON(!lower_vm_ops); if (!lower_vm_ops->page_mkwrite) goto out; - lower_file = sdcardfs_lower_file(file); - /* - * XXX: vm_ops->page_mkwrite may be called in parallel. - * Because we have to resort to temporarily changing the - * vma->vm_file to point to the lower file, a concurrent - * invocation of sdcardfs_page_mkwrite could see a different - * value. In this workaround, we keep a different copy of the - * vma structure in our stack, so we never expose a different - * value of the vma->vm_file called to us, even temporarily. - * A better fix would be to change the calling semantics of - * ->page_mkwrite to take an explicit file pointer. - */ - lower_vma.vm_file = lower_file; - err = lower_vm_ops->page_mkwrite(&lower_vma, vmf); + err = lower_vm_ops->page_mkwrite(vma, vmf); out: return err; } @@ -99,4 +84,6 @@ const struct address_space_operations sdcardfs_aops = { const struct vm_operations_struct sdcardfs_vm_ops = { .fault = sdcardfs_fault, .page_mkwrite = sdcardfs_page_mkwrite, + .open = sdcardfs_vm_open, + .close = sdcardfs_vm_close, }; From 84b6001987a05f1d16bc3939f8ebf1cf9bd672e2 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Tue, 11 Apr 2017 21:39:47 +0000 Subject: [PATCH 0757/3220] Revert "Revert "Revert "CHROMIUM: android: binder: Fix potential scheduling-while-atomic""" This reverts commit 6a3b9c4984f9edc5a136720e42f1d2ab387857a4. Sigh. Confusion reigns. The rest of the preempt_disable patch is not in common, so this shouldn't be here afterall (it is in several downstream branches that therefore need this one too). Re-reverting. We don't want the preempt_disable stuff in common since fine-grained locking is coming soon. 
Change-Id: I2595516cab28041fa72f4a38692266a0f2a01ab4 --- drivers/android/binder.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 22025bcc38e9..9cf4f9bbc711 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -417,7 +417,6 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) struct files_struct *files = proc->files; unsigned long rlim_cur; unsigned long irqs; - int ret; if (files == NULL) return -ESRCH; @@ -428,11 +427,7 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); unlock_task_sighand(proc->tsk, &irqs); - preempt_enable_no_resched(); - ret = __alloc_fd(files, 0, rlim_cur, flags); - preempt_disable(); - - return ret; + return __alloc_fd(files, 0, rlim_cur, flags); } /* @@ -441,11 +436,8 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) static void task_fd_install( struct binder_proc *proc, unsigned int fd, struct file *file) { - if (proc->files) { - preempt_enable_no_resched(); + if (proc->files) __fd_install(proc->files, fd, file); - preempt_disable(); - } } /* From 8affe1f70379abeafb9bc1ee81132beae3e76b93 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 29 Mar 2017 16:11:20 +0200 Subject: [PATCH 0758/3220] UPSTREAM: net/packet: fix overflow in check for priv area size Subtracting tp_sizeof_priv from tp_block_size and casting to int to check whether one is less then the other doesn't always work (both of them are unsigned ints). Compare them as is instead. Also cast tp_sizeof_priv to u64 before using BLK_PLUS_PRIV, as it can overflow inside BLK_PLUS_PRIV otherwise. Bug: 36725304 Upstream commit: 2b6867c2ce76c596676bec7d2d525af525fdc6e2 Signed-off-by: Andrey Konovalov Acked-by: Eric Dumazet Signed-off-by: David S. Miller Change-Id: I46bfbaf5f4a5d80f10ddce731a3030f191de4b28 --- net/packet/af_packet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 3975ac809934..d76800108ddb 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4138,8 +4138,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) goto out; if (po->tp_version >= TPACKET_V3 && - (int)(req->tp_block_size - - BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0) + req->tp_block_size <= + BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv)) goto out; if (unlikely(req->tp_frame_size < po->tp_hdrlen + po->tp_reserve)) From a973601dcbc98f41c9085ce91a68f314c046cf9c Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 23 Dec 2016 00:33:57 +0900 Subject: [PATCH 0759/3220] UPSTREAM: net: ipv4: Don't crash if passing a null sk to ip_do_redirect. Commit e2d118a1cb5e ("net: inet: Support UID-based routing in IP protocols.") made ip_do_redirect call sock_net(sk) to determine the network namespace of the passed-in socket. This crashes if sk is NULL. Fix this by getting the network namespace from the skb instead. Fixes: e2d118a1cb5e ("net: inet: Support UID-based routing in IP protocols.") Signed-off-by: Lorenzo Colitti Signed-off-by: David S. 
Miller Change-Id: I16a3c343cb142c482ca6dd363c28b3a12d73a46d Fixes: Change-Id: I910504b508948057912bc188fd1e8aca28294de3 ("net: inet: Support UID-based routing in IP protocols.") (cherry picked from commit 7d99569460eae28b187d574aec930a4cf8b90441) Signed-off-by: Amit Pundir --- net/ipv4/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 9af0f461b00d..d162ce41f761 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -792,6 +792,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf struct rtable *rt; struct flowi4 fl4; const struct iphdr *iph = (const struct iphdr *) skb->data; + struct net *net = dev_net(skb->dev); int oif = skb->dev->ifindex; u8 tos = RT_TOS(iph->tos); u8 prot = iph->protocol; @@ -799,7 +800,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf rt = (struct rtable *) dst; - __build_flow_key(sock_net(sk), &fl4, sk, iph, oif, tos, prot, mark, 0); + __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); } From 2a8af3ae8d93b34b4162f7c05e1b0a63cd22ec07 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 10 Jan 2017 09:30:51 +0100 Subject: [PATCH 0760/3220] UPSTREAM: net: socket: Make unnecessarily global sockfs_setattr() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sockfs_setattr() static as it is not used outside of net/socket.c This fixes the following GCC warning: net/socket.c:534:5: warning: no previous prototype for ‘sockfs_setattr’ [-Wmissing-prototypes] Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.") Cc: Lorenzo Colitti Signed-off-by: Tobias Klauser Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller Change-Id: Ie613c441b3fe081bdaec8c480d3aade482873bf8 Fixes: Change-Id: Idbc3e9a0cec91c4c6e01916b967b6237645ebe59 ("net: core: Add a UID field to struct sock.") (cherry picked from commit dc647ec88e029307e60e6bf9988056605f11051a) Signed-off-by: Amit Pundir --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/socket.c b/net/socket.c index 1489761b371e..18aff3d804ec 100644 --- a/net/socket.c +++ b/net/socket.c @@ -520,7 +520,7 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, return used; } -int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) +static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) { int err = simple_setattr(dentry, iattr); From 9e242ec92f951165b52de630ca8ecc6a2748f655 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 11:49:09 -0500 Subject: [PATCH 0761/3220] BACKPORT: [UPSTREAM] mbcache2: reimplement mbcache (Cherry-pick from commit f9a61eb4e2471c56a63cd804c7474128138c38ac) Original mbcache was designed to have more features than what ext? filesystems ended up using. It supported entry being in more hashes, it had a home-grown rwlocking of each entry, and one cache could cache entries from multiple filesystems. This genericity also resulted in more complex locking, larger cache entries, and generally more code complexity. This is reimplementation of the mbcache functionality to exactly fit the purpose ext? filesystems use it for. Cache entries are now considerably smaller (7 instead of 13 longs), the code is considerably smaller as well (414 vs 913 lines of code), and IMO also simpler. The new code is also much more lightweight. 
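As a usage sketch of the resulting API (a hypothetical caller with made-up
key and block values, compiled against <linux/mbcache2.h>; the real users
are the ext? xattr paths converted later in this series):

    /* Hypothetical caller, shown only as an illustration. */
    static int mb2_demo(void)
    {
    	struct mb2_cache *cache = mb2_cache_create(10);	/* 2^10 buckets */
    	struct mb2_cache_entry *ce;
    	u32 hash = 0x1234;	/* hash of an xattr block's contents */
    	sector_t block = 4096;	/* block number storing those contents */

    	if (!cache)
    		return -ENOMEM;

    	/* Record the key->block pair; -EBUSY means already cached. */
    	if (mb2_cache_entry_create(cache, GFP_NOFS, hash, block) == -EBUSY)
    		; /* someone beat us to it; fine */

    	/*
    	 * Walk every cached block whose contents hash to `hash`.
    	 * find_next drops the reference on the entry it is handed, so a
    	 * complete walk needs no explicit mb2_cache_entry_put().
    	 */
    	for (ce = mb2_cache_entry_find_first(cache, hash); ce;
    	     ce = mb2_cache_entry_find_next(cache, ce))
    		; /* ce->e_block is a deduplication candidate */

    	mb2_cache_entry_delete_block(cache, hash, block); /* block freed */
    	mb2_cache_destroy(cache);
    	return 0;
    }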
I have measured the speed using an artificial xattr-bench benchmark, which
spawns P processes, each process sets xattr for F different files, and the
value of xattr is randomly chosen from a pool of V values. Averages of
runtimes for 5 runs for various combinations of parameters are below. The
first value in each cell is old mbcache, the second value is the new
mbcache.

V=10
F\P	1		2		4		8		16		32		64
10	0.158,0.157	0.208,0.196	0.500,0.277	0.798,0.400	3.258,0.584	13.807,1.047	61.339,2.803
100	0.172,0.167	0.279,0.222	0.520,0.275	0.825,0.341	2.981,0.505	12.022,1.202	44.641,2.943
1000	0.185,0.174	0.297,0.239	0.445,0.283	0.767,0.340	2.329,0.480	6.342,1.198	16.440,3.888

V=100
F\P	1		2		4		8		16		32		64
10	0.162,0.153	0.200,0.186	0.362,0.257	0.671,0.496	1.433,0.943	3.801,1.345	7.938,2.501
100	0.153,0.160	0.221,0.199	0.404,0.264	0.945,0.379	1.556,0.485	3.761,1.156	7.901,2.484
1000	0.215,0.191	0.303,0.246	0.471,0.288	0.960,0.347	1.647,0.479	3.916,1.176	8.058,3.160

V=1000
F\P	1		2		4		8		16		32		64
10	0.151,0.129	0.210,0.163	0.326,0.245	0.685,0.521	1.284,0.859	3.087,2.251	6.451,4.801
100	0.154,0.153	0.211,0.191	0.276,0.282	0.687,0.506	1.202,0.877	3.259,1.954	8.738,2.887
1000	0.145,0.179	0.202,0.222	0.449,0.319	0.899,0.333	1.577,0.524	4.221,1.240	9.782,3.579

V=10000
F\P	1		2		4		8		16		32		64
10	0.161,0.154	0.198,0.190	0.296,0.256	0.662,0.480	1.192,0.818	2.989,2.200	6.362,4.746
100	0.176,0.174	0.236,0.203	0.326,0.255	0.696,0.511	1.183,0.855	4.205,3.444	19.510,17.760
1000	0.199,0.183	0.240,0.227	1.159,1.014	2.286,2.154	6.023,6.039	---,10.933	---,36.620

V=100000
F\P	1		2		4		8		16		32		64
10	0.171,0.162	0.204,0.198	0.285,0.230	0.692,0.500	1.225,0.881	2.990,2.243	6.379,4.771
100	0.151,0.171	0.220,0.210	0.295,0.255	0.720,0.518	1.226,0.844	3.423,2.831	19.234,17.544
1000	0.192,0.189	0.249,0.225	1.162,1.043	2.257,2.093	5.853,4.997	---,10.399	---,32.198

We see that the new code is faster in pretty much all the cases and
starting from 4 processes there are significant gains with the new code
resulting in up to 20-times shorter runtimes. Also for large numbers of
cached entries all values for the old code could not be measured as the
kernel started hitting softlockups and died before the test completed.

Signed-off-by: Jan Kara
Signed-off-by: Theodore Ts'o
Bug: 32461228
---
 fs/Makefile              |   2 +-
 fs/mbcache2.c            | 359 +++++++++++++++++++++++++++++++++++++++
 include/linux/mbcache2.h |  50 ++++++
 3 files changed, 410 insertions(+), 1 deletion(-)
 create mode 100644 fs/mbcache2.c
 create mode 100644 include/linux/mbcache2.h

diff --git a/fs/Makefile b/fs/Makefile
index 3b54070cd629..dee237540bc0 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -41,7 +41,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC)	+= binfmt_elf_fdpic.o
 obj-$(CONFIG_BINFMT_FLAT)	+= binfmt_flat.o
 
-obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o
+obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o mbcache2.o
 obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
 obj-$(CONFIG_COREDUMP)		+= coredump.o
diff --git a/fs/mbcache2.c b/fs/mbcache2.c
new file mode 100644
index 000000000000..5c3e1a8c38f6
--- /dev/null
+++ b/fs/mbcache2.c
@@ -0,0 +1,359 @@
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/list_bl.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mbcache2.h>
+
+/*
+ * Mbcache is a simple key-value store. Keys need not be unique, however
+ * key-value pairs are expected to be unique (we use this fact in
+ * mb2_cache_entry_delete_block()).
+ *
+ * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
+ * They use hash of a block contents as a key and block number as a value. + * That's why keys need not be unique (different xattr blocks may end up having + * the same hash). However block number always uniquely identifies a cache + * entry. + * + * We provide functions for creation and removal of entries, search by key, + * and a special "delete entry with given key-value pair" operation. Fixed + * size hash table is used for fast key lookups. + */ + +struct mb2_cache { + /* Hash table of entries */ + struct hlist_bl_head *c_hash; + /* log2 of hash table size */ + int c_bucket_bits; + /* Protects c_lru_list, c_entry_count */ + spinlock_t c_lru_list_lock; + struct list_head c_lru_list; + /* Number of entries in cache */ + unsigned long c_entry_count; + struct shrinker c_shrink; +}; + +static struct kmem_cache *mb2_entry_cache; + +/* + * mb2_cache_entry_create - create entry in cache + * @cache - cache where the entry should be created + * @mask - gfp mask with which the entry should be allocated + * @key - key of the entry + * @block - block that contains data + * + * Creates entry in @cache with key @key and records that data is stored in + * block @block. The function returns -EBUSY if entry with the same key + * and for the same block already exists in cache. Otherwise 0 is returned. + */ +int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key, + sector_t block) +{ + struct mb2_cache_entry *entry, *dup; + struct hlist_bl_node *dup_node; + struct hlist_bl_head *head; + + entry = kmem_cache_alloc(mb2_entry_cache, mask); + if (!entry) + return -ENOMEM; + + INIT_LIST_HEAD(&entry->e_lru_list); + /* One ref for hash, one ref returned */ + atomic_set(&entry->e_refcnt, 1); + entry->e_key = key; + entry->e_block = block; + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + entry->e_hash_list_head = head; + hlist_bl_lock(head); + hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { + if (dup->e_key == key && dup->e_block == block) { + hlist_bl_unlock(head); + kmem_cache_free(mb2_entry_cache, entry); + return -EBUSY; + } + } + hlist_bl_add_head(&entry->e_hash_list, head); + hlist_bl_unlock(head); + + spin_lock(&cache->c_lru_list_lock); + list_add_tail(&entry->e_lru_list, &cache->c_lru_list); + /* Grab ref for LRU list */ + atomic_inc(&entry->e_refcnt); + cache->c_entry_count++; + spin_unlock(&cache->c_lru_list_lock); + + return 0; +} +EXPORT_SYMBOL(mb2_cache_entry_create); + +void __mb2_cache_entry_free(struct mb2_cache_entry *entry) +{ + kmem_cache_free(mb2_entry_cache, entry); +} +EXPORT_SYMBOL(__mb2_cache_entry_free); + +static struct mb2_cache_entry *__entry_find(struct mb2_cache *cache, + struct mb2_cache_entry *entry, + u32 key) +{ + struct mb2_cache_entry *old_entry = entry; + struct hlist_bl_node *node; + struct hlist_bl_head *head; + + if (entry) + head = entry->e_hash_list_head; + else + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + hlist_bl_lock(head); + if (entry && !hlist_bl_unhashed(&entry->e_hash_list)) + node = entry->e_hash_list.next; + else + node = hlist_bl_first(head); + while (node) { + entry = hlist_bl_entry(node, struct mb2_cache_entry, + e_hash_list); + if (entry->e_key == key) { + atomic_inc(&entry->e_refcnt); + goto out; + } + node = node->next; + } + entry = NULL; +out: + hlist_bl_unlock(head); + if (old_entry) + mb2_cache_entry_put(cache, old_entry); + + return entry; +} + +/* + * mb2_cache_entry_find_first - find the first entry in cache with given key + * @cache: cache where we should search + * @key: key to look for 
+ * + * Search in @cache for entry with key @key. Grabs reference to the first + * entry found and returns the entry. + */ +struct mb2_cache_entry *mb2_cache_entry_find_first(struct mb2_cache *cache, + u32 key) +{ + return __entry_find(cache, NULL, key); +} +EXPORT_SYMBOL(mb2_cache_entry_find_first); + +/* + * mb2_cache_entry_find_next - find next entry in cache with the same + * @cache: cache where we should search + * @entry: entry to start search from + * + * Finds next entry in the hash chain which has the same key as @entry. + * If @entry is unhashed (which can happen when deletion of entry races + * with the search), finds the first entry in the hash chain. The function + * drops reference to @entry and returns with a reference to the found entry. + */ +struct mb2_cache_entry *mb2_cache_entry_find_next(struct mb2_cache *cache, + struct mb2_cache_entry *entry) +{ + return __entry_find(cache, entry, entry->e_key); +} +EXPORT_SYMBOL(mb2_cache_entry_find_next); + +/* mb2_cache_entry_delete_block - remove information about block from cache + * @cache - cache we work with + * @key - key of the entry to remove + * @block - block containing data for @key + * + * Remove entry from cache @cache with key @key with data stored in @block. + */ +void mb2_cache_entry_delete_block(struct mb2_cache *cache, u32 key, + sector_t block) +{ + struct hlist_bl_node *node; + struct hlist_bl_head *head; + struct mb2_cache_entry *entry; + + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + hlist_bl_lock(head); + hlist_bl_for_each_entry(entry, node, head, e_hash_list) { + if (entry->e_key == key && entry->e_block == block) { + /* We keep hash list reference to keep entry alive */ + hlist_bl_del_init(&entry->e_hash_list); + hlist_bl_unlock(head); + spin_lock(&cache->c_lru_list_lock); + if (!list_empty(&entry->e_lru_list)) { + list_del_init(&entry->e_lru_list); + cache->c_entry_count--; + atomic_dec(&entry->e_refcnt); + } + spin_unlock(&cache->c_lru_list_lock); + mb2_cache_entry_put(cache, entry); + return; + } + } + hlist_bl_unlock(head); +} +EXPORT_SYMBOL(mb2_cache_entry_delete_block); + +/* mb2_cache_entry_touch - cache entry got used + * @cache - cache the entry belongs to + * @entry - entry that got used + * + * Move entry in lru list to reflect the fact that it was used. + */ +void mb2_cache_entry_touch(struct mb2_cache *cache, + struct mb2_cache_entry *entry) +{ + spin_lock(&cache->c_lru_list_lock); + if (!list_empty(&entry->e_lru_list)) + list_move_tail(&cache->c_lru_list, &entry->e_lru_list); + spin_unlock(&cache->c_lru_list_lock); +} +EXPORT_SYMBOL(mb2_cache_entry_touch); + +static unsigned long mb2_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct mb2_cache *cache = container_of(shrink, struct mb2_cache, + c_shrink); + + return cache->c_entry_count; +} + +/* Shrink number of entries in cache */ +static unsigned long mb2_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + int nr_to_scan = sc->nr_to_scan; + struct mb2_cache *cache = container_of(shrink, struct mb2_cache, + c_shrink); + struct mb2_cache_entry *entry; + struct hlist_bl_head *head; + unsigned int shrunk = 0; + + spin_lock(&cache->c_lru_list_lock); + while (nr_to_scan-- && !list_empty(&cache->c_lru_list)) { + entry = list_first_entry(&cache->c_lru_list, + struct mb2_cache_entry, e_lru_list); + list_del_init(&entry->e_lru_list); + cache->c_entry_count--; + /* + * We keep LRU list reference so that entry doesn't go away + * from under us. 
+ */ + spin_unlock(&cache->c_lru_list_lock); + head = entry->e_hash_list_head; + hlist_bl_lock(head); + if (!hlist_bl_unhashed(&entry->e_hash_list)) { + hlist_bl_del_init(&entry->e_hash_list); + atomic_dec(&entry->e_refcnt); + } + hlist_bl_unlock(head); + if (mb2_cache_entry_put(cache, entry)) + shrunk++; + cond_resched(); + spin_lock(&cache->c_lru_list_lock); + } + spin_unlock(&cache->c_lru_list_lock); + + return shrunk; +} + +/* + * mb2_cache_create - create cache + * @bucket_bits: log2 of the hash table size + * + * Create cache for keys with 2^bucket_bits hash entries. + */ +struct mb2_cache *mb2_cache_create(int bucket_bits) +{ + struct mb2_cache *cache; + int bucket_count = 1 << bucket_bits; + int i; + + if (!try_module_get(THIS_MODULE)) + return NULL; + + cache = kzalloc(sizeof(struct mb2_cache), GFP_KERNEL); + if (!cache) + goto err_out; + cache->c_bucket_bits = bucket_bits; + INIT_LIST_HEAD(&cache->c_lru_list); + spin_lock_init(&cache->c_lru_list_lock); + cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head), + GFP_KERNEL); + if (!cache->c_hash) { + kfree(cache); + goto err_out; + } + for (i = 0; i < bucket_count; i++) + INIT_HLIST_BL_HEAD(&cache->c_hash[i]); + + cache->c_shrink.count_objects = mb2_cache_count; + cache->c_shrink.scan_objects = mb2_cache_scan; + cache->c_shrink.seeks = DEFAULT_SEEKS; + register_shrinker(&cache->c_shrink); + + return cache; + +err_out: + module_put(THIS_MODULE); + return NULL; +} +EXPORT_SYMBOL(mb2_cache_create); + +/* + * mb2_cache_destroy - destroy cache + * @cache: the cache to destroy + * + * Free all entries in cache and cache itself. Caller must make sure nobody + * (except shrinker) can reach @cache when calling this. + */ +void mb2_cache_destroy(struct mb2_cache *cache) +{ + struct mb2_cache_entry *entry, *next; + + unregister_shrinker(&cache->c_shrink); + + /* + * We don't bother with any locking. Cache must not be used at this + * point. 
+ */
+	list_for_each_entry_safe(entry, next, &cache->c_lru_list, e_lru_list) {
+		if (!hlist_bl_unhashed(&entry->e_hash_list)) {
+			hlist_bl_del_init(&entry->e_hash_list);
+			atomic_dec(&entry->e_refcnt);
+		} else
+			WARN_ON(1);
+		list_del(&entry->e_lru_list);
+		WARN_ON(atomic_read(&entry->e_refcnt) != 1);
+		mb2_cache_entry_put(cache, entry);
+	}
+	kfree(cache->c_hash);
+	kfree(cache);
+	module_put(THIS_MODULE);
+}
+EXPORT_SYMBOL(mb2_cache_destroy);
+
+static int __init mb2cache_init(void)
+{
+	mb2_entry_cache = kmem_cache_create("mbcache",
+				sizeof(struct mb2_cache_entry), 0,
+				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
+	BUG_ON(!mb2_entry_cache);
+	return 0;
+}
+
+static void __exit mb2cache_exit(void)
+{
+	kmem_cache_destroy(mb2_entry_cache);
+}
+
+module_init(mb2cache_init)
+module_exit(mb2cache_exit)
+
+MODULE_AUTHOR("Jan Kara <jack@suse.cz>");
+MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mbcache2.h b/include/linux/mbcache2.h
new file mode 100644
index 000000000000..b6f160ff2533
--- /dev/null
+++ b/include/linux/mbcache2.h
@@ -0,0 +1,50 @@
+#ifndef _LINUX_MB2CACHE_H
+#define _LINUX_MB2CACHE_H
+
+#include <linux/hash.h>
+#include <linux/list_bl.h>
+#include <linux/list.h>
+#include <linux/atomic.h>
+#include <linux/fs.h>
+
+struct mb2_cache;
+
+struct mb2_cache_entry {
+	/* LRU list - protected by cache->c_lru_list_lock */
+	struct list_head	e_lru_list;
+	/* Hash table list - protected by bitlock in e_hash_list_head */
+	struct hlist_bl_node	e_hash_list;
+	atomic_t		e_refcnt;
+	/* Key in hash - stable during lifetime of the entry */
+	u32			e_key;
+	/* Block number of hashed block - stable during lifetime of the entry */
+	sector_t		e_block;
+	/* Head of hash list (for list bit lock) - stable */
+	struct hlist_bl_head	*e_hash_list_head;
+};
+
+struct mb2_cache *mb2_cache_create(int bucket_bits);
+void mb2_cache_destroy(struct mb2_cache *cache);
+
+int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key,
+			   sector_t block);
+void __mb2_cache_entry_free(struct mb2_cache_entry *entry);
+static inline int mb2_cache_entry_put(struct mb2_cache *cache,
+				      struct mb2_cache_entry *entry)
+{
+	if (!atomic_dec_and_test(&entry->e_refcnt))
+		return 0;
+	__mb2_cache_entry_free(entry);
+	return 1;
+}
+
+void mb2_cache_entry_delete_block(struct mb2_cache *cache, u32 key,
+				  sector_t block);
+struct mb2_cache_entry *mb2_cache_entry_find_first(struct mb2_cache *cache,
+						   u32 key);
+struct mb2_cache_entry *mb2_cache_entry_find_next(struct mb2_cache *cache,
+						  struct mb2_cache_entry *entry);
+void mb2_cache_entry_touch(struct mb2_cache *cache,
+			   struct mb2_cache_entry *entry);
+
+#endif	/* _LINUX_MB2CACHE_H */

From e29de4e87171faa5d3ab882b0bc555bbd07c9c5b Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Mon, 22 Feb 2016 11:50:13 -0500
Subject: [PATCH 0762/3220] BACKPORT: [UPSTREAM] ext4: convert to mbcache2

(Cherry-pick from commit 82939d7999dfc1f1998c4b1c12e2f19edbdff272)

The conversion is generally straightforward. The only tricky part is
that xattr block corresponding to found mbcache entry can get freed
before we get buffer lock for that block. So we have to check whether
the entry is still valid after getting buffer lock.
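The check itself is tiny; condensed from the ext4_xattr_block_set() hunk
below into a hypothetical helper:

    /*
     * Freeing or rehashing an xattr block unhashes its mbcache entry while
     * the buffer lock is held, so under the buffer lock this test reliably
     * says whether the entry still describes a live xattr block.
     */
    static bool xattr_block_entry_still_valid(struct buffer_head *bh,
    					      struct mb2_cache_entry *ce)
    {
    	bool valid;

    	lock_buffer(bh);
    	valid = !hlist_bl_unhashed(&ce->e_hash_list);
    	if (!valid)
    		unlock_buffer(bh);	/* caller undoes its work and retries */
    	return valid;			/* on success bh stays locked */
    }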
Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Bug; 32461228 --- fs/ext4/ext4.h | 2 +- fs/ext4/super.c | 7 ++- fs/ext4/xattr.c | 136 ++++++++++++++++++++++++------------------------ fs/ext4/xattr.h | 5 +- 4 files changed, 75 insertions(+), 75 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ac720a31e4ff..2b3ac9fa9460 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1442,7 +1442,7 @@ struct ext4_sb_info { struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; - struct mb_cache *s_mb_cache; + struct mb2_cache *s_mb_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Ratelimit ext4 messages. */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 68345a9e59b8..bd8831bfbafe 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -816,7 +816,6 @@ static void ext4_put_super(struct super_block *sb) ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); - ext4_xattr_put_super(sb); if (!(sb->s_flags & MS_RDONLY) && !aborted) { ext4_clear_feature_journal_needs_recovery(sb); @@ -3833,7 +3832,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) no_journal: if (ext4_mballoc_ready) { - sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); + sbi->s_mb_cache = ext4_xattr_create_cache(); if (!sbi->s_mb_cache) { ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); goto failed_mount_wq; @@ -4065,6 +4064,10 @@ failed_mount4: if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: + if (sbi->s_mb_cache) { + ext4_xattr_destroy_cache(sbi->s_mb_cache); + sbi->s_mb_cache = NULL; + } if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 263002f0389d..f7455e668474 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include "ext4_jbd2.h" #include "ext4.h" @@ -80,10 +80,10 @@ # define ea_bdebug(bh, fmt, ...) 
no_printk(fmt, ##__VA_ARGS__) #endif -static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); +static void ext4_xattr_cache_insert(struct mb2_cache *, struct buffer_head *); static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct ext4_xattr_header *, - struct mb_cache_entry **); + struct mb2_cache_entry **); static void ext4_xattr_rehash(struct ext4_xattr_header *, struct ext4_xattr_entry *); static int ext4_xattr_list(struct dentry *dentry, char *buffer, @@ -279,7 +279,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, struct ext4_xattr_entry *entry; size_t size; int error; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -426,7 +426,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -543,11 +543,8 @@ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, struct buffer_head *bh) { - struct mb_cache_entry *ce = NULL; int error = 0; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); - ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); BUFFER_TRACE(bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bh); if (error) @@ -555,9 +552,15 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, lock_buffer(bh); if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { + __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); + ea_bdebug(bh, "refcount now=0; freeing"); - if (ce) - mb_cache_entry_free(ce); + /* + * This must happen under buffer lock for + * ext4_xattr_block_set() to reliably detect freed block + */ + mb2_cache_entry_delete_block(EXT4_GET_MB_CACHE(inode), hash, + bh->b_blocknr); get_bh(bh); unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, @@ -565,8 +568,6 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, EXT4_FREE_BLOCKS_FORGET); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); - if (ce) - mb_cache_entry_release(ce); /* * Beware of this ugliness: Releasing of xattr block references * from different inodes can race and so we have to protect @@ -779,17 +780,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; struct ext4_xattr_search *s = &bs->s; - struct mb_cache_entry *ce = NULL; + struct mb2_cache_entry *ce = NULL; int error = 0; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); #define header(x) ((struct ext4_xattr_header *)(x)) if (i->value && i->value_len > sb->s_blocksize) return -ENOSPC; if (s->base) { - ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev, - bs->bh->b_blocknr); BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bs->bh); if (error) @@ -797,10 +796,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, lock_buffer(bs->bh); if (header(s->base)->h_refcount == cpu_to_le32(1)) { - if (ce) { - mb_cache_entry_free(ce); - ce = NULL; - } + __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash); + + /* + * This must happen under buffer lock 
for + * ext4_xattr_block_set() to reliably detect modified + * block + */ + mb2_cache_entry_delete_block(ext4_mb_cache, hash, + bs->bh->b_blocknr); ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s); if (!error) { @@ -824,10 +828,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, int offset = (char *)s->here - bs->bh->b_data; unlock_buffer(bs->bh); - if (ce) { - mb_cache_entry_release(ce); - ce = NULL; - } ea_bdebug(bs->bh, "cloning"); s->base = kmalloc(bs->bh->b_size, GFP_NOFS); error = -ENOMEM; @@ -882,6 +882,31 @@ inserted: if (error) goto cleanup_dquot; lock_buffer(new_bh); + /* + * We have to be careful about races with + * freeing or rehashing of xattr block. Once we + * hold buffer lock xattr block's state is + * stable so we can check whether the block got + * freed / rehashed or not. Since we unhash + * mbcache entry under buffer lock when freeing + * / rehashing xattr block, checking whether + * entry is still hashed is reliable. + */ + if (hlist_bl_unhashed(&ce->e_hash_list)) { + /* + * Undo everything and check mbcache + * again. + */ + unlock_buffer(new_bh); + dquot_free_block(inode, + EXT4_C2B(EXT4_SB(sb), + 1)); + brelse(new_bh); + mb2_cache_entry_put(ext4_mb_cache, ce); + ce = NULL; + new_bh = NULL; + goto inserted; + } le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); ea_bdebug(new_bh, "reusing; refcount now=%d", le32_to_cpu(BHDR(new_bh)->h_refcount)); @@ -892,7 +917,8 @@ inserted: if (error) goto cleanup_dquot; } - mb_cache_entry_release(ce); + mb2_cache_entry_touch(ext4_mb_cache, ce); + mb2_cache_entry_put(ext4_mb_cache, ce); ce = NULL; } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ @@ -957,7 +983,7 @@ getblk_failed: cleanup: if (ce) - mb_cache_entry_release(ce); + mb2_cache_entry_put(ext4_mb_cache, ce); brelse(new_bh); if (!(bs->bh && s->base == bs->bh->b_data)) kfree(s->base); @@ -1518,17 +1544,6 @@ cleanup: brelse(bh); } -/* - * ext4_xattr_put_super() - * - * This is called when a file system is unmounted. - */ -void -ext4_xattr_put_super(struct super_block *sb) -{ - mb_cache_shrink(sb->s_bdev); -} - /* * ext4_xattr_cache_insert() * @@ -1538,28 +1553,18 @@ ext4_xattr_put_super(struct super_block *sb) * Returns 0, or a negative error number on failure. 
*/ static void -ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) +ext4_xattr_cache_insert(struct mb2_cache *ext4_mb_cache, struct buffer_head *bh) { __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); - struct mb_cache_entry *ce; int error; - ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); - if (!ce) { - ea_bdebug(bh, "out of memory"); - return; - } - error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); + error = mb2_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, + bh->b_blocknr); if (error) { - mb_cache_entry_free(ce); - if (error == -EBUSY) { + if (error == -EBUSY) ea_bdebug(bh, "already in cache"); - error = 0; - } - } else { + } else ea_bdebug(bh, "inserting [%x]", (int)hash); - mb_cache_entry_release(ce); - } } /* @@ -1612,26 +1617,19 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, */ static struct buffer_head * ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, - struct mb_cache_entry **pce) + struct mb2_cache_entry **pce) { __u32 hash = le32_to_cpu(header->h_hash); - struct mb_cache_entry *ce; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb2_cache_entry *ce; + struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); -again: - ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev, - hash); + ce = mb2_cache_entry_find_first(ext4_mb_cache, hash); while (ce) { struct buffer_head *bh; - if (IS_ERR(ce)) { - if (PTR_ERR(ce) == -EAGAIN) - goto again; - break; - } bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { EXT4_ERROR_INODE(inode, "block %lu read error", @@ -1647,7 +1645,7 @@ again: return bh; } brelse(bh); - ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + ce = mb2_cache_entry_find_next(ext4_mb_cache, ce); } return NULL; } @@ -1722,15 +1720,15 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, #define HASH_BUCKET_BITS 10 -struct mb_cache * -ext4_xattr_create_cache(char *name) +struct mb2_cache * +ext4_xattr_create_cache(void) { - return mb_cache_create(name, HASH_BUCKET_BITS); + return mb2_cache_create(HASH_BUCKET_BITS); } -void ext4_xattr_destroy_cache(struct mb_cache *cache) +void ext4_xattr_destroy_cache(struct mb2_cache *cache) { if (cache) - mb_cache_destroy(cache); + mb2_cache_destroy(cache); } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index ddc0957760ba..10b0f7323ed6 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -108,7 +108,6 @@ extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_ extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); extern void ext4_xattr_delete_inode(handle_t *, struct inode *); -extern void ext4_xattr_put_super(struct super_block *); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); @@ -124,8 +123,8 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); -extern struct mb_cache *ext4_xattr_create_cache(char *name); -extern void ext4_xattr_destroy_cache(struct mb_cache *); +extern struct mb2_cache *ext4_xattr_create_cache(void); +extern void ext4_xattr_destroy_cache(struct mb2_cache *); #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, From 356d8075469a6cab1ecea241923e56dd00641109 Mon Sep 17 
00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 11:56:38 -0500 Subject: [PATCH 0763/3220] BACKPORT: [UPSTREAM] ext2: convert to mbcache2 (Cherry-pick from commit be0726d33cb8f411945884664924bed3cb8c70ee) The conversion is generally straightforward. We convert filesystem from a global cache to per-fs one. Similarly to ext4 the tricky part is that xattr block corresponding to found mbcache entry can get freed before we get buffer lock for that block. So we have to check whether the entry is still valid after getting the buffer lock. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Bug: 32461228 --- fs/ext2/ext2.h | 3 + fs/ext2/super.c | 25 ++++++--- fs/ext2/xattr.c | 143 +++++++++++++++++++++++------------------------- fs/ext2/xattr.h | 21 ++----- 4 files changed, 92 insertions(+), 100 deletions(-) diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 4c69c94cafd8..f98ce7e60a0f 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -61,6 +61,8 @@ struct ext2_block_alloc_info { #define rsv_start rsv_window._rsv_start #define rsv_end rsv_window._rsv_end +struct mb2_cache; + /* * second extended-fs super-block data in memory */ @@ -111,6 +113,7 @@ struct ext2_sb_info { * of the mount options. */ spinlock_t s_lock; + struct mb2_cache *s_mb_cache; }; static inline spinlock_t * diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 748d35afc902..111a31761ffa 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -131,7 +131,10 @@ static void ext2_put_super (struct super_block * sb) dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - ext2_xattr_put_super(sb); + if (sbi->s_mb_cache) { + ext2_xattr_destroy_cache(sbi->s_mb_cache); + sbi->s_mb_cache = NULL; + } if (!(sb->s_flags & MS_RDONLY)) { struct ext2_super_block *es = sbi->s_es; @@ -1104,6 +1107,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_msg(sb, KERN_ERR, "error: insufficient memory"); goto failed_mount3; } + +#ifdef CONFIG_EXT2_FS_XATTR + sbi->s_mb_cache = ext2_xattr_create_cache(); + if (!sbi->s_mb_cache) { + ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + goto failed_mount3; + } +#endif /* * set up enough so that it can read an inode */ @@ -1149,6 +1160,8 @@ cantfind_ext2: sb->s_id); goto failed_mount; failed_mount3: + if (sbi->s_mb_cache) + ext2_xattr_destroy_cache(sbi->s_mb_cache); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -1555,20 +1568,17 @@ MODULE_ALIAS_FS("ext2"); static int __init init_ext2_fs(void) { - int err = init_ext2_xattr(); - if (err) - return err; + int err; + err = init_inodecache(); if (err) - goto out1; + return err; err = register_filesystem(&ext2_fs_type); if (err) goto out; return 0; out: destroy_inodecache(); -out1: - exit_ext2_xattr(); return err; } @@ -1576,7 +1586,6 @@ static void __exit exit_ext2_fs(void) { unregister_filesystem(&ext2_fs_type); destroy_inodecache(); - exit_ext2_xattr(); } MODULE_AUTHOR("Remy Card and others"); diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index fa70848afa8f..24736c8b3d51 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -56,7 +56,7 @@ #include #include #include -#include +#include #include #include #include @@ -92,14 +92,12 @@ static int ext2_xattr_set2(struct inode *, struct buffer_head *, struct ext2_xattr_header *); -static int ext2_xattr_cache_insert(struct buffer_head *); +static int ext2_xattr_cache_insert(struct mb2_cache *, struct buffer_head *); static struct buffer_head 
*ext2_xattr_cache_find(struct inode *, struct ext2_xattr_header *); static void ext2_xattr_rehash(struct ext2_xattr_header *, struct ext2_xattr_entry *); -static struct mb_cache *ext2_xattr_cache; - static const struct xattr_handler *ext2_xattr_handler_map[] = { [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL @@ -154,6 +152,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, size_t name_len, size; char *end; int error; + struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -198,7 +197,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", goto found; entry = next; } - if (ext2_xattr_cache_insert(bh)) + if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) ea_idebug(inode, "cache insert failed"); error = -ENODATA; goto cleanup; @@ -211,7 +210,7 @@ found: le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) goto bad_block; - if (ext2_xattr_cache_insert(bh)) + if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) ea_idebug(inode, "cache insert failed"); if (buffer) { error = -ERANGE; @@ -249,6 +248,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) char *end; size_t rest = buffer_size; int error; + struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -283,7 +283,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", goto bad_block; entry = next; } - if (ext2_xattr_cache_insert(bh)) + if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) ea_idebug(inode, "cache insert failed"); /* list the attribute names */ @@ -480,22 +480,23 @@ bad_block: ext2_error(sb, "ext2_xattr_set", /* Here we know that we can set the new attribute. */ if (header) { - struct mb_cache_entry *ce; - /* assert(header == HDR(bh)); */ - ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, - bh->b_blocknr); lock_buffer(bh); if (header->h_refcount == cpu_to_le32(1)) { + __u32 hash = le32_to_cpu(header->h_hash); + ea_bdebug(bh, "modifying in-place"); - if (ce) - mb_cache_entry_free(ce); + /* + * This must happen under buffer lock for + * ext2_xattr_set2() to reliably detect modified block + */ + mb2_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache, + hash, bh->b_blocknr); + /* keep the buffer locked while modifying it. */ } else { int offset; - if (ce) - mb_cache_entry_release(ce); unlock_buffer(bh); ea_bdebug(bh, "cloning"); header = kmalloc(bh->b_size, GFP_KERNEL); @@ -623,6 +624,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; int error; + struct mb2_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache; if (header) { new_bh = ext2_xattr_cache_find(inode, header); @@ -650,7 +652,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, don't need to change the reference count. 
*/ new_bh = old_bh; get_bh(new_bh); - ext2_xattr_cache_insert(new_bh); + ext2_xattr_cache_insert(ext2_mb_cache, new_bh); } else { /* We need to allocate a new block */ ext2_fsblk_t goal = ext2_group_first_block_no(sb, @@ -671,7 +673,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, memcpy(new_bh->b_data, header, new_bh->b_size); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); - ext2_xattr_cache_insert(new_bh); + ext2_xattr_cache_insert(ext2_mb_cache, new_bh); ext2_xattr_update_super_block(sb); } @@ -704,19 +706,21 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, error = 0; if (old_bh && old_bh != new_bh) { - struct mb_cache_entry *ce; - /* * If there was an old block and we are no longer using it, * release the old block. */ - ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev, - old_bh->b_blocknr); lock_buffer(old_bh); if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { + __u32 hash = le32_to_cpu(HDR(old_bh)->h_hash); + + /* + * This must happen under buffer lock for + * ext2_xattr_set2() to reliably detect freed block + */ + mb2_cache_entry_delete_block(ext2_mb_cache, + hash, old_bh->b_blocknr); /* Free the old block. */ - if (ce) - mb_cache_entry_free(ce); ea_bdebug(old_bh, "freeing"); ext2_free_blocks(inode, old_bh->b_blocknr, 1); mark_inode_dirty(inode); @@ -727,8 +731,6 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, } else { /* Decrement the refcount only. */ le32_add_cpu(&HDR(old_bh)->h_refcount, -1); - if (ce) - mb_cache_entry_release(ce); dquot_free_block_nodirty(inode, 1); mark_inode_dirty(inode); mark_buffer_dirty(old_bh); @@ -754,7 +756,6 @@ void ext2_xattr_delete_inode(struct inode *inode) { struct buffer_head *bh = NULL; - struct mb_cache_entry *ce; down_write(&EXT2_I(inode)->xattr_sem); if (!EXT2_I(inode)->i_file_acl) @@ -774,19 +775,22 @@ ext2_xattr_delete_inode(struct inode *inode) EXT2_I(inode)->i_file_acl); goto cleanup; } - ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr); lock_buffer(bh); if (HDR(bh)->h_refcount == cpu_to_le32(1)) { - if (ce) - mb_cache_entry_free(ce); + __u32 hash = le32_to_cpu(HDR(bh)->h_hash); + + /* + * This must happen under buffer lock for ext2_xattr_set2() to + * reliably detect freed block + */ + mb2_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache, + hash, bh->b_blocknr); ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); get_bh(bh); bforget(bh); unlock_buffer(bh); } else { le32_add_cpu(&HDR(bh)->h_refcount, -1); - if (ce) - mb_cache_entry_release(ce); ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount)); unlock_buffer(bh); @@ -802,18 +806,6 @@ cleanup: up_write(&EXT2_I(inode)->xattr_sem); } -/* - * ext2_xattr_put_super() - * - * This is called when a file system is unmounted. - */ -void -ext2_xattr_put_super(struct super_block *sb) -{ - mb_cache_shrink(sb->s_bdev); -} - - /* * ext2_xattr_cache_insert() * @@ -823,28 +815,20 @@ ext2_xattr_put_super(struct super_block *sb) * Returns 0, or a negative error number on failure. 
 */
static int
-ext2_xattr_cache_insert(struct buffer_head *bh)
+ext2_xattr_cache_insert(struct mb2_cache *cache, struct buffer_head *bh)
{
__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
- struct mb_cache_entry *ce;
int error;

- ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS);
- if (!ce)
- return -ENOMEM;
- error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
+ error = mb2_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr);
if (error) {
- mb_cache_entry_free(ce);
if (error == -EBUSY) {
- ea_bdebug(bh, "already in cache (%d cache entries)",
- atomic_read(&ext2_xattr_cache->c_entry_count));
+ ea_bdebug(bh, "already in cache");
error = 0;
}
- } else {
- ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
- atomic_read(&ext2_xattr_cache->c_entry_count));
- mb_cache_entry_release(ce);
- }
+ } else
+ ea_bdebug(bh, "inserting [%x]", (int)hash);
return error;
}

/*
@@ -900,23 +884,17 @@ static struct buffer_head *
ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
{
__u32 hash = le32_to_cpu(header->h_hash);
- struct mb_cache_entry *ce;
+ struct mb2_cache_entry *ce;
+ struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;

if (!header->h_hash)
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
- ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev,
- hash);
+ ce = mb2_cache_entry_find_first(ext2_mb_cache, hash);
while (ce) {
struct buffer_head *bh;

- if (IS_ERR(ce)) {
- if (PTR_ERR(ce) == -EAGAIN)
- goto again;
- break;
- }
-
bh = sb_bread(inode->i_sb, ce->e_block);
if (!bh) {
ext2_error(inode->i_sb, "ext2_xattr_cache_find",
@@ -924,7 +902,21 @@ again:
inode->i_ino, (unsigned long) ce->e_block);
} else {
lock_buffer(bh);
+ /*
+ * We have to be careful about races with freeing or
+ * rehashing of xattr block. Once we hold buffer lock
+ * xattr block's state is stable so we can check
+ * whether the block got freed / rehashed or not.
+ * Since we unhash mbcache entry under buffer lock when
+ * freeing / rehashing xattr block, checking whether
+ * entry is still hashed is reliable.
+ */ + if (hlist_bl_unhashed(&ce->e_hash_list)) { + mb2_cache_entry_put(ext2_mb_cache, ce); + unlock_buffer(bh); + brelse(bh); + goto again; + } else if (le32_to_cpu(HDR(bh)->h_refcount) > EXT2_XATTR_REFCOUNT_MAX) { ea_idebug(inode, "block %ld refcount %d>%d", (unsigned long) ce->e_block, @@ -933,13 +925,14 @@ again: } else if (!ext2_xattr_cmp(header, HDR(bh))) { ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); - mb_cache_entry_release(ce); + mb2_cache_entry_touch(ext2_mb_cache, ce); + mb2_cache_entry_put(ext2_mb_cache, ce); return bh; } unlock_buffer(bh); brelse(bh); } - ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + ce = mb2_cache_entry_find_next(ext2_mb_cache, ce); } return NULL; } @@ -1012,17 +1005,15 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header, #undef BLOCK_HASH_SHIFT -int __init -init_ext2_xattr(void) +#define HASH_BUCKET_BITS 10 + +struct mb2_cache *ext2_xattr_create_cache(void) { - ext2_xattr_cache = mb_cache_create("ext2_xattr", 6); - if (!ext2_xattr_cache) - return -ENOMEM; - return 0; + return mb2_cache_create(HASH_BUCKET_BITS); } -void -exit_ext2_xattr(void) +void ext2_xattr_destroy_cache(struct mb2_cache *cache) { - mb_cache_destroy(ext2_xattr_cache); + if (cache) + mb2_cache_destroy(cache); } diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index 60edf298644e..6ea38aa9563a 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -53,6 +53,8 @@ struct ext2_xattr_entry { #define EXT2_XATTR_SIZE(size) \ (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) +struct mb2_cache; + # ifdef CONFIG_EXT2_FS_XATTR extern const struct xattr_handler ext2_xattr_user_handler; @@ -65,10 +67,9 @@ extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t); extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern void ext2_xattr_delete_inode(struct inode *); -extern void ext2_xattr_put_super(struct super_block *); -extern int init_ext2_xattr(void); -extern void exit_ext2_xattr(void); +extern struct mb2_cache *ext2_xattr_create_cache(void); +extern void ext2_xattr_destroy_cache(struct mb2_cache *cache); extern const struct xattr_handler *ext2_xattr_handlers[]; @@ -93,19 +94,7 @@ ext2_xattr_delete_inode(struct inode *inode) { } -static inline void -ext2_xattr_put_super(struct super_block *sb) -{ -} - -static inline int -init_ext2_xattr(void) -{ - return 0; -} - -static inline void -exit_ext2_xattr(void) +static inline void ext2_xattr_destroy_cache(struct mb2_cache *cache) { } From d657201d48f0cff729c89fafeada716410934f5c Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 17 Apr 2017 17:11:38 -0700 Subject: [PATCH 0764/3220] Android: sdcardfs: Change cache GID value Change-Id: Ieb955dd26493da26a458bc20fbbe75bca32b094f Signed-off-by: Daniel Rosenberg Bug: 37193650 --- fs/sdcardfs/derived_perm.c | 2 +- fs/sdcardfs/multiuser.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index f82fedd29007..11cb402d3465 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -222,7 +222,7 @@ void fixup_lower_ownership(struct dentry *dentry, const char *name) break; case PERM_ANDROID_PACKAGE_CACHE: if (info->d_uid != 0) - gid = multiuser_get_cache_gid(info->d_uid); + gid = multiuser_get_ext_cache_gid(info->d_uid); else gid = multiuser_get_uid(info->userid, uid); break; diff --git a/fs/sdcardfs/multiuser.h b/fs/sdcardfs/multiuser.h index 2e89b5872314..d0c925cda299 100644 --- a/fs/sdcardfs/multiuser.h +++ 
b/fs/sdcardfs/multiuser.h @@ -23,6 +23,8 @@ #define AID_APP_END 19999 /* last app user */ #define AID_CACHE_GID_START 20000 /* start of gids for apps to mark cached data */ #define AID_EXT_GID_START 30000 /* start of gids for apps to mark external data */ +#define AID_EXT_CACHE_GID_START 40000 /* start of gids for apps to mark external cached data */ +#define AID_EXT_CACHE_GID_END 49999 /* end of gids for apps to mark external cached data */ #define AID_SHARED_GID_START 50000 /* start of gids for apps in each user to share */ typedef uid_t userid_t; @@ -33,9 +35,9 @@ static inline uid_t multiuser_get_uid(userid_t user_id, appid_t app_id) return (user_id * AID_USER_OFFSET) + (app_id % AID_USER_OFFSET); } -static inline gid_t multiuser_get_cache_gid(uid_t uid) +static inline gid_t multiuser_get_ext_cache_gid(uid_t uid) { - return uid - AID_APP_START + AID_CACHE_GID_START; + return uid - AID_APP_START + AID_EXT_CACHE_GID_START; } static inline gid_t multiuser_get_ext_gid(uid_t uid) From 6f28e6ebccbaeee26ee72669215cf1e4d0d79537 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 18 Apr 2017 12:45:48 -0700 Subject: [PATCH 0765/3220] ANDROID: sdcardfs: ->iget fixes Adapted from wrapfs commit 8c49eaa0sb9c ("Wrapfs: ->iget fixes") Change where we igrab/iput to ensure we always hold a valid lower_inode. Return ENOMEM (not EACCES) if iget5_locked returns NULL. Signed-off-by: Erez Zadok Signed-off-by: Daniel Rosenberg Bug: 35766959 Change-Id: Id8d4e0c0cbc685a0a77685ce73c923e9a3ddc094 --- fs/sdcardfs/lookup.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index f10f7752f514..a0f221501b4c 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -91,7 +91,9 @@ struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode, u struct sdcardfs_inode_info *info; struct inode_data data; struct inode *inode; /* the new inode to return */ - int err; + + if (!igrab(lower_inode)) + return ERR_PTR(-ESTALE); data.id = id; data.lower_inode = lower_inode; @@ -106,22 +108,19 @@ struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode, u sdcardfs_inode_set, /* inode init function */ &data); /* data passed to test+set fxns */ if (!inode) { - err = -EACCES; iput(lower_inode); - return ERR_PTR(err); + return ERR_PTR(-ENOMEM); } - /* if found a cached inode, then just return it */ - if (!(inode->i_state & I_NEW)) + /* if found a cached inode, then just return it (after iput) */ + if (!(inode->i_state & I_NEW)) { + iput(lower_inode); return inode; + } /* initialize new inode */ info = SDCARDFS_I(inode); inode->i_ino = lower_inode->i_ino; - if (!igrab(lower_inode)) { - err = -ESTALE; - return ERR_PTR(err); - } sdcardfs_set_lower_inode(inode, lower_inode); inode->i_version++; From e92f72194da2e690d85fc736661d3f0d96825c57 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 18 Apr 2017 22:25:15 -0700 Subject: [PATCH 0766/3220] Android: sdcardfs: Don't do d_add for lower fs For file based encryption, ext4 explicitly does not create negative dentries for encrypted files. If you force one over it, the decrypted file will be hidden until the cache is cleared. Instead, just fail out. 
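To make the failure mode concrete, the sequence being removed looks roughly
like this (a sketch of the old lookup path, not the verbatim hunk, which
follows below):

    /* Sketch: fabricating a hashed negative dentry in the lower fs. */
    lower_dentry = d_lookup(lower_dir_dentry, &dname);
    if (!lower_dentry) {
            lower_dentry = d_alloc(lower_dir_dentry, &dname); /* may fail */
            d_add(lower_dentry, NULL); /* hash it with no inode: "negative" */
    }

Once such a negative dentry is hashed over an encrypted name, later lookups
hit it in the dcache, so the decrypted file stays hidden until the entry is
evicted; the change below fails the lookup with -ENOENT instead.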
Signed-off-by: Daniel Rosenberg Bug: 37231161 Change-Id: Id2a9708dfa75e1c22f89915c529789caadd2ca4b --- fs/sdcardfs/lookup.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index a0f221501b4c..08f62fb3e075 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -368,17 +368,15 @@ put_name: dname.len = name->len; dname.hash = full_name_hash(dname.name, dname.len); lower_dentry = d_lookup(lower_dir_dentry, &dname); - if (lower_dentry) - goto setup_lower; - - lower_dentry = d_alloc(lower_dir_dentry, &dname); if (!lower_dentry) { - err = -ENOMEM; + /* We called vfs_path_lookup earlier, and did not get a negative + * dentry then. Don't confuse the lower filesystem by forcing one + * on it now... + */ + err = -ENOENT; goto out; } - d_add(lower_dentry, NULL); /* instantiate and hash */ -setup_lower: lower_path.dentry = lower_dentry; lower_path.mnt = mntget(lower_dir_mnt); sdcardfs_set_lower_path(dentry, &lower_path); From d04f4c7320bab619062472469d149387d20095f9 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 18 Apr 2017 22:49:38 -0700 Subject: [PATCH 0767/3220] Android: sdcardfs: Don't complain in fixup_lower_ownership Not all filesystems support changing the owner of a file. We shouldn't complain if it doesn't happen. Signed-off-by: Daniel Rosenberg Bug: 37488099 Change-Id: I403e44ab7230f176e6df82f6adb4e5c82ce57f33 --- fs/sdcardfs/derived_perm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 11cb402d3465..cddfc79ea365 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -252,7 +252,7 @@ retry_deleg: goto retry_deleg; } if (error) - pr_err("sdcardfs: Failed to touch up lower fs gid/uid.\n"); + pr_debug("sdcardfs: Failed to touch up lower fs gid/uid for %s\n", name); } sdcardfs_put_lower_path(dentry, &path); } From 0ed8679d0dbb02598c60cd4bd933f911bd636d01 Mon Sep 17 00:00:00 2001 From: Jiebing Li Date: Tue, 10 Mar 2015 11:24:00 +0800 Subject: [PATCH 0768/3220] ANDROID: usb: gadget: fix MTP enumeration issue under super speed mode MTP function doesn't show as a drive in Windows when the device is connected to PC's USB3 port, because device fails to respond ACK to BULK OUT transfer request. This patch modifies MTP OUT request length as multiple of MaxPacketSize per databook requirement in order to fix this issue. 
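The databook requirement amounts to rounding the OUT request length up to a
multiple of the endpoint's wMaxPacketSize, which is what usb_ep_align_maybe()
does for controllers that set quirk_ep_out_aligned_size. A stand-alone
illustration of the arithmetic (example helper, not part of the driver):

    /* Round len up to a multiple of the endpoint's wMaxPacketSize. */
    static size_t align_to_maxpacket(size_t len, unsigned int maxpacket)
    {
            return ((len + maxpacket - 1) / maxpacket) * maxpacket;
    }

For example, a 1000-byte read on a 512-byte bulk OUT endpoint now queues a
1024-byte request instead of a 1000-byte one.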
Patchset: mtp Change-Id: I090d7880ff00c499dc5ba7fd644b1fe7cd87fcb5 Signed-off-by: Jiebing Li Signed-off-by: Wang, Yu Signed-off-by: Russ Weight --- drivers/usb/gadget/function/f_mtp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c index d8b69af6e335..b37cc8d87ca4 100644 --- a/drivers/usb/gadget/function/f_mtp.c +++ b/drivers/usb/gadget/function/f_mtp.c @@ -540,10 +540,12 @@ static ssize_t mtp_read(struct file *fp, char __user *buf, ssize_t r = count; unsigned xfer; int ret = 0; + size_t len; DBG(cdev, "mtp_read(%zu)\n", count); - if (count > MTP_BULK_BUFFER_SIZE) + len = usb_ep_align_maybe(cdev->gadget, dev->ep_out, count); + if (len > MTP_BULK_BUFFER_SIZE) return -EINVAL; /* we will block until we're online */ @@ -567,7 +569,7 @@ static ssize_t mtp_read(struct file *fp, char __user *buf, requeue_req: /* queue a request */ req = dev->rx_req[0]; - req->length = count; + req->length = len; dev->rx_done = 0; ret = usb_ep_queue(dev->ep_out, req, GFP_KERNEL); if (ret < 0) { From 53491d941217d00fdddda6b8b755ced36676f079 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 20 Apr 2017 12:13:31 -0700 Subject: [PATCH 0769/3220] Revert "Android: sdcardfs: Don't do d_add for lower fs" This reverts commit e92f72194da2e690d85fc736661d3f0d96825c57. This change caused issues for sdcardfs on top of vfat Signed-off-by: Daniel Rosenberg Change-Id: Ife2ac6d9af40e4ddb95b7261e1dad4d7817e3779 --- fs/sdcardfs/lookup.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 08f62fb3e075..a0f221501b4c 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -368,15 +368,17 @@ put_name: dname.len = name->len; dname.hash = full_name_hash(dname.name, dname.len); lower_dentry = d_lookup(lower_dir_dentry, &dname); + if (lower_dentry) + goto setup_lower; + + lower_dentry = d_alloc(lower_dir_dentry, &dname); if (!lower_dentry) { - /* We called vfs_path_lookup earlier, and did not get a negative - * dentry then. Don't confuse the lower filesystem by forcing one - * on it now... - */ - err = -ENOENT; + err = -ENOMEM; goto out; } + d_add(lower_dentry, NULL); /* instantiate and hash */ +setup_lower: lower_path.dentry = lower_dentry; lower_path.mnt = mntget(lower_dir_mnt); sdcardfs_set_lower_path(dentry, &lower_path); From 738e1a1a468217bd5cb3fdc19582d4a1778bb510 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Apr 2016 10:40:55 +0200 Subject: [PATCH 0770/3220] UPSTREAM: char: Drop bogus dependency of DEVPORT on !M68K (cherry pick from commit 309124e2648d668a0c23539c5078815660a4a850) According to full-history-linux commit d3794f4fa7c3edc3 ("[PATCH] M68k update (part 25)"), port operations are allowed on m68k if CONFIG_ISA is defined. However, commit 153dcc54df826d2f ("[PATCH] mem driver: fix conditional on isa i/o support") accidentally changed an "||" into an "&&", disabling it completely on m68k. This logic was retained when introducing the DEVPORT symbol in commit 4f911d64e04a44c4 ("Make /dev/port conditional on config symbol"). Drop the bogus dependency on !M68K to fix this. 
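Schematically, the conditional regression described above is the difference
between the following two forms (placeholder CONFIG names, not the original
mem.c text):

    /* Intended: either condition is enough to provide port I/O. */
    #if defined(CONFIG_COND_A) || defined(CONFIG_COND_B)
    #define HAVE_PORT_IO 1
    #endif

    /* The accidental "&&": both are required, never true on m68k. */
    #if defined(CONFIG_COND_A) && defined(CONFIG_COND_B)
    #define HAVE_PORT_IO 1
    #endif

The !M68K dependency removed below carried that mistaken result forward into
Kconfig.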
Fixes: 153dcc54df826d2f ("[PATCH] mem driver: fix conditional on isa i/o support")
Signed-off-by: Geert Uytterhoeven
Tested-by: Al Stone
Signed-off-by: Greg Kroah-Hartman
Bug: 37210310
Bug: 36604779
Change-Id: I9139bd8a5a6e9e39c2e428bde23a7d9be07e2f91
---
 drivers/char/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index a043107da2af..b130d38cf55c 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -584,7 +584,6 @@ config TELCLOCK

config DEVPORT
bool
- depends on !M68K
depends on ISA || PCI
default y

From c0fb2f9e5814e4cf8d6e64c00bdacdbf1668ab62 Mon Sep 17 00:00:00 2001
From: Max Bires
Date: Tue, 3 Jan 2017 08:18:07 -0800
Subject: [PATCH 0771/3220] UPSTREAM: char: lack of bool string made
 CONFIG_DEVPORT always on

(cherry pick from commit f2cfa58b136e4b06a9b9db7af5ef62fbb5992f62)

Without a bool string present, using "# CONFIG_DEVPORT is not set" in
defconfig files would not actually unset devport. This ensured that
/dev/port was always on, but there are reasons a user may wish to
disable it (smaller kernel, attack surface reduction) if it's not being
used.

Adding a message here in order to make this option user-visible.

Signed-off-by: Max Bires
Acked-by: Arnd Bergmann
Signed-off-by: Greg Kroah-Hartman
Bug: 37210310
Bug: 36604779
Change-Id: Ib1e947526f6c6f7cdf6389923287631056f32c36
---
 drivers/char/Kconfig | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index b130d38cf55c..3143db57ce44 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -583,9 +583,12 @@ config TELCLOCK
controlling the behavior of this hardware.

config DEVPORT
- bool
+ bool "/dev/port character device"
depends on ISA || PCI
default y
+ help
+ Say Y here if you want to support the /dev/port device. The /dev/port
+ device is similar to /dev/mem, but for I/O ports.

source "drivers/s390/char/Kconfig"

From ebff0104446a71da950709b94cd2d8d2c10c73be Mon Sep 17 00:00:00 2001
From: Jin Qian
Date: Thu, 13 Apr 2017 17:07:58 -0700
Subject: [PATCH 0772/3220] ANDROID: uid_sys_stats: reduce update_io_stats
 overhead

Replaced read_lock with rcu_read_lock to reduce time that preemption is
disabled. Added a function to update io stats for a specific uid and
moved hash table lookup, user_namespace out of loops.
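For background, the task list is RCU-protected, so a reader that only needs a
consistent snapshot can walk all threads under rcu_read_lock() without
blocking writers or holding a spinning lock; the pattern is roughly:

    /* Sketch of the read-side pattern the patch switches to. */
    rcu_read_lock();
    do_each_thread(temp, task) {
            /* read-only inspection of *task is safe here */
    } while_each_thread(temp, task);
    rcu_read_unlock();

read_lock(&tasklist_lock), by contrast, keeps preemption disabled for the
whole scan and delays writers such as fork and exit.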
Bug: 37319300 Change-Id: I2b81b5cd3b6399b40d08c3c14b42cad044556970 Signed-off-by: Jin Qian --- drivers/misc/uid_sys_stats.c | 61 ++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 10 deletions(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 204b23484266..618617f3e867 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -237,28 +237,28 @@ static void clean_uid_io_last_stats(struct uid_entry *uid_entry, io_last->fsync -= task->ioac.syscfs; } -static void update_io_stats_locked(void) +static void update_io_stats_all_locked(void) { struct uid_entry *uid_entry; struct task_struct *task, *temp; struct io_stats *io_bucket, *io_curr, *io_last; + struct user_namespace *user_ns = current_user_ns(); unsigned long bkt; - - BUG_ON(!rt_mutex_is_locked(&uid_lock)); + uid_t uid; hash_for_each(hash_table, bkt, uid_entry, hash) memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, sizeof(struct io_stats)); - read_lock(&tasklist_lock); + rcu_read_lock(); do_each_thread(temp, task) { - uid_entry = find_or_register_uid(from_kuid_munged( - current_user_ns(), task_uid(task))); + uid = from_kuid_munged(user_ns, task_uid(task)); + uid_entry = find_or_register_uid(uid); if (!uid_entry) continue; add_uid_io_curr_stats(uid_entry, task); } while_each_thread(temp, task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); hash_for_each(hash_table, bkt, uid_entry, hash) { io_bucket = &uid_entry->io[uid_entry->state]; @@ -281,6 +281,47 @@ static void update_io_stats_locked(void) } } +static void update_io_stats_uid_locked(uid_t target_uid) +{ + struct uid_entry *uid_entry; + struct task_struct *task, *temp; + struct io_stats *io_bucket, *io_curr, *io_last; + struct user_namespace *user_ns = current_user_ns(); + + uid_entry = find_or_register_uid(target_uid); + if (!uid_entry) + return; + + memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, + sizeof(struct io_stats)); + + rcu_read_lock(); + do_each_thread(temp, task) { + if (from_kuid_munged(user_ns, task_uid(task)) != target_uid) + continue; + add_uid_io_curr_stats(uid_entry, task); + } while_each_thread(temp, task); + rcu_read_unlock(); + + io_bucket = &uid_entry->io[uid_entry->state]; + io_curr = &uid_entry->io[UID_STATE_TOTAL_CURR]; + io_last = &uid_entry->io[UID_STATE_TOTAL_LAST]; + + io_bucket->read_bytes += + io_curr->read_bytes - io_last->read_bytes; + io_bucket->write_bytes += + io_curr->write_bytes - io_last->write_bytes; + io_bucket->rchar += io_curr->rchar - io_last->rchar; + io_bucket->wchar += io_curr->wchar - io_last->wchar; + io_bucket->fsync += io_curr->fsync - io_last->fsync; + + io_last->read_bytes = io_curr->read_bytes; + io_last->write_bytes = io_curr->write_bytes; + io_last->rchar = io_curr->rchar; + io_last->wchar = io_curr->wchar; + io_last->fsync = io_curr->fsync; +} + static int uid_io_show(struct seq_file *m, void *v) { struct uid_entry *uid_entry; @@ -288,7 +329,7 @@ static int uid_io_show(struct seq_file *m, void *v) rt_mutex_lock(&uid_lock); - update_io_stats_locked(); + update_io_stats_all_locked(); hash_for_each(hash_table, bkt, uid_entry, hash) { seq_printf(m, "%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", @@ -363,7 +404,7 @@ static ssize_t uid_procstat_write(struct file *file, return count; } - update_io_stats_locked(); + update_io_stats_uid_locked(uid); uid_entry->state = state; @@ -401,7 +442,7 @@ static int process_notifier(struct notifier_block *self, uid_entry->utime += utime; uid_entry->stime += stime; - update_io_stats_locked(); + 
update_io_stats_uid_locked(uid); clean_uid_io_last_stats(uid_entry, task); exit: From b834e929774513db929303125486c2c34bedba73 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 24 Apr 2017 15:50:31 +0530 Subject: [PATCH 0773/3220] Revert "USB: gadget: u_ether: Fix data stall issue in RNDIS tethering mode" This reverts commit 78281f6ed79413e5a16eac6edc4a72c6836ae5a9. This data stall fix is no longer required in AOSP. It is already skipped in android-4.9 patchset. Also core change from this data stall fix is already undone by android-4.4 merge commit 324e88de4aba ("Merge tag 'v4.4.32' into android-4.4.y"). This revert patch just clean up the left overs. It also reverts the compile fix from Change-Id: I38c4f4a850b0329fb4a06b2c7e45558e16d66151 40ceb2c69964 ("usb: gadget: Fix compilation problem with tx_qlen field"). Signed-off-by: Amit Pundir --- drivers/usb/gadget/function/u_ether.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index cc5210a87614..21bf0a8423d5 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -66,7 +66,7 @@ struct eth_dev { spinlock_t req_lock; /* guard {rx,tx}_reqs */ struct list_head tx_reqs, rx_reqs; - unsigned tx_qlen; + atomic_t tx_qlen; /* Minimum number of TX USB request queued to UDC */ #define TX_REQ_THRESHOLD 5 int no_tx_req_used; @@ -568,6 +568,7 @@ static void tx_complete(struct usb_ep *ep, struct usb_request *req) dev_kfree_skb_any(skb); } + atomic_dec(&dev->tx_qlen); if (netif_carrier_ok(dev->net)) netif_wake_queue(dev->net); } @@ -746,7 +747,7 @@ static netdev_tx_t eth_start_xmit(struct sk_buff *skb, req->no_interrupt = (((dev->gadget->speed == USB_SPEED_HIGH || dev->gadget->speed == USB_SPEED_SUPER)) && !list_empty(&dev->tx_reqs)) - ? ((dev->tx_qlen % dev->qmult) != 0) + ? ((atomic_read(&dev->tx_qlen) % dev->qmult) != 0) : 0; retval = usb_ep_queue(in, req, GFP_ATOMIC); @@ -756,6 +757,7 @@ static netdev_tx_t eth_start_xmit(struct sk_buff *skb, break; case 0: net->trans_start = jiffies; + atomic_inc(&dev->tx_qlen); } if (retval) { @@ -784,7 +786,7 @@ static void eth_start(struct eth_dev *dev, gfp_t gfp_flags) rx_fill(dev, gfp_flags); /* and open the tx floodgates */ - dev->tx_qlen = 0; + atomic_set(&dev->tx_qlen, 0); netif_wake_queue(dev->net); } From d77312aeb21d93c3547f20b8cc09916daa928acc Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Tue, 25 Apr 2017 18:30:45 +0000 Subject: [PATCH 0774/3220] Revert "[RFC]cgroup: Change from CAP_SYS_NICE to CAP_SYS_RESOURCE for cgroup migration permissions" This reverts commit 64f0245f312a849acc83b7c6cdcc8f7c1a14cca3. Needs to be reverted since system_server no longer has CAP_SYS_RESOURCE. 
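Stated positively, the migration rule the hunk below restores is equivalent
to the following (an illustrative paraphrase of the diff, not kernel code):

    /* A writer may move a task into a cgroup if any of these hold. */
    static bool may_migrate(const struct cred *cred, const struct cred *tcred)
    {
            return uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
                   uid_eq(cred->euid, tcred->uid) ||
                   uid_eq(cred->euid, tcred->suid) ||
                   ns_capable(tcred->user_ns, CAP_SYS_NICE);
    }

That is: root, a matching real or saved uid, or CAP_SYS_NICE over the
target's user namespace; the revert only swaps the capability back from
CAP_SYS_RESOURCE.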
Change-Id: Ic500cffb14b80a3e2a5dd41fda0b0b781b5245d6
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 253fdeabf667..8ab133d96435 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2689,7 +2689,7 @@ static int cgroup_procs_write_permission(struct task_struct *task,
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
!uid_eq(cred->euid, tcred->suid) &&
- !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
+ !ns_capable(tcred->user_ns, CAP_SYS_NICE))
ret = -EACCES;

if (!ret && cgroup_on_dfl(dst_cgrp)) {

From 6a6a7657c231e947233c43ae0522bbd4edf0139e Mon Sep 17 00:00:00 2001
From: David Zeuthen
Date: Tue, 24 Jan 2017 13:02:35 -0500
Subject: [PATCH 0775/3220] ANDROID: Update init/do_mounts_dm.c to the latest
 ChromiumOS version.

This is needed for AVB integration work.

Bug: 31796270
Test: Manually tested (other arch).
Change-Id: I32fd37c1578c6414e3e6ff277d16ad94df7886b8
Signed-off-by: David Zeuthen
---
 include/linux/device-mapper.h | 7 +
 init/do_mounts_dm.c | 562 ++++++++++++++++++----------------
 2 files changed, 310 insertions(+), 259 deletions(-)

diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index b874d5b61ffc..f3f87db34429 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -415,6 +415,13 @@ union map_info *dm_get_rq_mapinfo(struct request *rq);

struct queue_limits *dm_get_queue_limits(struct mapped_device *md);

+void dm_lock_md_type(struct mapped_device *md);
+void dm_unlock_md_type(struct mapped_device *md);
+void dm_set_md_type(struct mapped_device *md, unsigned type);
+unsigned dm_get_md_type(struct mapped_device *md);
+int dm_setup_md_queue(struct mapped_device *md);
+unsigned dm_table_get_type(struct dm_table *t);
+
/*
 * Geometry functions.
 */
diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c
index ecda58df9a19..bce1c2fbb915 100644
--- a/init/do_mounts_dm.c
+++ b/init/do_mounts_dm.c
@@ -5,13 +5,17 @@
 *
 * This file is released under the GPL.
 */
+#include <linux/async.h>
+#include <linux/ctype.h>
 #include <linux/device-mapper.h>
 #include <linux/fs.h>
 #include <linux/kernel.h>
+#include <linux/list.h>

 #include "do_mounts.h"
-#include "../drivers/md/dm.h"

+#define DM_MAX_DEVICES 256
+#define DM_MAX_TARGETS 256
 #define DM_MAX_NAME 32
 #define DM_MAX_UUID 129
 #define DM_NO_UUID "none"
@@ -19,14 +23,47 @@
 #define DM_MSG_PREFIX "init"

 /* Separators used for parsing the dm= argument. */
-#define DM_FIELD_SEP ' '
-#define DM_LINE_SEP ','
+#define DM_FIELD_SEP " "
+#define DM_LINE_SEP ","
+#define DM_ANY_SEP DM_FIELD_SEP DM_LINE_SEP

 /*
 * When the device-mapper and any targets are compiled into the kernel
- * (not a module), one target may be created and used as the root device at
- * boot time with the parameters given with the boot line dm=...
- * The code for that is here.
+ * (not a module), one or more device-mappers may be created and used
+ * as the root device at boot time with the parameters given with the
+ * boot line dm=...
+ *
+ * Multiple device-mappers can be stacked by specifying the number of
+ * devices. A device can have multiple targets if the number of
+ * targets is specified.
+ *
+ * TODO(taysom:defect 32847)
+ * In the future, the <num> field will be mandatory.
+ *
+ * <device>        ::= [<num>] <device-mapper>+
+ * <device-mapper> ::= <head> "," <target>+
+ * <head>          ::= <name> <uuid> <mode> [<num>]
+ * <target>        ::= <start> <length> <type> <options> ","
+ * <mode>          ::= "ro" | "rw"
+ * <uuid>          ::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | "none"
+ * <type>          ::= "verity" | "bootcache" | ...
+ * + * Example: + * 2 vboot none ro 1, + * 0 1768000 bootcache + * device=aa55b119-2a47-8c45-946a-5ac57765011f+1 + * signature=76e9be054b15884a9fa85973e9cb274c93afadb6 + * cache_start=1768000 max_blocks=100000 size_limit=23 max_trace=20000, + * vroot none ro 1, + * 0 1740800 verity payload=254:0 hashtree=254:0 hashstart=1740800 alg=sha1 + * root_hexdigest=76e9be054b15884a9fa85973e9cb274c93afadb6 + * salt=5b3549d54d6c7a3837b9b81ed72e49463a64c03680c47835bef94d768e5646fe + * + * Notes: + * 1. uuid is a label for the device and we set it to "none". + * 2. The field will be optional initially and assumed to be 1. + * Once all the scripts that set these fields have been set, it will + * be made mandatory. */ struct dm_setup_target { @@ -38,381 +75,388 @@ struct dm_setup_target { struct dm_setup_target *next; }; -static struct { +struct dm_device { int minor; int ro; char name[DM_MAX_NAME]; char uuid[DM_MAX_UUID]; - char *targets; + unsigned long num_targets; struct dm_setup_target *target; int target_count; + struct dm_device *next; +}; + +struct dm_option { + char *start; + char *next; + size_t len; + char delim; +}; + +static struct { + unsigned long num_devices; + char *str; } dm_setup_args __initdata; static __initdata int dm_early_setup; -static size_t __init get_dm_option(char *str, char **next, char sep) +static int __init get_dm_option(struct dm_option *opt, const char *accept) { - size_t len = 0; - char *endp = NULL; + char *str = opt->next; + char *endp; if (!str) return 0; - endp = strchr(str, sep); + str = skip_spaces(str); + opt->start = str; + endp = strpbrk(str, accept); if (!endp) { /* act like strchrnul */ - len = strlen(str); - endp = str + len; + opt->len = strlen(str); + endp = str + opt->len; } else { - len = endp - str; + opt->len = endp - str; } - - if (endp == str) - return 0; - - if (!next) - return len; - + opt->delim = *endp; if (*endp == 0) { /* Don't advance past the nul. 
*/ - *next = endp; + opt->next = endp; } else { - *next = endp + 1; + opt->next = endp + 1; } - return len; + return opt->len != 0; } -static int __init dm_setup_args_init(void) +static int __init dm_setup_cleanup(struct dm_device *devices) { - dm_setup_args.minor = 0; - dm_setup_args.ro = 0; - dm_setup_args.target = NULL; - dm_setup_args.target_count = 0; + struct dm_device *dev = devices; + + while (dev) { + struct dm_device *old_dev = dev; + struct dm_setup_target *target = dev->target; + while (target) { + struct dm_setup_target *old_target = target; + kfree(target->type); + kfree(target->params); + target = target->next; + kfree(old_target); + dev->target_count--; + } + BUG_ON(dev->target_count); + dev = dev->next; + kfree(old_dev); + } return 0; } -static int __init dm_setup_cleanup(void) +static char * __init dm_parse_device(struct dm_device *dev, char *str) { - struct dm_setup_target *target = dm_setup_args.target; - struct dm_setup_target *old_target = NULL; - while (target) { - kfree(target->type); - kfree(target->params); - old_target = target; - target = target->next; - kfree(old_target); - dm_setup_args.target_count--; - } - BUG_ON(dm_setup_args.target_count); - return 0; -} - -static char * __init dm_setup_parse_device_args(char *str) -{ - char *next = NULL; - size_t len = 0; + struct dm_option opt; + size_t len; /* Grab the logical name of the device to be exported to udev */ - len = get_dm_option(str, &next, DM_FIELD_SEP); - if (!len) { + opt.next = str; + if (!get_dm_option(&opt, DM_FIELD_SEP)) { DMERR("failed to parse device name"); goto parse_fail; } - len = min(len + 1, sizeof(dm_setup_args.name)); - strlcpy(dm_setup_args.name, str, len); /* includes nul */ - str = skip_spaces(next); + len = min(opt.len + 1, sizeof(dev->name)); + strlcpy(dev->name, opt.start, len); /* includes nul */ /* Grab the UUID value or "none" */ - len = get_dm_option(str, &next, DM_FIELD_SEP); - if (!len) { + if (!get_dm_option(&opt, DM_FIELD_SEP)) { DMERR("failed to parse device uuid"); goto parse_fail; } - len = min(len + 1, sizeof(dm_setup_args.uuid)); - strlcpy(dm_setup_args.uuid, str, len); - str = skip_spaces(next); + len = min(opt.len + 1, sizeof(dev->uuid)); + strlcpy(dev->uuid, opt.start, len); /* Determine if the table/device will be read only or read-write */ - if (!strncmp("ro,", str, 3)) { - dm_setup_args.ro = 1; - } else if (!strncmp("rw,", str, 3)) { - dm_setup_args.ro = 0; + get_dm_option(&opt, DM_ANY_SEP); + if (!strncmp("ro", opt.start, opt.len)) { + dev->ro = 1; + } else if (!strncmp("rw", opt.start, opt.len)) { + dev->ro = 0; } else { DMERR("failed to parse table mode"); goto parse_fail; } - str = skip_spaces(str + 3); - return str; + /* Optional number field */ + /* XXX: The field will be mandatory in the next round */ + if (opt.delim == DM_FIELD_SEP[0]) { + if (!get_dm_option(&opt, DM_LINE_SEP)) + return NULL; + dev->num_targets = simple_strtoul(opt.start, NULL, 10); + } else { + dev->num_targets = 1; + } + if (dev->num_targets > DM_MAX_TARGETS) { + DMERR("too many targets %lu > %d", + dev->num_targets, DM_MAX_TARGETS); + } + return opt.next; parse_fail: return NULL; } -static void __init dm_substitute_devices(char *str, size_t str_len) +static char * __init dm_parse_targets(struct dm_device *dev, char *str) { - char *candidate = str; - char *candidate_end = str; - char old_char; - size_t len = 0; - dev_t dev; - - if (str_len < 3) - return; - - while (str && *str) { - candidate = strchr(str, '/'); - if (!candidate) - break; - - /* Avoid embedded slashes */ - if (candidate != 
str && *(candidate - 1) != DM_FIELD_SEP) { - str = strchr(candidate, DM_FIELD_SEP); - continue; - } - - len = get_dm_option(candidate, &candidate_end, DM_FIELD_SEP); - str = skip_spaces(candidate_end); - if (len < 3 || len > 37) /* name_to_dev_t max; maj:mix min */ - continue; - - /* Temporarily terminate with a nul */ - if (*candidate_end) - candidate_end--; - old_char = *candidate_end; - *candidate_end = '\0'; - - DMDEBUG("converting candidate device '%s' to dev_t", candidate); - /* Use the boot-time specific device naming */ - dev = name_to_dev_t(candidate); - *candidate_end = old_char; - - DMDEBUG(" -> %u", dev); - /* No suitable replacement found */ - if (!dev) - continue; - - /* Rewrite the /dev/path as a major:minor */ - len = snprintf(candidate, len, "%u:%u", MAJOR(dev), MINOR(dev)); - if (!len) { - DMERR("error substituting device major/minor."); - break; - } - candidate += len; - /* Pad out with spaces (fixing our nul) */ - while (candidate < candidate_end) - *(candidate++) = DM_FIELD_SEP; - } -} - -static int __init dm_setup_parse_targets(char *str) -{ - char *next = NULL; - size_t len = 0; - struct dm_setup_target **target = NULL; + struct dm_option opt; + struct dm_setup_target **target = &dev->target; + unsigned long num_targets = dev->num_targets; + unsigned long i; /* Targets are defined as per the table format but with a * comma as a newline separator. */ - target = &dm_setup_args.target; - while (str && *str) { + opt.next = str; + for (i = 0; i < num_targets; i++) { *target = kzalloc(sizeof(struct dm_setup_target), GFP_KERNEL); if (!*target) { - DMERR("failed to allocate memory for target %d", - dm_setup_args.target_count); + DMERR("failed to allocate memory for target %s<%ld>", + dev->name, i); goto parse_fail; } - dm_setup_args.target_count++; + dev->target_count++; - (*target)->begin = simple_strtoull(str, &next, 10); - if (!next || *next != DM_FIELD_SEP) { - DMERR("failed to parse starting sector for target %d", - dm_setup_args.target_count - 1); + if (!get_dm_option(&opt, DM_FIELD_SEP)) { + DMERR("failed to parse starting sector" + " for target %s<%ld>", dev->name, i); goto parse_fail; } - str = skip_spaces(next + 1); + (*target)->begin = simple_strtoull(opt.start, NULL, 10); - (*target)->length = simple_strtoull(str, &next, 10); - if (!next || *next != DM_FIELD_SEP) { - DMERR("failed to parse length for target %d", - dm_setup_args.target_count - 1); + if (!get_dm_option(&opt, DM_FIELD_SEP)) { + DMERR("failed to parse length for target %s<%ld>", + dev->name, i); goto parse_fail; } - str = skip_spaces(next + 1); + (*target)->length = simple_strtoull(opt.start, NULL, 10); - len = get_dm_option(str, &next, DM_FIELD_SEP); - if (!len || - !((*target)->type = kstrndup(str, len, GFP_KERNEL))) { - DMERR("failed to parse type for target %d", - dm_setup_args.target_count - 1); + if (get_dm_option(&opt, DM_FIELD_SEP)) + (*target)->type = kstrndup(opt.start, opt.len, + GFP_KERNEL); + if (!((*target)->type)) { + DMERR("failed to parse type for target %s<%ld>", + dev->name, i); goto parse_fail; } - str = skip_spaces(next); - - len = get_dm_option(str, &next, DM_LINE_SEP); - if (!len || - !((*target)->params = kstrndup(str, len, GFP_KERNEL))) { - DMERR("failed to parse params for target %d", - dm_setup_args.target_count - 1); + if (get_dm_option(&opt, DM_LINE_SEP)) + (*target)->params = kstrndup(opt.start, opt.len, + GFP_KERNEL); + if (!((*target)->params)) { + DMERR("failed to parse params for target %s<%ld>", + dev->name, i); goto parse_fail; } - str = skip_spaces(next); - - /* 
Before moving on, walk through the copied target and - * attempt to replace all /dev/xxx with the major:minor number. - * It may not be possible to resolve them traditionally at - * boot-time. */ - dm_substitute_devices((*target)->params, len); - target = &((*target)->next); } - DMDEBUG("parsed %d targets", dm_setup_args.target_count); + DMDEBUG("parsed %d targets", dev->target_count); - return 0; + return opt.next; parse_fail: - return 1; + return NULL; +} + +static struct dm_device * __init dm_parse_args(void) +{ + struct dm_device *devices = NULL; + struct dm_device **tail = &devices; + struct dm_device *dev; + char *str = dm_setup_args.str; + unsigned long num_devices = dm_setup_args.num_devices; + unsigned long i; + + if (!str) + return NULL; + for (i = 0; i < num_devices; i++) { + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + DMERR("failed to allocated memory for dev"); + goto error; + } + *tail = dev; + tail = &dev->next; + /* + * devices are given minor numbers 0 - n-1 + * in the order they are found in the arg + * string. + */ + dev->minor = i; + str = dm_parse_device(dev, str); + if (!str) /* NULL indicates error in parsing, bail */ + goto error; + + str = dm_parse_targets(dev, str); + if (!str) + goto error; + } + return devices; +error: + dm_setup_cleanup(devices); + return NULL; } /* * Parse the command-line parameters given our kernel, but do not * actually try to invoke the DM device now; that is handled by - * dm_setup_drive after the low-level disk drivers have initialised. - * dm format is as follows: - * dm="name uuid fmode,[table line 1],[table line 2],..." - * May be used with root=/dev/dm-0 as it always uses the first dm minor. + * dm_setup_drives after the low-level disk drivers have initialised. + * dm format is described at the top of the file. + * + * Because dm minor numbers are assigned in assending order starting with 0, + * You can assume the first device is /dev/dm-0, the next device is /dev/dm-1, + * and so forth. */ - static int __init dm_setup(char *str) { - dm_setup_args_init(); + struct dm_option opt; + unsigned long num_devices; - str = dm_setup_parse_device_args(str); if (!str) { DMDEBUG("str is NULL"); goto parse_fail; } - - /* Target parsing is delayed until we have dynamic memory */ - dm_setup_args.targets = str; - - printk(KERN_INFO "dm: will configure '%s' on dm-%d\n", - dm_setup_args.name, dm_setup_args.minor); - + opt.next = str; + if (!get_dm_option(&opt, DM_FIELD_SEP)) + goto parse_fail; + if (isdigit(opt.start[0])) { /* XXX: Optional number field */ + num_devices = simple_strtoul(opt.start, NULL, 10); + str = opt.next; + } else { + num_devices = 1; + /* Don't advance str */ + } + if (num_devices > DM_MAX_DEVICES) { + DMDEBUG("too many devices %lu > %d", + num_devices, DM_MAX_DEVICES); + } + dm_setup_args.str = str; + dm_setup_args.num_devices = num_devices; + DMINFO("will configure %lu devices", num_devices); dm_early_setup = 1; return 1; parse_fail: - printk(KERN_WARNING "dm: Invalid arguments supplied to dm=.\n"); + DMWARN("Invalid arguments supplied to dm=."); return 0; } - -static void __init dm_setup_drive(void) +static void __init dm_setup_drives(void) { struct mapped_device *md = NULL; struct dm_table *table = NULL; struct dm_setup_target *target; - char *uuid = dm_setup_args.uuid; + struct dm_device *dev; + char *uuid; fmode_t fmode = FMODE_READ; + struct dm_device *devices; - /* Finish parsing the targets. 
*/ - if (dm_setup_parse_targets(dm_setup_args.targets)) - goto parse_fail; + devices = dm_parse_args(); - if (dm_create(dm_setup_args.minor, &md)) { - DMDEBUG("failed to create the device"); - goto dm_create_fail; - } - DMDEBUG("created device '%s'", dm_device_name(md)); - - /* In addition to flagging the table below, the disk must be - * set explicitly ro/rw. */ - set_disk_ro(dm_disk(md), dm_setup_args.ro); - - if (!dm_setup_args.ro) - fmode |= FMODE_WRITE; - if (dm_table_create(&table, fmode, dm_setup_args.target_count, md)) { - DMDEBUG("failed to create the table"); - goto dm_table_create_fail; - } - - dm_lock_md_type(md); - target = dm_setup_args.target; - while (target) { - DMINFO("adding target '%llu %llu %s %s'", - (unsigned long long) target->begin, - (unsigned long long) target->length, target->type, - target->params); - if (dm_table_add_target(table, target->type, target->begin, - target->length, target->params)) { - DMDEBUG("failed to add the target to the table"); - goto add_target_fail; + for (dev = devices; dev; dev = dev->next) { + if (dm_create(dev->minor, &md)) { + DMDEBUG("failed to create the device"); + goto dm_create_fail; } - target = target->next; - } + DMDEBUG("created device '%s'", dm_device_name(md)); - if (dm_table_complete(table)) { - DMDEBUG("failed to complete the table"); - goto table_complete_fail; - } + /* + * In addition to flagging the table below, the disk must be + * set explicitly ro/rw. + */ + set_disk_ro(dm_disk(md), dev->ro); - if (dm_get_md_type(md) == DM_TYPE_NONE) { + if (!dev->ro) + fmode |= FMODE_WRITE; + if (dm_table_create(&table, fmode, dev->target_count, md)) { + DMDEBUG("failed to create the table"); + goto dm_table_create_fail; + } + + dm_lock_md_type(md); + + for (target = dev->target; target; target = target->next) { + DMINFO("adding target '%llu %llu %s %s'", + (unsigned long long) target->begin, + (unsigned long long) target->length, + target->type, target->params); + if (dm_table_add_target(table, target->type, + target->begin, + target->length, + target->params)) { + DMDEBUG("failed to add the target" + " to the table"); + goto add_target_fail; + } + } + if (dm_table_complete(table)) { + DMDEBUG("failed to complete the table"); + goto table_complete_fail; + } + + /* Suspend the device so that we can bind it to the table. */ + if (dm_suspend(md, 0)) { + DMDEBUG("failed to suspend the device pre-bind"); + goto suspend_fail; + } + + /* Initial table load: acquire type of table. */ dm_set_md_type(md, dm_table_get_type(table)); + + /* Setup md->queue to reflect md's type. */ if (dm_setup_md_queue(md)) { DMWARN("unable to set up device queue for new table."); goto setup_md_queue_fail; } - } else if (dm_get_md_type(md) != dm_table_get_type(table)) { - DMWARN("can't change device type after initial table load."); - goto setup_md_queue_fail; - } - /* Suspend the device so that we can bind it to the table. */ - if (dm_suspend(md, 0)) { - DMDEBUG("failed to suspend the device pre-bind"); - goto suspend_fail; + /* + * Bind the table to the device. This is the only way + * to associate md->map with the table and set the disk + * capacity directly. + */ + if (dm_swap_table(md, table)) { /* should return NULL. */ + DMDEBUG("failed to bind the device to the table"); + goto table_bind_fail; + } + + /* Finally, resume and the device should be ready. 
*/ + if (dm_resume(md)) { + DMDEBUG("failed to resume the device"); + goto resume_fail; + } + + /* Export the dm device via the ioctl interface */ + if (!strcmp(DM_NO_UUID, dev->uuid)) + uuid = NULL; + if (dm_ioctl_export(md, dev->name, uuid)) { + DMDEBUG("failed to export device with given" + " name and uuid"); + goto export_fail; + } + + dm_unlock_md_type(md); + + DMINFO("dm-%d is ready", dev->minor); } - - /* Bind the table to the device. This is the only way to associate - * md->map with the table and set the disk capacity directly. */ - if (dm_swap_table(md, table)) { /* should return NULL. */ - DMDEBUG("failed to bind the device to the table"); - goto table_bind_fail; - } - - /* Finally, resume and the device should be ready. */ - if (dm_resume(md)) { - DMDEBUG("failed to resume the device"); - goto resume_fail; - } - - /* Export the dm device via the ioctl interface */ - if (!strcmp(DM_NO_UUID, dm_setup_args.uuid)) - uuid = NULL; - if (dm_ioctl_export(md, dm_setup_args.name, uuid)) { - DMDEBUG("failed to export device with given name and uuid"); - goto export_fail; - } - printk(KERN_INFO "dm: dm-%d is ready\n", dm_setup_args.minor); - - dm_unlock_md_type(md); - dm_setup_cleanup(); + dm_setup_cleanup(devices); return; export_fail: resume_fail: table_bind_fail: -suspend_fail: setup_md_queue_fail: +suspend_fail: table_complete_fail: add_target_fail: dm_unlock_md_type(md); dm_table_create_fail: dm_put(md); dm_create_fail: - dm_setup_cleanup(); -parse_fail: - printk(KERN_WARNING "dm: starting dm-%d (%s) failed\n", - dm_setup_args.minor, dm_setup_args.name); + DMWARN("starting dm-%d (%s) failed", + dev->minor, dev->name); + dm_setup_cleanup(devices); } __setup("dm=", dm_setup); @@ -421,6 +465,6 @@ void __init dm_run_setup(void) { if (!dm_early_setup) return; - printk(KERN_INFO "dm: attempting early device configuration.\n"); - dm_setup_drive(); + DMINFO("attempting early device configuration."); + dm_setup_drives(); } From 8d6f006d608c3b03652fb919e496945f2d4d4f1d Mon Sep 17 00:00:00 2001 From: David Zeuthen Date: Tue, 24 Jan 2017 13:17:01 -0500 Subject: [PATCH 0776/3220] ANDROID: AVB error handler to invalidate vbmeta partition. If androidboot.vbmeta.device is set and points to a device with vbmeta magic, this header will be overwritten upon an irrecoverable dm-verity error. The side-effect of this is that the slot will fail to verify on next reboot, effectively triggering the boot loader to fallback to another slot. This work both if the vbmeta struct is at the start of a partition or if there's an AVB footer at the end. This code is based on drivers/md/dm-verity-chromeos.c from ChromiumOS. Example: [ 0.000000] Kernel command line: rootfstype=ext4 init=/init console=ttyS0,115200 androidboot.console=ttyS0 androidboot.hardware=uefi_x86_64 enforcing=0 androidboot.selinux=permissive androidboot.debuggable=1 buildvariant=eng dm="1 vroot none ro 1,0 2080496 verity 1 PARTUUID=6779df46-78f6-4c69-bf53-59bb1fbf126b PARTUUID=6779df46-78f6-4c69-bf53-59bb1fbf126b 4096 4096 260062 260062 sha1 4f76354c86e430e27426d584a726f2fbffecae32 7e4085342d634065269631ac9a199e1a43f4632c 1 ignore_zero_blocks" root=0xfd00 androidboot.vbmeta.device=PARTUUID=b865935d-38fb-4c4e-b8b4-70dc67321552 androidboot.slot_suffix=_a androidboot.vbmeta.device_state=unlocked androidboot.vbmeta.hash_alg=sha256 androidboot.vbmeta.size=3200 androidboot.vbmeta.digest=14fe41c2b3696c31b7ad5eae7877d7d188995e1ab122c604aaaf4785850b91f7 skip_initramfs [...] 
[ 0.612802] device-mapper: verity-avb: AVB error handler initialized with vbmeta device: PARTUUID=b865935d-38fb-4c4e-b8b4-70dc67321552 [...] [ 1.213804] device-mapper: init: attempting early device configuration. [ 1.214752] device-mapper: init: adding target '0 2080496 verity 1 PARTUUID=6779df46-78f6-4c69-bf53-59bb1fbf126b PARTUUID=6779df46-78f6-4c69-bf53-59bb1fbf126b 4096 4096 260062 260062 sha1 4f76354c86e430e27426d584a726f2fbffecae32 7e4085342d634065269631ac9a199e1a43f4632c 1 ignore_zero_blocks' [ 1.217643] device-mapper: init: dm-0 is ready [ 1.226694] device-mapper: verity: 8:6: data block 0 is corrupted [ 1.227666] device-mapper: verity-avb: AVB error handler called for PARTUUID=b865935d-38fb-4c4e-b8b4-70dc67321552 [ 1.234308] device-mapper: verity-avb: invalidate_vbmeta: found vbmeta partition [ 1.235848] device-mapper: verity-avb: invalidate_vbmeta: completed. [...] Bug: 31622239 Test: Manually tested (other arch). Change-Id: Idf6be32d6a3d28e15de9302aa26ad6a516d663aa Signed-off-by: David Zeuthen --- drivers/md/Kconfig | 11 ++ drivers/md/Makefile | 4 + drivers/md/dm-verity-avb.c | 217 ++++++++++++++++++++++++++++++++++ drivers/md/dm-verity-target.c | 6 +- drivers/md/dm-verity.h | 1 + 5 files changed, 238 insertions(+), 1 deletion(-) create mode 100644 drivers/md/dm-verity-avb.c diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 9eb08a43cd27..6f2fde5d98e7 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -515,6 +515,17 @@ config DM_LOG_WRITES If unsure, say N. +config DM_VERITY_AVB + tristate "Support AVB specific verity error behavior" + depends on DM_VERITY + ---help--- + Enables Android Verified Boot platform-specific error + behavior. In particular, it will modify the vbmeta partition + specified on the kernel command-line when non-transient error + occurs (followed by a panic). + + If unsure, say N. + config DM_ANDROID_VERITY bool "Android verity target support" depends on DM_VERITY=y diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 32b5d0a90d60..c22cc74c9fa8 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -69,3 +69,7 @@ endif ifeq ($(CONFIG_DM_VERITY_FEC),y) dm-verity-objs += dm-verity-fec.o endif + +ifeq ($(CONFIG_DM_VERITY_AVB),y) +dm-verity-objs += dm-verity-avb.o +endif diff --git a/drivers/md/dm-verity-avb.c b/drivers/md/dm-verity-avb.c new file mode 100644 index 000000000000..88487346c4c6 --- /dev/null +++ b/drivers/md/dm-verity-avb.c @@ -0,0 +1,217 @@ +/* + * Copyright (C) 2017 Google. + * + * This file is released under the GPLv2. + * + * Based on drivers/md/dm-verity-chromeos.c + */ + +#include +#include +#include + +#define DM_MSG_PREFIX "verity-avb" + +/* Set via module parameter. 
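+ * The value arrives on the kernel command line as
+ * androidboot.vbmeta.device (see the MODULE_PARAM_PREFIX override
+ * at the bottom of this file).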
*/ +static char avb_vbmeta_device[64]; + +static void invalidate_vbmeta_endio(struct bio *bio) +{ + complete(bio->bi_private); +} + +static int invalidate_vbmeta_submit(struct bio *bio, + struct block_device *bdev, + int rw, int access_last_sector, + struct page *page) +{ + DECLARE_COMPLETION_ONSTACK(wait); + + bio->bi_private = &wait; + bio->bi_end_io = invalidate_vbmeta_endio; + bio->bi_bdev = bdev; + + bio->bi_iter.bi_sector = 0; + if (access_last_sector) { + sector_t last_sector = (i_size_read(bdev->bd_inode)>>SECTOR_SHIFT) - 1; + bio->bi_iter.bi_sector = last_sector; + } + bio->bi_vcnt = 1; + bio->bi_iter.bi_idx = 0; + bio->bi_iter.bi_size = 512; + bio->bi_iter.bi_bvec_done = 0; + bio->bi_rw = rw; + bio->bi_io_vec[0].bv_page = page; + bio->bi_io_vec[0].bv_len = 512; + bio->bi_io_vec[0].bv_offset = 0; + + submit_bio(rw, bio); + /* Wait up to 2 seconds for completion or fail. */ + if (!wait_for_completion_timeout(&wait, msecs_to_jiffies(2000))) + return -EIO; + return 0; +} + +static int invalidate_vbmeta(dev_t vbmeta_devt) +{ + int ret = 0; + struct block_device *bdev; + struct bio *bio; + struct page *page; + fmode_t dev_mode; + /* Ensure we do synchronous unblocked I/O. We may also need + * sync_bdev() on completion, but it really shouldn't. + */ + int rw = REQ_SYNC | REQ_SOFTBARRIER | REQ_NOIDLE; + int access_last_sector = 0; + + /* First we open the device for reading. */ + dev_mode = FMODE_READ | FMODE_EXCL; + bdev = blkdev_get_by_dev(vbmeta_devt, dev_mode, + invalidate_vbmeta); + if (IS_ERR(bdev)) { + DMERR("invalidate_kernel: could not open device for reading"); + dev_mode = 0; + ret = -ENOENT; + goto failed_to_read; + } + + bio = bio_alloc(GFP_NOIO, 1); + if (!bio) { + ret = -ENOMEM; + goto failed_bio_alloc; + } + + page = alloc_page(GFP_NOIO); + if (!page) { + ret = -ENOMEM; + goto failed_to_alloc_page; + } + + access_last_sector = 0; + ret = invalidate_vbmeta_submit(bio, bdev, rw, access_last_sector, page); + if (ret) { + DMERR("invalidate_vbmeta: error reading"); + goto failed_to_submit_read; + } + + /* We have a page. Let's make sure it looks right. */ + if (memcmp("AVB0", page_address(page), 4) == 0) { + /* Stamp it. */ + memcpy(page_address(page), "AVE0", 4); + DMINFO("invalidate_vbmeta: found vbmeta partition"); + } else { + /* Could be this is on a AVB footer, check. Also, since the + * AVB footer is in the last 64 bytes, adjust for the fact that + * we're dealing with 512-byte sectors. + */ + size_t offset = (1<bi_remaining. + */ + bio_reset(bio); + + rw |= REQ_WRITE; + ret = invalidate_vbmeta_submit(bio, bdev, rw, access_last_sector, page); + if (ret) { + DMERR("invalidate_vbmeta: error writing"); + goto failed_to_submit_write; + } + + DMERR("invalidate_vbmeta: completed."); + ret = 0; +failed_to_submit_write: +failed_to_write: +invalid_header: + __free_page(page); +failed_to_submit_read: + /* Technically, we'll leak a page with the pending bio, but + * we're about to reboot anyway. 
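+ * (the dm-verity RESTART error path calls kernel_restart() right
+ * after dm_verity_avb_error_handler() returns)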
+ */ +failed_to_alloc_page: + bio_put(bio); +failed_bio_alloc: + if (dev_mode) + blkdev_put(bdev, dev_mode); +failed_to_read: + return ret; +} + +void dm_verity_avb_error_handler(void) +{ + dev_t dev; + + DMINFO("AVB error handler called for %s", avb_vbmeta_device); + + if (avb_vbmeta_device[0] == '\0') { + DMERR("avb_vbmeta_device parameter not set"); + goto fail_no_dev; + } + + dev = name_to_dev_t(avb_vbmeta_device); + if (!dev) { + DMERR("No matching partition for device: %s", + avb_vbmeta_device); + goto fail_no_dev; + } + + invalidate_vbmeta(dev); + +fail_no_dev: + ; +} + +static int __init dm_verity_avb_init(void) +{ + DMINFO("AVB error handler initialized with vbmeta device: %s", + avb_vbmeta_device); + return 0; +} + +static void __exit dm_verity_avb_exit(void) +{ +} + +module_init(dm_verity_avb_init); +module_exit(dm_verity_avb_exit); + +MODULE_AUTHOR("David Zeuthen "); +MODULE_DESCRIPTION("AVB-specific error handler for dm-verity"); +MODULE_LICENSE("GPL"); + +/* Declare parameter with no module prefix */ +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "androidboot.vbmeta." +module_param_string(device, avb_vbmeta_device, sizeof(avb_vbmeta_device), 0); diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index c7e97cf6e7fb..e34cf53bd068 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -233,8 +233,12 @@ out: if (v->mode == DM_VERITY_MODE_LOGGING) return 0; - if (v->mode == DM_VERITY_MODE_RESTART) + if (v->mode == DM_VERITY_MODE_RESTART) { +#ifdef CONFIG_DM_VERITY_AVB + dm_verity_avb_error_handler(); +#endif kernel_restart("dm-verity device corrupted"); + } return 1; } diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 75effca400a3..a90d1d416107 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -136,4 +136,5 @@ extern void verity_io_hints(struct dm_target *ti, struct queue_limits *limits); extern void verity_dtr(struct dm_target *ti); extern int verity_ctr(struct dm_target *ti, unsigned argc, char **argv); extern int verity_map(struct dm_target *ti, struct bio *bio); +extern void dm_verity_avb_error_handler(void); #endif /* DM_VERITY_H */ From 0f3b6e26eb2ebb0c66f20ed53134e26496c62dd7 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 20 Apr 2017 18:05:02 -0700 Subject: [PATCH 0777/3220] ANDROID: sdcardfs: Use filesystem specific hash We weren't accounting for FS specific hash functions, causing us to miss negative dentries for any FS that had one. 
Similar to a patch from esdfs commit 75bd25a9476d ("esdfs: support lower's own hash") Signed-off-by: Daniel Rosenberg Change-Id: I32d1ba304d728e0ca2648cacfb4c2e441ae63608 --- fs/sdcardfs/lookup.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index a0f221501b4c..446ef4027ebc 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -366,8 +366,13 @@ put_name: /* instatiate a new negative dentry */ dname.name = name->name; dname.len = name->len; - dname.hash = full_name_hash(dname.name, dname.len); - lower_dentry = d_lookup(lower_dir_dentry, &dname); + + /* See if the low-level filesystem might want + * to use its own hash + */ + lower_dentry = d_hash_and_lookup(lower_dir_dentry, &dname); + if (IS_ERR(lower_dentry)) + return lower_dentry; if (lower_dentry) goto setup_lower; From b4840d3bba899cd1f2434fb9f1277b36673a4b3e Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 20 Apr 2017 18:21:50 -0700 Subject: [PATCH 0778/3220] Revert "Revert "Android: sdcardfs: Don't do d_add for lower fs"" This reverts commit ffa75fdb9c408f49b9622b6d55752ed99ff61488. Turns out we just needed the right hash. Signed-off-by: Daniel Rosenberg Bug: 37231161 Change-Id: I6a6de7f7df99ad42b20fa062913b219f64020c31 --- fs/sdcardfs/lookup.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 446ef4027ebc..509d5fbcb472 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -373,17 +373,15 @@ put_name: lower_dentry = d_hash_and_lookup(lower_dir_dentry, &dname); if (IS_ERR(lower_dentry)) return lower_dentry; - if (lower_dentry) - goto setup_lower; - - lower_dentry = d_alloc(lower_dir_dentry, &dname); if (!lower_dentry) { - err = -ENOMEM; + /* We called vfs_path_lookup earlier, and did not get a negative + * dentry then. Don't confuse the lower filesystem by forcing + * one on it now... + */ + err = -ENOENT; goto out; } - d_add(lower_dentry, NULL); /* instantiate and hash */ -setup_lower: lower_path.dentry = lower_dentry; lower_path.mnt = mntget(lower_dir_mnt); sdcardfs_set_lower_path(dentry, &lower_path); From 46d925efcc72a2b52d404ad8a457a5bdc7c18b83 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 24 Apr 2017 16:10:21 -0700 Subject: [PATCH 0779/3220] ANDROID: sdcardfs: Copy meta-data from lower inode From wrapfs commit 3ee9b365e38c ("Wrapfs: properly copy meta-data after AIO operations from lower inode") Signed-off-by: Daniel Rosenberg Bug: 35766959 Change-Id: I9a789222e27a17b8d85ce61c45397d1839f9a675 --- fs/sdcardfs/file.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 1f6921e2ffbf..6076c342dae6 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -358,9 +358,12 @@ ssize_t sdcardfs_read_iter(struct kiocb *iocb, struct iov_iter *iter) get_file(lower_file); /* prevent lower_file from being released */ iocb->ki_filp = lower_file; err = lower_file->f_op->read_iter(iocb, iter); - /* ? wait IO finish to update atime as ecryptfs ? 
*/ iocb->ki_filp = file; fput(lower_file); + /* update upper inode atime as needed */ + if (err >= 0 || err == -EIOCBQUEUED) + fsstack_copy_attr_atime(file->f_path.dentry->d_inode, + file_inode(lower_file)); out: return err; } @@ -384,6 +387,13 @@ ssize_t sdcardfs_write_iter(struct kiocb *iocb, struct iov_iter *iter) err = lower_file->f_op->write_iter(iocb, iter); iocb->ki_filp = file; fput(lower_file); + /* update upper inode times/sizes as needed */ + if (err >= 0 || err == -EIOCBQUEUED) { + fsstack_copy_inode_size(file->f_path.dentry->d_inode, + file_inode(lower_file)); + fsstack_copy_attr_times(file->f_path.dentry->d_inode, + file_inode(lower_file)); + } out: return err; } From 33fddbee41d511bd1f1946f3dd66351d6895c30f Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 24 Apr 2017 16:11:03 -0700 Subject: [PATCH 0780/3220] ANDROID: sdcardfs: Avoid setting GIDs outside of valid ranges When setting up the ownership of files on the lower filesystem, ensure that these values are in reasonable ranges for apps. If they aren't, default to AID_MEDIA_RW Signed-off-by: Daniel Rosenberg Bug: 37516160 Change-Id: I0bec76a61ac72aff0b993ab1ad04be8382178a00 --- fs/sdcardfs/derived_perm.c | 8 ++++---- fs/sdcardfs/multiuser.h | 7 +++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index cddfc79ea365..b4595aab5713 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -215,16 +215,16 @@ void fixup_lower_ownership(struct dentry *dentry, const char *name) gid = AID_MEDIA_OBB; break; case PERM_ANDROID_PACKAGE: - if (info->d_uid != 0) + if (uid_is_app(info->d_uid)) gid = multiuser_get_ext_gid(info->d_uid); else - gid = multiuser_get_uid(info->userid, uid); + gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); break; case PERM_ANDROID_PACKAGE_CACHE: - if (info->d_uid != 0) + if (uid_is_app(info->d_uid)) gid = multiuser_get_ext_cache_gid(info->d_uid); else - gid = multiuser_get_uid(info->userid, uid); + gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); break; case PERM_PRE_ROOT: default: diff --git a/fs/sdcardfs/multiuser.h b/fs/sdcardfs/multiuser.h index d0c925cda299..85341e753f8c 100644 --- a/fs/sdcardfs/multiuser.h +++ b/fs/sdcardfs/multiuser.h @@ -35,6 +35,13 @@ static inline uid_t multiuser_get_uid(userid_t user_id, appid_t app_id) return (user_id * AID_USER_OFFSET) + (app_id % AID_USER_OFFSET); } +static inline bool uid_is_app(uid_t uid) +{ + appid_t appid = uid % AID_USER_OFFSET; + + return appid >= AID_APP_START && appid <= AID_APP_END; +} + static inline gid_t multiuser_get_ext_cache_gid(uid_t uid) { return uid - AID_APP_START + AID_EXT_CACHE_GID_START; From b878b260109386898ece2fc99ff2da255acd9c10 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 24 Apr 2017 19:49:02 -0700 Subject: [PATCH 0781/3220] ANDROID: sdcardfs: Call lower fs's revalidate We should be calling the lower filesystem's revalidate inside of sdcardfs's revalidate, as wrapfs does. 
Signed-off-by: Daniel Rosenberg Bug: 35766959 Change-Id: I939d1c4192fafc1e21678aeab43fe3d588b8e2f4 --- fs/sdcardfs/dentry.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index 8e31d1a80f0c..7a19e77fce99 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -60,6 +60,14 @@ static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags) lower_dentry = lower_path.dentry; lower_cur_parent_dentry = dget_parent(lower_dentry); + if ((lower_dentry->d_flags & DCACHE_OP_REVALIDATE)) { + err = lower_dentry->d_op->d_revalidate(lower_dentry, flags); + if (err == 0) { + d_drop(dentry); + goto out; + } + } + spin_lock(&lower_dentry->d_lock); if (d_unhashed(lower_dentry)) { spin_unlock(&lower_dentry->d_lock); From 3f0531e5775303091a1ff975cdd572cc6a935321 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Mon, 24 Apr 2017 18:20:52 -0700 Subject: [PATCH 0782/3220] BACKPORT: f2fs: sanity check log_blocks_per_seg f2fs currently only supports 4KB block size and 2MB segment size. Sanity check log_blocks_per_seg == 9, i.e. 2MB/4KB = (1 << 9) Partially (cherry-picked from commit 9a59b62fd88196844cee5fff851bee2cfd7afb6e) f2fs: do more integrity verification for superblock Do more sanity check for superblock during ->mount. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Bug: 36817013 Change-Id: I0be52e54fba82083068337ceb9f7ad985a87319f Signed-off-by: Jin Qian --- fs/f2fs/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3a65e0132352..98a77b0a365d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -947,6 +947,14 @@ static int sanity_check_raw_super(struct super_block *sb, return 1; } + /* check log blocks per segment */ + if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) { + f2fs_msg(sb, KERN_INFO, + "Invalid log blocks per segment (%u)\n", + le32_to_cpu(raw_super->log_blocks_per_seg)); + return 1; + } + /* Currently, support 512/1024/2048/4096 bytes sector size */ if (le32_to_cpu(raw_super->log_sectorsize) > F2FS_MAX_LOG_SECTOR_SIZE || From 90d78776c4a0e13fb7ee5bd0787f04a1730631a6 Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Tue, 25 Apr 2017 18:07:43 +0800 Subject: [PATCH 0783/3220] ANDROID: uid_sys_stats: fix access of task_uid(task) struct task_struct *task should be protected by tasklist_lock.
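As a sketch of the locking rule this change relies on (illustrative only; the helper name and the pr_info() payload are invented for the example, not part of this patch):

#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/uidgid.h>

static void walk_tasks_example(struct user_namespace *user_ns)
{
	struct task_struct *group, *task;

	read_lock(&tasklist_lock);
	do_each_thread(group, task) {
		/*
		 * task cannot be reaped while tasklist_lock is
		 * read-held, so task_uid(task) is safe here.
		 */
		pr_info("uid=%u\n",
			from_kuid_munged(user_ns, task_uid(task)));
	} while_each_thread(group, task);
	read_unlock(&tasklist_lock);
}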
Change-Id: Iefcd13442a9b9d855a2bbcde9fd838a4132fee58 Signed-off-by: Ganesh Mahendran --- drivers/misc/uid_sys_stats.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 618617f3e867..ad21276c8d9e 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -95,9 +95,11 @@ static int uid_cputime_show(struct seq_file *m, void *v) { struct uid_entry *uid_entry; struct task_struct *task, *temp; + struct user_namespace *user_ns = current_user_ns(); cputime_t utime; cputime_t stime; unsigned long bkt; + uid_t uid; rt_mutex_lock(&uid_lock); @@ -108,14 +110,13 @@ static int uid_cputime_show(struct seq_file *m, void *v) read_lock(&tasklist_lock); do_each_thread(temp, task) { - uid_entry = find_or_register_uid(from_kuid_munged( - current_user_ns(), task_uid(task))); + uid = from_kuid_munged(user_ns, task_uid(task)); + uid_entry = find_or_register_uid(uid); if (!uid_entry) { read_unlock(&tasklist_lock); rt_mutex_unlock(&uid_lock); pr_err("%s: failed to find the uid_entry for uid %d\n", - __func__, from_kuid_munged(current_user_ns(), - task_uid(task))); + __func__, uid); return -ENOMEM; } task_cputime_adjusted(task, &utime, &stime); From 1f48a715af37ceffd86ee7a5f693dd2ca2dda892 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Fri, 28 Apr 2017 12:53:04 -0600 Subject: [PATCH 0784/3220] net: pppolac/pppopns: Add back the msg_flags Commit 26fc40a09221330 ("net: pppolac/pppopns: Replace msg.msg_iov with iov_iter_kvec()") removed the msg_flags when removing the iov fields. This lead to problems with VPN data transfers. Change-Id: Ib86ab3f927c5cf36cbad0bab501575999dc2b084 Fixes: 26fc40a09221330 ("net: pppolac/pppopns: Replace msg.msg_iov with iov_iter_kvec()") Signed-off-by: Subash Abhinov Kasiviswanathan --- drivers/net/ppp/pppolac.c | 4 +++- drivers/net/ppp/pppopns.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ppp/pppolac.c b/drivers/net/ppp/pppolac.c index 0184c96579e9..3a45cf805288 100644 --- a/drivers/net/ppp/pppolac.c +++ b/drivers/net/ppp/pppolac.c @@ -206,7 +206,9 @@ static void pppolac_xmit_core(struct work_struct *delivery_work) while ((skb = skb_dequeue(&delivery_queue))) { struct sock *sk_udp = skb->sk; struct kvec iov = {.iov_base = skb->data, .iov_len = skb->len}; - struct msghdr msg = { 0 }; + struct msghdr msg = { + .msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT, + }; iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iov, 1, skb->len); diff --git a/drivers/net/ppp/pppopns.c b/drivers/net/ppp/pppopns.c index d9e06039794e..cdb4fa1af734 100644 --- a/drivers/net/ppp/pppopns.c +++ b/drivers/net/ppp/pppopns.c @@ -189,7 +189,9 @@ static void pppopns_xmit_core(struct work_struct *delivery_work) while ((skb = skb_dequeue(&delivery_queue))) { struct sock *sk_raw = skb->sk; struct kvec iov = {.iov_base = skb->data, .iov_len = skb->len}; - struct msghdr msg = { 0 }; + struct msghdr msg = { + .msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT, + }; iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iov, 1, skb->len); From 69a14c17de7edbaa354d56bbd25718a6e2cd6089 Mon Sep 17 00:00:00 2001 From: Witold Sciuk Date: Sat, 13 Feb 2016 11:08:37 +0100 Subject: [PATCH 0785/3220] ANDROID: usb: gadget: f_mtp: Set 0xFFFFFFFF in mtp header ContainerLength field Value 0xFFFFFFFF should be set according specification of MTP for large files when fileSize + mtpHeader is greater than 0xFFFFFFFF. 
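For example, a 6 GiB file (0x180000000 bytes) plus the 12-byte MTP container header cannot be represented in the 32-bit ContainerLength field, so 0xFFFFFFFF is sent instead.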
MTP Specification, Appendix H - USB Optimizations Patchset: mtp Change-Id: I6213de052914350be2f87b73f8135f9c0cd05d7c Signed-off-by: Witold Sciuk Signed-off-by: Konrad Leszczynski Signed-off-by: Russ Weight --- drivers/usb/gadget/function/f_mtp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c index b37cc8d87ca4..0fa79b0c26ab 100644 --- a/drivers/usb/gadget/function/f_mtp.c +++ b/drivers/usb/gadget/function/f_mtp.c @@ -43,6 +43,7 @@ #define MTP_BULK_BUFFER_SIZE 16384 #define INTR_BUFFER_SIZE 28 #define MAX_INST_NAME_LEN 40 +#define MTP_MAX_FILE_SIZE 0xFFFFFFFFL /* String IDs */ #define INTERFACE_STRING_INDEX 0 @@ -766,7 +767,12 @@ static void send_file_work(struct work_struct *data) if (hdr_size) { /* prepend MTP data header */ header = (struct mtp_data_header *)req->buf; - header->length = __cpu_to_le32(count); + /* + * set file size with header according to + * MTP Specification v1.0 + */ + header->length = (count > MTP_MAX_FILE_SIZE) ? + MTP_MAX_FILE_SIZE : __cpu_to_le32(count); header->type = __cpu_to_le16(2); /* data packet */ header->command = __cpu_to_le16(dev->xfer_command); header->transaction_id = From 4f8785184972bf230383dfbbda3e1c0b21dc2312 Mon Sep 17 00:00:00 2001 From: Konrad Leszczynski Date: Wed, 24 Feb 2016 10:47:12 +0000 Subject: [PATCH 0786/3220] ANDROID: usb: gadget: f_audio_source: disable the CPU C-states upon playback Due to the issue with the isoc transfers being interrupted by the CPU going into the idle state, the C-states will be disabled for the playback time. Change-Id: If4e52673606923d7e33a1d1dbe0192b8ad24f78c Signed-off-by: Konrad Leszczynski Signed-off-by: Russ Weight --- drivers/usb/gadget/function/f_audio_source.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/usb/gadget/function/f_audio_source.c b/drivers/usb/gadget/function/f_audio_source.c index db7903d19c43..8124af33b738 100644 --- a/drivers/usb/gadget/function/f_audio_source.c +++ b/drivers/usb/gadget/function/f_audio_source.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -268,6 +269,8 @@ struct audio_dev { /* number of frames sent since start_time */ s64 frames_sent; struct audio_source_config *config; + /* for creating and issuing QoS requests */ + struct pm_qos_request pm_qos; }; static inline struct audio_dev *func_to_audio(struct usb_function *f) @@ -740,6 +743,10 @@ static int audio_pcm_open(struct snd_pcm_substream *substream) runtime->hw.channels_max = 2; audio->substream = substream; + + /* Add the QoS request and set the latency to 0 */ + pm_qos_add_request(&audio->pm_qos, PM_QOS_CPU_DMA_LATENCY, 0); + return 0; } @@ -749,6 +756,10 @@ static int audio_pcm_close(struct snd_pcm_substream *substream) unsigned long flags; spin_lock_irqsave(&audio->lock, flags); + + /* Remove the QoS request */ + pm_qos_remove_request(&audio->pm_qos); + audio->substream = NULL; spin_unlock_irqrestore(&audio->lock, flags); From 3c72862e8f645c2e2a936646e3568b45e44c268a Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Fri, 5 May 2017 01:15:33 +0530 Subject: [PATCH 0787/3220] ANDROID: arm64: suspend: Restore the UAO state Upstream commit d08544127d9f ("arm64: suspend: Reconfigure PSTATE after resume from idle") when cherry-picked on LTS linux-4.4.y removed UAO reset code because UAO is not supported in linux-4.4.y. 
But common/android-4.4 has UAO support, added in Change-Id: I1a6a74a1f33b92d54368bd99387b55cf62930903 ("UPSTREAM: arm64: kernel: Add support for User Access Override"). This patch pick up the UAO specific changes of upstream commit dropped in LTS cherry-pick. Fixes: LTS commit 71710cd35a55 ("arm64: suspend: Reconfigure PSTATE after resume from idle") Signed-off-by: Amit Pundir --- arch/arm64/include/asm/exec.h | 3 +++ arch/arm64/kernel/process.c | 3 ++- arch/arm64/kernel/suspend.c | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/exec.h b/arch/arm64/include/asm/exec.h index db0563c23482..f7865dd9d868 100644 --- a/arch/arm64/include/asm/exec.h +++ b/arch/arm64/include/asm/exec.h @@ -18,6 +18,9 @@ #ifndef __ASM_EXEC_H #define __ASM_EXEC_H +#include + extern unsigned long arch_align_stack(unsigned long sp); +void uao_thread_switch(struct task_struct *next); #endif /* __ASM_EXEC_H */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 6f3fb46170bf..e6afea67f6c1 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -379,7 +380,7 @@ static void tls_thread_switch(struct task_struct *next) } /* Restore the UAO state depending on next's addr_limit */ -static void uao_thread_switch(struct task_struct *next) +void uao_thread_switch(struct task_struct *next) { if (IS_ENABLED(CONFIG_ARM64_UAO)) { if (task_thread_info(next)->addr_limit == KERNEL_DS) diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 20b6b9b02cc8..f42b8b8f1d0a 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -106,6 +107,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) */ asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)); + uao_thread_switch(current); /* * Restore HW breakpoint registers to sane values From 40d8db5ddf18c8415a3d789515e14d8ddbb638ee Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Fri, 5 May 2017 18:36:51 -0700 Subject: [PATCH 0788/3220] ANDROID: android-base.cfg: remove defunct options INET6_DIAG_DESTROY and NETFILTER_TPROXY are not used anymore so they should not be part of the base Android kernel configuration. Bug: 37749708 Change-Id: Iab263a5723f1810e2133919b8db93cc2bb986624 Signed-off-by: Steve Muckle --- android/configs/android-base.cfg | 2 -- 1 file changed, 2 deletions(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 30e01074f64a..748ccf37ca3e 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -29,7 +29,6 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_INET6_AH=y -CONFIG_INET6_DIAG_DESTROY=y CONFIG_INET6_ESP=y CONFIG_INET6_IPCOMP=y CONFIG_INET=y @@ -72,7 +71,6 @@ CONFIG_MODVERSIONS=y CONFIG_NET=y CONFIG_NETDEVICES=y CONFIG_NETFILTER=y -CONFIG_NETFILTER_TPROXY=y CONFIG_NETFILTER_XT_MATCH_COMMENT=y CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y CONFIG_NETFILTER_XT_MATCH_CONNMARK=y From cc756e682c1a296acd14471d122646c92f6936f3 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Sun, 7 May 2017 14:03:43 -0700 Subject: [PATCH 0789/3220] ANDROID: android-base.cfg: remove USB_OTG_WAKELOCK CONFIG_USB_OTG_WAKELOCK is currently somewhat outdated and as such is not applicable to all Android devices. Until it is brought up to date, remove it from the base Android kernel configuration. 
Bug: 37750863 Change-Id: I5b1c0bef24476cc503a60003bf48ffb59eea8c94 Signed-off-by: Steve Muckle --- android/configs/android-base.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 748ccf37ca3e..3c32c74c584a 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -171,5 +171,4 @@ CONFIG_USB_CONFIGFS_F_MTP=y CONFIG_USB_CONFIGFS_F_PTP=y CONFIG_USB_CONFIGFS_UEVENT=y CONFIG_USB_GADGET=y -CONFIG_USB_OTG_WAKELOCK=y CONFIG_XFRM_USER=y From b5428181f94bd0d1ff5d321bd58413bdb61f8df4 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 19 Apr 2017 14:22:47 -0700 Subject: [PATCH 0790/3220] ANDROID: Add untag hacks to inet_release function To prevent protential risk of memory leak caused by closing socket with out untag it from qtaguid module, the qtaguid module now do not hold any socket file reference count. Instead, it will increase the sk_refcnt of the sk struct to prevent a reuse of the socket pointer. And when a socket is released. It will delete the tag if the socket is previously tagged so no more resources is held by xt_qtaguid moudle. A flag is added to the untag process to prevent possible kernel crash caused by fail to delete corresponding socket_tag_entry list. Bug: 36374484 Test: compile and run test under system/extra/test/iptables, run cts -m CtsNetTestCases -t android.net.cts.SocketRefCntTest Signed-off-by: Chenbo Feng Change-Id: Iea7c3bf0c59b9774a5114af905b2405f6bc9ee52 --- include/linux/netfilter/xt_qtaguid.h | 1 + net/ipv4/af_inet.c | 4 + net/netfilter/xt_qtaguid.c | 107 ++++++++++++++------------- net/netfilter/xt_qtaguid_internal.h | 2 - net/netfilter/xt_qtaguid_print.c | 8 +- 5 files changed, 63 insertions(+), 59 deletions(-) diff --git a/include/linux/netfilter/xt_qtaguid.h b/include/linux/netfilter/xt_qtaguid.h index ca60fbdec2f3..1c671552ec37 100644 --- a/include/linux/netfilter/xt_qtaguid.h +++ b/include/linux/netfilter/xt_qtaguid.h @@ -10,4 +10,5 @@ #define XT_QTAGUID_SOCKET XT_OWNER_SOCKET #define xt_qtaguid_match_info xt_owner_match_info +int qtaguid_untag(struct socket *sock, bool kernel); #endif /* _XT_QTAGUID_MATCH_H */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 68bf7bdf7fdb..eb5cfc1478b1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -89,6 +89,7 @@ #include #include #include +#include #include @@ -413,6 +414,9 @@ int inet_release(struct socket *sock) if (sk) { long timeout; +#ifdef CONFIG_NETFILTER_XT_MATCH_QTAGUID + qtaguid_untag(sock, true); +#endif /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index 0f5628a59917..f95aba44e649 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -321,7 +321,7 @@ static void sock_tag_tree_erase(struct rb_root *st_to_free_tree) st_entry->tag, get_uid_from_tag(st_entry->tag)); rb_erase(&st_entry->sock_node, st_to_free_tree); - sockfd_put(st_entry->socket); + sock_put(st_entry->sk); kfree(st_entry); } } @@ -1929,12 +1929,12 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) { struct sock_tag *sock_tag_entry = v; uid_t uid; - long f_count; CT_DEBUG("qtaguid: proc ctrl pid=%u tgid=%u uid=%u\n", current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid())); if (sock_tag_entry != SEQ_START_TOKEN) { + int sk_ref_count; uid = get_uid_from_tag(sock_tag_entry->tag); CT_DEBUG("qtaguid: proc_read(): sk=%p tag=0x%llx (uid=%u) " "pid=%u\n", @@ -1943,13 +1943,13 @@ 
static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) uid, sock_tag_entry->pid ); - f_count = atomic_long_read( - &sock_tag_entry->socket->file->f_count); + sk_ref_count = atomic_read( + &sock_tag_entry->sk->sk_refcnt); seq_printf(m, "sock=%pK tag=0x%llx (uid=%u) pid=%u " - "f_count=%lu\n", + "f_count=%d\n", sock_tag_entry->sk, sock_tag_entry->tag, uid, - sock_tag_entry->pid, f_count); + sock_tag_entry->pid, sk_ref_count); } else { seq_printf(m, "events: sockets_tagged=%llu " "sockets_untagged=%llu " @@ -2245,8 +2245,8 @@ static int ctrl_cmd_tag(const char *input) from_kuid(&init_user_ns, current_fsuid())); goto err; } - CT_DEBUG("qtaguid: ctrl_tag(%s): socket->...->f_count=%ld ->sk=%p\n", - input, atomic_long_read(&el_socket->file->f_count), + CT_DEBUG("qtaguid: ctrl_tag(%s): socket->...->sk_refcnt=%d ->sk=%p\n", + input, atomic_read(&el_socket->sk->sk_refcnt), el_socket->sk); if (argc < 3) { acct_tag = make_atag_from_value(0); @@ -2290,16 +2290,9 @@ static int ctrl_cmd_tag(const char *input) struct tag_ref *prev_tag_ref_entry; CT_DEBUG("qtaguid: ctrl_tag(%s): retag for sk=%p " - "st@%p ...->f_count=%ld\n", + "st@%p ...->sk_refcnt=%d\n", input, el_socket->sk, sock_tag_entry, - atomic_long_read(&el_socket->file->f_count)); - /* - * This is a re-tagging, so release the sock_fd that was - * locked at the time of the 1st tagging. - * There is still the ref from this call's sockfd_lookup() so - * it can be done within the spinlock. - */ - sockfd_put(sock_tag_entry->socket); + atomic_read(&el_socket->sk->sk_refcnt)); prev_tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag, &uid_tag_data_entry); BUG_ON(IS_ERR_OR_NULL(prev_tag_ref_entry)); @@ -2319,8 +2312,12 @@ static int ctrl_cmd_tag(const char *input) res = -ENOMEM; goto err_tag_unref_put; } + /* + * Hold the sk refcount here to make sure the sk pointer cannot + * be freed and reused + */ + sock_hold(el_socket->sk); sock_tag_entry->sk = el_socket->sk; - sock_tag_entry->socket = el_socket; sock_tag_entry->pid = current->tgid; sock_tag_entry->tag = combine_atag_with_uid(acct_tag, uid_int); spin_lock_bh(&uid_tag_data_tree_lock); @@ -2347,10 +2344,11 @@ static int ctrl_cmd_tag(const char *input) atomic64_inc(&qtu_events.sockets_tagged); } spin_unlock_bh(&sock_tag_list_lock); - /* We keep the ref to the socket (file) until it is untagged */ - CT_DEBUG("qtaguid: ctrl_tag(%s): done st@%p ...->f_count=%ld\n", + /* We keep the ref to the sk until it is untagged */ + CT_DEBUG("qtaguid: ctrl_tag(%s): done st@%p ...->sk_refcnt=%d\n", input, sock_tag_entry, - atomic_long_read(&el_socket->file->f_count)); + atomic_read(&el_socket->sk->sk_refcnt)); + sockfd_put(el_socket); return 0; err_tag_unref_put: @@ -2358,8 +2356,8 @@ err_tag_unref_put: tag_ref_entry->num_sock_tags--; free_tag_ref_from_utd_entry(tag_ref_entry, uid_tag_data_entry); err_put: - CT_DEBUG("qtaguid: ctrl_tag(%s): done. ...->f_count=%ld\n", - input, atomic_long_read(&el_socket->file->f_count) - 1); + CT_DEBUG("qtaguid: ctrl_tag(%s): done. ...->sk_refcnt=%d\n", + input, atomic_read(&el_socket->sk->sk_refcnt) - 1); /* Release the sock_fd that was grabbed by sockfd_lookup(). 
*/ sockfd_put(el_socket); return res; @@ -2375,17 +2373,13 @@ static int ctrl_cmd_untag(const char *input) int sock_fd = 0; struct socket *el_socket; int res, argc; - struct sock_tag *sock_tag_entry; - struct tag_ref *tag_ref_entry; - struct uid_tag_data *utd_entry; - struct proc_qtu_data *pqd_entry; argc = sscanf(input, "%c %d", &cmd, &sock_fd); CT_DEBUG("qtaguid: ctrl_untag(%s): argc=%d cmd=%c sock_fd=%d\n", input, argc, cmd, sock_fd); if (argc < 2) { res = -EINVAL; - goto err; + return res; } el_socket = sockfd_lookup(sock_fd, &res); /* This locks the file */ if (!el_socket) { @@ -2393,17 +2387,31 @@ static int ctrl_cmd_untag(const char *input) " sock_fd=%d err=%d pid=%u tgid=%u uid=%u\n", input, sock_fd, res, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid())); - goto err; + return res; } CT_DEBUG("qtaguid: ctrl_untag(%s): socket->...->f_count=%ld ->sk=%p\n", input, atomic_long_read(&el_socket->file->f_count), el_socket->sk); + res = qtaguid_untag(el_socket, false); + sockfd_put(el_socket); + return res; +} + +int qtaguid_untag(struct socket *el_socket, bool kernel) +{ + int res; + pid_t pid; + struct sock_tag *sock_tag_entry; + struct tag_ref *tag_ref_entry; + struct uid_tag_data *utd_entry; + struct proc_qtu_data *pqd_entry; + spin_lock_bh(&sock_tag_list_lock); sock_tag_entry = get_sock_stat_nl(el_socket->sk); if (!sock_tag_entry) { spin_unlock_bh(&sock_tag_list_lock); res = -EINVAL; - goto err_put; + return res; } /* * The socket already belongs to the current process @@ -2415,20 +2423,26 @@ static int ctrl_cmd_untag(const char *input) BUG_ON(!tag_ref_entry); BUG_ON(tag_ref_entry->num_sock_tags <= 0); spin_lock_bh(&uid_tag_data_tree_lock); + if (kernel) + pid = sock_tag_entry->pid; + else + pid = current->tgid; pqd_entry = proc_qtu_data_tree_search( - &proc_qtu_data_tree, current->tgid); + &proc_qtu_data_tree, pid); /* * TODO: remove if, and start failing. * At first, we want to catch user-space code that is not * opening the /dev/xt_qtaguid. */ - if (IS_ERR_OR_NULL(pqd_entry)) + if (IS_ERR_OR_NULL(pqd_entry) || !sock_tag_entry->list.next) { pr_warn_once("qtaguid: %s(): " "User space forgot to open /dev/xt_qtaguid? " - "pid=%u tgid=%u uid=%u\n", __func__, - current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid())); - else + "pid=%u tgid=%u sk_pid=%u, uid=%u\n", __func__, + current->pid, current->tgid, sock_tag_entry->pid, + from_kuid(&init_user_ns, current_fsuid())); + } else { list_del(&sock_tag_entry->list); + } spin_unlock_bh(&uid_tag_data_tree_lock); /* * We don't free tag_ref from the utd_entry here, @@ -2437,30 +2451,17 @@ static int ctrl_cmd_untag(const char *input) tag_ref_entry->num_sock_tags--; spin_unlock_bh(&sock_tag_list_lock); /* - * Release the sock_fd that was grabbed at tag time, - * and once more for the sockfd_lookup() here. + * Release the sock_fd that was grabbed at tag time. */ - sockfd_put(sock_tag_entry->socket); - CT_DEBUG("qtaguid: ctrl_untag(%s): done. st@%p ...->f_count=%ld\n", - input, sock_tag_entry, - atomic_long_read(&el_socket->file->f_count) - 1); - sockfd_put(el_socket); + sock_put(sock_tag_entry->sk); + CT_DEBUG("qtaguid: done. st@%p ...->sk_refcnt=%d\n", + sock_tag_entry, + atomic_read(&el_socket->sk->sk_refcnt)); kfree(sock_tag_entry); atomic64_inc(&qtu_events.sockets_untagged); return 0; - -err_put: - CT_DEBUG("qtaguid: ctrl_untag(%s): done. socket->...->f_count=%ld\n", - input, atomic_long_read(&el_socket->file->f_count) - 1); - /* Release the sock_fd that was grabbed by sockfd_lookup(). 
*/ - sockfd_put(el_socket); - return res; - -err: - CT_DEBUG("qtaguid: ctrl_untag(%s): done.\n", input); - return res; } static ssize_t qtaguid_ctrl_parse(const char *input, size_t count) diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h index 6dc14a9c6889..8178fbdfb036 100644 --- a/net/netfilter/xt_qtaguid_internal.h +++ b/net/netfilter/xt_qtaguid_internal.h @@ -256,8 +256,6 @@ struct iface_stat_work { struct sock_tag { struct rb_node sock_node; struct sock *sk; /* Only used as a number, never dereferenced */ - /* The socket is needed for sockfd_put() */ - struct socket *socket; /* Used to associate with a given pid */ struct list_head list; /* in proc_qtu_data.sock_tag_list */ pid_t pid; diff --git a/net/netfilter/xt_qtaguid_print.c b/net/netfilter/xt_qtaguid_print.c index f6a00a3520ed..2a7190d285e6 100644 --- a/net/netfilter/xt_qtaguid_print.c +++ b/net/netfilter/xt_qtaguid_print.c @@ -24,7 +24,7 @@ #include #include #include - +#include #include "xt_qtaguid_internal.h" #include "xt_qtaguid_print.h" @@ -237,10 +237,10 @@ char *pp_sock_tag(struct sock_tag *st) tag_str = pp_tag_t(&st->tag); res = kasprintf(GFP_ATOMIC, "sock_tag@%p{" "sock_node=rb_node{...}, " - "sk=%p socket=%p (f_count=%lu), list=list_head{...}, " + "sk=%p (f_count=%d), list=list_head{...}, " "pid=%u, tag=%s}", - st, st->sk, st->socket, atomic_long_read( - &st->socket->file->f_count), + st, st->sk, atomic_read( + &st->sk->sk_refcnt), st->pid, tag_str); _bug_on_err_or_null(res); kfree(tag_str); From 907e828e010b0609016fabd0732e82b1872e0c98 Mon Sep 17 00:00:00 2001 From: Daniel Roseberg Date: Tue, 9 May 2017 13:36:35 -0700 Subject: [PATCH 0791/3220] ANDROID: sdcardfs: Don't iput if we didn't igrab If we fail to get top, top is either NULL, or igrab found that we're in the process of freeing that inode, and did not grab it. Either way, we didn't grab it, and have no business putting it. Signed-off-by: Daniel Rosenberg Bug: 38117720 Change-Id: Ie2f587483b9abb5144263156a443e89bc69b767b --- fs/sdcardfs/inode.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index f15cb11ca8fd..4f09eebd7d95 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -618,11 +618,8 @@ static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int ma struct inode tmp; struct inode *top = grab_top(SDCARDFS_I(inode)); - if (!top) { - release_top(SDCARDFS_I(inode)); - WARN(1, "Top value was null!\n"); + if (!top) return -EINVAL; - } /* * Permission check on sdcardfs inode. @@ -696,10 +693,8 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct inode = d_inode(dentry); top = grab_top(SDCARDFS_I(inode)); - if (!top) { - release_top(SDCARDFS_I(inode)); + if (!top) return -EINVAL; - } /* * Permission check on sdcardfs inode. From bc86c1de1dedcb4e49a5470a85dc20291334eded Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Tue, 9 May 2017 18:05:15 -0700 Subject: [PATCH 0792/3220] ANDROID: android-base.cfg: remove NETFILTER_XT_MATCH_QUOTA2_LOG There are currently a couple different implementations for this functionality. Until things are unified, remove the requirement for this kernel config. 
Bug: 37749708 Change-Id: I10ef038edc656185644d1dcb128658136a8c994f Signed-off-by: Steve Muckle --- android/configs/android-base.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 3c32c74c584a..23686fc15d45 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -86,7 +86,6 @@ CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y CONFIG_NETFILTER_XT_MATCH_POLICY=y CONFIG_NETFILTER_XT_MATCH_QTAGUID=y CONFIG_NETFILTER_XT_MATCH_QUOTA2=y -CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG=y CONFIG_NETFILTER_XT_MATCH_QUOTA=y CONFIG_NETFILTER_XT_MATCH_SOCKET=y CONFIG_NETFILTER_XT_MATCH_STATE=y From 21ade372581bd235525a25b4d11b72d3b4129307 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 10 May 2017 23:01:15 +0800 Subject: [PATCH 0793/3220] ANDROID: sdcardfs: fix sdcardfs_destroy_inode for the inode RCU approach According to the following commits, fs: icache RCU free inodes vfs: fix the stupidity with i_dentry in inode destructors sdcardfs_destroy_inode should be fixed for the fast path safety. Signed-off-by: Gao Xiang Change-Id: I84f43c599209d23737c7e28b499dd121cb43636d --- fs/sdcardfs/super.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index a3393e959c63..8a9c9c7adca2 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -192,9 +192,16 @@ static struct inode *sdcardfs_alloc_inode(struct super_block *sb) return &i->vfs_inode; } +static void i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + kmem_cache_free(sdcardfs_inode_cachep, SDCARDFS_I(inode)); +} + static void sdcardfs_destroy_inode(struct inode *inode) { - kmem_cache_free(sdcardfs_inode_cachep, SDCARDFS_I(inode)); + call_rcu(&inode->i_rcu, i_callback); } /* sdcardfs inode cache constructor */ From a8cc97d8cf4a0d0cf8ce1d99a09546745b11133e Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 10 May 2017 23:54:04 +0900 Subject: [PATCH 0794/3220] ANDROID: make PF_KEY SHA256 use RFC-compliant truncation. When using the PF_KEY interface, SHA-256 hashes are hardcoded to use 96-bit truncation. This is a violation of RFC4868, which specifies 128-bit truncation, but will not be fixed upstream due to backwards compatibility concerns and because the PF_KEY interface is deprecated in favour of netlink XFRM (which allows the app to specify an arbitrary truncation length). Change the hardcoded truncation length from 96 to 128 so that PF_KEY apps such as racoon will work with standards-compliant VPN servers. 
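For reference, a userspace sketch of how a netlink-XFRM application requests the RFC4868 truncation explicitly (illustrative only: key bytes are omitted, and the structure is the standard linux/xfrm.h UAPI, passed to the kernel via the XFRMA_ALG_AUTH_TRUNC attribute):

#include <linux/xfrm.h>
#include <string.h>

static void fill_sha256_auth(struct xfrm_algo_auth *auth)
{
	/* algorithm name as understood by the kernel crypto layer */
	strncpy(auth->alg_name, "hmac(sha256)", sizeof(auth->alg_name));
	auth->alg_key_len = 256;    /* HMAC key length, in bits */
	auth->alg_trunc_len = 128;  /* RFC4868-compliant ICV truncation */
}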
Bug: 34114242 Change-Id: Ie46bff4b6358f18117d0be241171d677d31d33f7 Signed-off-by: Lorenzo Colitti --- net/xfrm/xfrm_algo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index f07224d8b88f..9adfdd711b31 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -239,7 +239,7 @@ static struct xfrm_algo_desc aalg_list[] = { .uinfo = { .auth = { - .icv_truncbits = 96, + .icv_truncbits = 128, .icv_fullbits = 256, } }, From 1b9d700c89bd1c2a3a1b303a91f8cd03fc4b727e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 10 May 2017 11:21:25 +0200 Subject: [PATCH 0795/3220] ANDROID: rfkill: fix unused function warning As reported by kernelci, we get a harmless warning in this driver when CONFIG_PM is disabled: net/rfkill/core.c:810:12: warning: 'rfkill_resume' defined but not used [-Wunused-function] net/rfkill/core.c:800:12: warning: 'rfkill_suspend' defined but not used [-Wunused-function] This marks the functions as __maybe_unused to remove the #ifdef, and uses a conditional reference to the pm operations instead, which will work in any combination. The patch is needed for both android-common-4.9 and 4.4. Link: https://kernelci.org/build/id/59117c2f59b5147b06b12d54/logs/ Fixes: de6f7210e931 ("ANDROID: rfkill: Introduce CONFIG_RFKILL_PM and use instead of CONFIG_PM to power down") Cc: Nick Pelly Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- net/rfkill/core.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/net/rfkill/core.c b/net/rfkill/core.c index d778d99326df..71e1f0def5a5 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -802,8 +802,7 @@ void rfkill_resume_polling(struct rfkill *rfkill) } EXPORT_SYMBOL(rfkill_resume_polling); -#ifdef CONFIG_RFKILL_PM -static int rfkill_suspend(struct device *dev) +static __maybe_unused int rfkill_suspend(struct device *dev) { struct rfkill *rfkill = to_rfkill(dev); @@ -812,7 +811,7 @@ static int rfkill_suspend(struct device *dev) return 0; } -static int rfkill_resume(struct device *dev) +static __maybe_unused int rfkill_resume(struct device *dev) { struct rfkill *rfkill = to_rfkill(dev); bool cur; @@ -828,19 +827,13 @@ static int rfkill_resume(struct device *dev) } static SIMPLE_DEV_PM_OPS(rfkill_pm_ops, rfkill_suspend, rfkill_resume); -#define RFKILL_PM_OPS (&rfkill_pm_ops) -#else -#define RFKILL_PM_OPS NULL -#endif static struct class rfkill_class = { .name = "rfkill", .dev_release = rfkill_release, .dev_groups = rfkill_dev_groups, .dev_uevent = rfkill_dev_uevent, -#ifdef CONFIG_RFKILL_PM - .pm = RFKILL_PM_OPS, -#endif + .pm = IS_ENABLED(CONFIG_RFKILL_PM) ? &rfkill_pm_ops : NULL, }; bool rfkill_blocked(struct rfkill *rfkill) From d3cb1ad1837e0f7d688e64f436ca96fd1199c402 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 10 May 2017 11:21:27 +0200 Subject: [PATCH 0796/3220] ANDROID: memory_state_time: fix undefined behavior with missing DT properties kernelci reports warnings about unintialized variable usage: drivers/misc/memory_state_time.c:351:12: warning: 'lenf' is used uninitialized in this function [-Wuninitialized] drivers/misc/memory_state_time.c:321:14: warning: 'lenb' is used uninitialized in this function [-Wuninitialized] In both cases we try to continue without a DT property but use the length that has not been assigned at this point. This rearranges the code in the two functions to bail out earlier in case of an error. The patch is needed for both android-common-4.9, 4.4 and 3.18. 
Link: https://kernelci.org/build/id/591177f459b5147648b12d54/logs/ Fixes: ad3c02f8b3a5 ("ANDROID: Implement memory_state_time, used by qcom,cpubw") Cc: James Carr Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/misc/memory_state_time.c | 78 ++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/drivers/misc/memory_state_time.c b/drivers/misc/memory_state_time.c index 34c797a06a31..ba94dcf09169 100644 --- a/drivers/misc/memory_state_time.c +++ b/drivers/misc/memory_state_time.c @@ -296,27 +296,31 @@ static int get_bw_buckets(struct device *dev) struct device_node *node = dev->of_node; of_property_read_u32(node, NUM_SOURCES, &num_sources); - if (of_find_property(node, BW_TBL, &lenb)) { - bandwidths = devm_kzalloc(dev, - sizeof(*bandwidths) * num_sources, GFP_KERNEL); - if (!bandwidths) - return -ENOMEM; - lenb /= sizeof(*bw_buckets); - bw_buckets = devm_kzalloc(dev, lenb * sizeof(*bw_buckets), - GFP_KERNEL); - if (!bw_buckets) { - devm_kfree(dev, bandwidths); - return -ENOMEM; - } - ret = of_property_read_u32_array(node, BW_TBL, bw_buckets, - lenb); - if (ret < 0) { - devm_kfree(dev, bandwidths); - devm_kfree(dev, bw_buckets); - pr_err("Unable to read bandwidth table from device tree.\n"); - return ret; - } + if (!of_find_property(node, BW_TBL, &lenb)) { + pr_err("Missing %s property\n", BW_TBL); + return -ENODATA; } + + bandwidths = devm_kzalloc(dev, + sizeof(*bandwidths) * num_sources, GFP_KERNEL); + if (!bandwidths) + return -ENOMEM; + lenb /= sizeof(*bw_buckets); + bw_buckets = devm_kzalloc(dev, lenb * sizeof(*bw_buckets), + GFP_KERNEL); + if (!bw_buckets) { + devm_kfree(dev, bandwidths); + return -ENOMEM; + } + ret = of_property_read_u32_array(node, BW_TBL, bw_buckets, + lenb); + if (ret < 0) { + devm_kfree(dev, bandwidths); + devm_kfree(dev, bw_buckets); + pr_err("Unable to read bandwidth table from device tree.\n"); + return ret; + } + curr_bw = 0; num_buckets = lenb; return 0; @@ -332,22 +336,26 @@ static int freq_buckets_init(struct device *dev) int ret, lenf; struct device_node *node = dev->of_node; - if (of_find_property(node, FREQ_TBL, &lenf)) { - lenf /= sizeof(*freq_buckets); - freq_buckets = devm_kzalloc(dev, lenf * sizeof(*freq_buckets), - GFP_KERNEL); - if (!freq_buckets) - return -ENOMEM; - pr_debug("freqs found len %d\n", lenf); - ret = of_property_read_u32_array(node, FREQ_TBL, freq_buckets, - lenf); - if (ret < 0) { - devm_kfree(dev, freq_buckets); - pr_err("Unable to read frequency table from device tree.\n"); - return ret; - } - pr_debug("ret freq %d\n", ret); + if (!of_find_property(node, FREQ_TBL, &lenf)) { + pr_err("Missing %s property\n", FREQ_TBL); + return -ENODATA; } + + lenf /= sizeof(*freq_buckets); + freq_buckets = devm_kzalloc(dev, lenf * sizeof(*freq_buckets), + GFP_KERNEL); + if (!freq_buckets) + return -ENOMEM; + pr_debug("freqs found len %d\n", lenf); + ret = of_property_read_u32_array(node, FREQ_TBL, freq_buckets, + lenf); + if (ret < 0) { + devm_kfree(dev, freq_buckets); + pr_err("Unable to read frequency table from device tree.\n"); + return ret; + } + pr_debug("ret freq %d\n", ret); + num_freqs = lenf; curr_freq = freq_buckets[LOWEST_FREQ]; From a347e1c4ddc50afc383ff0cc197560777ba7d95b Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Fri, 12 May 2017 10:17:05 -0700 Subject: [PATCH 0797/3220] ANDROID: android-base.cfg: remove spurious CONFIG_MODULES line CONFIG_MODULES must be enabled as part of the android base kernel configuration. 
There is already a line specifying that the option be enabled, but there was a pre-existing line requiring that it be disabled. Remove it. Bug: 38224475 Change-Id: I608de5ae68f3a03d5da4e5800bbf37cc71dff8b8 Signed-off-by: Steve Muckle --- android/configs/android-base.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 23686fc15d45..b7b997fd58c9 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -3,7 +3,6 @@ # CONFIG_DEVMEM is not set # CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set -# CONFIG_MODULES is not set # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set # CONFIG_USELIB is not set From 73661512d284dac703ffbeca68c675cebd648110 Mon Sep 17 00:00:00 2001 From: Jiebing Li Date: Tue, 10 Mar 2015 11:27:10 +0800 Subject: [PATCH 0798/3220] ANDROID: usb: f_mtp: return error code if transfer error in receive_file_work function The receive_file_work() function should report an error if it detects that the urb was not successfully transferred.
Patchset: mtp Change-Id: I66898afe6b6ea2c33e6ea4c5ccf47d3a56d002dc Signed-off-by: Du, Changbin Signed-off-by: Wang, Yu Signed-off-by: Russ Weight --- drivers/usb/gadget/function/f_mtp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c index 0fa79b0c26ab..87ec420df0a6 100644 --- a/drivers/usb/gadget/function/f_mtp.c +++ b/drivers/usb/gadget/function/f_mtp.c @@ -873,6 +873,10 @@ static void receive_file_work(struct work_struct *data) usb_ep_dequeue(dev->ep_out, read_req); break; } + if (read_req->status) { + r = read_req->status; + break; + } /* if xfer_file_length is 0xFFFFFFFF, then we read until * we get a zero length packet */ From 9a83b8157a75b297e3a5f105bf209a44184ca52d Mon Sep 17 00:00:00 2001 From: Jiebing Li Date: Tue, 10 Mar 2015 11:25:50 +0800 Subject: [PATCH 0799/3220] ANDROID: usb: gadget: fix NULL pointer issue in mtp_read() The pointer dev->ep_out->desc is set to NULL if the MTP function is disabled during a read operation, so we need to check the pointer before accessing it, and add spinlock protection in case it is modified elsewhere in the future.
Signed-off-by: Daniel Rosenberg Bug: 38045152 Change-Id: I1968e480d966c3f234800b72e43670ca11e1d3fd --- fs/sdcardfs/dentry.c | 15 +++++ fs/sdcardfs/derived_perm.c | 130 +++++++++++++++++++------------------ fs/sdcardfs/inode.c | 53 ++++++++------- fs/sdcardfs/lookup.c | 5 +- fs/sdcardfs/main.c | 8 +-- fs/sdcardfs/packagelist.c | 2 +- fs/sdcardfs/sdcardfs.h | 118 +++++++++++++++++++++------------ fs/sdcardfs/super.c | 49 ++++++++++++-- 8 files changed, 239 insertions(+), 141 deletions(-) diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index 7a19e77fce99..83ae9103a0f6 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -34,6 +34,8 @@ static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags) struct dentry *parent_lower_dentry = NULL; struct dentry *lower_cur_parent_dentry = NULL; struct dentry *lower_dentry = NULL; + struct inode *inode; + struct sdcardfs_inode_data *data; if (flags & LOOKUP_RCU) return -ECHILD; @@ -103,6 +105,19 @@ static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags) spin_unlock(&dentry->d_lock); spin_unlock(&lower_dentry->d_lock); } + if (!err) + goto out; + + /* If our top's inode is gone, we may be out of date */ + inode = d_inode(dentry); + if (inode) { + data = top_data_get(SDCARDFS_I(inode)); + if (data->abandoned) { + d_drop(dentry); + err = 0; + } + data_put(data); + } out: dput(parent_dentry); diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index b4595aab5713..85a60fb5ff39 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -26,28 +26,28 @@ static void inherit_derived_state(struct inode *parent, struct inode *child) struct sdcardfs_inode_info *pi = SDCARDFS_I(parent); struct sdcardfs_inode_info *ci = SDCARDFS_I(child); - ci->perm = PERM_INHERIT; - ci->userid = pi->userid; - ci->d_uid = pi->d_uid; - ci->under_android = pi->under_android; - ci->under_cache = pi->under_cache; - ci->under_obb = pi->under_obb; - set_top(ci, pi->top); + ci->data->perm = PERM_INHERIT; + ci->data->userid = pi->data->userid; + ci->data->d_uid = pi->data->d_uid; + ci->data->under_android = pi->data->under_android; + ci->data->under_cache = pi->data->under_cache; + ci->data->under_obb = pi->data->under_obb; + set_top(ci, pi->top_data); } /* helper function for derived state */ void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, - uid_t uid, bool under_android, - struct inode *top) + uid_t uid, bool under_android, + struct sdcardfs_inode_data *top) { struct sdcardfs_inode_info *info = SDCARDFS_I(inode); - info->perm = perm; - info->userid = userid; - info->d_uid = uid; - info->under_android = under_android; - info->under_cache = false; - info->under_obb = false; + info->data->perm = perm; + info->data->userid = userid; + info->data->d_uid = uid; + info->data->under_android = under_android; + info->data->under_cache = false; + info->data->under_obb = false; set_top(info, top); } @@ -58,7 +58,8 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const struct qstr *name) { struct sdcardfs_inode_info *info = SDCARDFS_I(d_inode(dentry)); - struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent)); + struct sdcardfs_inode_data *parent_data = + SDCARDFS_I(d_inode(parent))->data; appid_t appid; unsigned long user_num; int err; @@ -82,60 +83,61 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, if (!S_ISDIR(d_inode(dentry)->i_mode)) return; /* Derive custom permissions based on parent and current node */ - 
switch (parent_info->perm) { + switch (parent_data->perm) { case PERM_INHERIT: case PERM_ANDROID_PACKAGE_CACHE: /* Already inherited above */ break; case PERM_PRE_ROOT: /* Legacy internal layout places users at top level */ - info->perm = PERM_ROOT; + info->data->perm = PERM_ROOT; err = kstrtoul(name->name, 10, &user_num); if (err) - info->userid = 0; + info->data->userid = 0; else - info->userid = user_num; - set_top(info, &info->vfs_inode); + info->data->userid = user_num; + set_top(info, info->data); break; case PERM_ROOT: /* Assume masked off by default. */ if (qstr_case_eq(name, &q_Android)) { /* App-specific directories inside; let anyone traverse */ - info->perm = PERM_ANDROID; - info->under_android = true; - set_top(info, &info->vfs_inode); + info->data->perm = PERM_ANDROID; + info->data->under_android = true; + set_top(info, info->data); } break; case PERM_ANDROID: if (qstr_case_eq(name, &q_data)) { /* App-specific directories inside; let anyone traverse */ - info->perm = PERM_ANDROID_DATA; - set_top(info, &info->vfs_inode); + info->data->perm = PERM_ANDROID_DATA; + set_top(info, info->data); } else if (qstr_case_eq(name, &q_obb)) { /* App-specific directories inside; let anyone traverse */ - info->perm = PERM_ANDROID_OBB; - info->under_obb = true; - set_top(info, &info->vfs_inode); + info->data->perm = PERM_ANDROID_OBB; + info->data->under_obb = true; + set_top(info, info->data); /* Single OBB directory is always shared */ } else if (qstr_case_eq(name, &q_media)) { /* App-specific directories inside; let anyone traverse */ - info->perm = PERM_ANDROID_MEDIA; - set_top(info, &info->vfs_inode); + info->data->perm = PERM_ANDROID_MEDIA; + set_top(info, info->data); } break; case PERM_ANDROID_OBB: case PERM_ANDROID_DATA: case PERM_ANDROID_MEDIA: - info->perm = PERM_ANDROID_PACKAGE; + info->data->perm = PERM_ANDROID_PACKAGE; appid = get_appid(name->name); - if (appid != 0 && !is_excluded(name->name, parent_info->userid)) - info->d_uid = multiuser_get_uid(parent_info->userid, appid); - set_top(info, &info->vfs_inode); + if (appid != 0 && !is_excluded(name->name, parent_data->userid)) + info->data->d_uid = + multiuser_get_uid(parent_data->userid, appid); + set_top(info, info->data); break; case PERM_ANDROID_PACKAGE: if (qstr_case_eq(name, &q_cache)) { - info->perm = PERM_ANDROID_PACKAGE_CACHE; - info->under_cache = true; + info->data->perm = PERM_ANDROID_PACKAGE_CACHE; + info->data->under_cache = true; } break; } @@ -166,7 +168,8 @@ void fixup_lower_ownership(struct dentry *dentry, const char *name) struct inode *delegated_inode = NULL; int error; struct sdcardfs_inode_info *info; - struct sdcardfs_inode_info *info_top; + struct sdcardfs_inode_data *info_d; + struct sdcardfs_inode_data *info_top; perm_t perm; struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); uid_t uid = sbi->options.fs_low_uid; @@ -174,15 +177,16 @@ void fixup_lower_ownership(struct dentry *dentry, const char *name) struct iattr newattrs; info = SDCARDFS_I(d_inode(dentry)); - perm = info->perm; - if (info->under_obb) { + info_d = info->data; + perm = info_d->perm; + if (info_d->under_obb) { perm = PERM_ANDROID_OBB; - } else if (info->under_cache) { + } else if (info_d->under_cache) { perm = PERM_ANDROID_PACKAGE_CACHE; } else if (perm == PERM_INHERIT) { - info_top = SDCARDFS_I(grab_top(info)); + info_top = top_data_get(info); perm = info_top->perm; - release_top(info); + data_put(info_top); } switch (perm) { @@ -192,7 +196,7 @@ void fixup_lower_ownership(struct dentry *dentry, const char *name) case 
PERM_ANDROID_MEDIA: case PERM_ANDROID_PACKAGE: case PERM_ANDROID_PACKAGE_CACHE: - uid = multiuser_get_uid(info->userid, uid); + uid = multiuser_get_uid(info_d->userid, uid); break; case PERM_ANDROID_OBB: uid = AID_MEDIA_OBB; @@ -207,24 +211,24 @@ void fixup_lower_ownership(struct dentry *dentry, const char *name) case PERM_ANDROID_DATA: case PERM_ANDROID_MEDIA: if (S_ISDIR(d_inode(dentry)->i_mode)) - gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); + gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW); else - gid = multiuser_get_uid(info->userid, get_type(name)); + gid = multiuser_get_uid(info_d->userid, get_type(name)); break; case PERM_ANDROID_OBB: gid = AID_MEDIA_OBB; break; case PERM_ANDROID_PACKAGE: - if (uid_is_app(info->d_uid)) - gid = multiuser_get_ext_gid(info->d_uid); + if (uid_is_app(info_d->d_uid)) + gid = multiuser_get_ext_gid(info_d->d_uid); else - gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); + gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW); break; case PERM_ANDROID_PACKAGE_CACHE: - if (uid_is_app(info->d_uid)) - gid = multiuser_get_ext_cache_gid(info->d_uid); + if (uid_is_app(info_d->d_uid)) + gid = multiuser_get_ext_cache_gid(info_d->d_uid); else - gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); + gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW); break; case PERM_PRE_ROOT: default: @@ -257,11 +261,13 @@ retry_deleg: sdcardfs_put_lower_path(dentry, &path); } -static int descendant_may_need_fixup(struct sdcardfs_inode_info *info, struct limit_search *limit) +static int descendant_may_need_fixup(struct sdcardfs_inode_data *data, + struct limit_search *limit) { - if (info->perm == PERM_ROOT) - return (limit->flags & BY_USERID)?info->userid == limit->userid:1; - if (info->perm == PERM_PRE_ROOT || info->perm == PERM_ANDROID) + if (data->perm == PERM_ROOT) + return (limit->flags & BY_USERID) ? 
+ data->userid == limit->userid : 1; + if (data->perm == PERM_PRE_ROOT || data->perm == PERM_ANDROID) return 1; return 0; } @@ -292,7 +298,7 @@ static void __fixup_perms_recursive(struct dentry *dentry, struct limit_search * } info = SDCARDFS_I(d_inode(dentry)); - if (needs_fixup(info->perm)) { + if (needs_fixup(info->data->perm)) { list_for_each_entry(child, &dentry->d_subdirs, d_child) { spin_lock_nested(&child->d_lock, depth + 1); if (!(limit->flags & BY_NAME) || qstr_case_eq(&child->d_name, &limit->name)) { @@ -305,7 +311,7 @@ static void __fixup_perms_recursive(struct dentry *dentry, struct limit_search * } spin_unlock(&child->d_lock); } - } else if (descendant_may_need_fixup(info, limit)) { + } else if (descendant_may_need_fixup(info->data, limit)) { list_for_each_entry(child, &dentry->d_subdirs, d_child) { __fixup_perms_recursive(child, limit, depth + 1); } @@ -349,12 +355,12 @@ int need_graft_path(struct dentry *dentry) struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); struct qstr obb = QSTR_LITERAL("obb"); - if (parent_info->perm == PERM_ANDROID && + if (parent_info->data->perm == PERM_ANDROID && qstr_case_eq(&dentry->d_name, &obb)) { /* /Android/obb is the base obbpath of DERIVED_UNIFIED */ if (!(sbi->options.multiuser == false - && parent_info->userid == 0)) { + && parent_info->data->userid == 0)) { ret = 1; } } @@ -415,11 +421,11 @@ int is_base_obbpath(struct dentry *dentry) spin_lock(&SDCARDFS_D(dentry)->lock); if (sbi->options.multiuser) { - if (parent_info->perm == PERM_PRE_ROOT && + if (parent_info->data->perm == PERM_PRE_ROOT && qstr_case_eq(&dentry->d_name, &q_obb)) { ret = 1; } - } else if (parent_info->perm == PERM_ANDROID && + } else if (parent_info->data->perm == PERM_ANDROID && qstr_case_eq(&dentry->d_name, &q_obb)) { ret = 1; } diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 4f09eebd7d95..60fea424835f 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -23,7 +23,8 @@ #include /* Do not directly use this function. Use OVERRIDE_CRED() instead. 
*/ -const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, struct sdcardfs_inode_info *info) +const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, + struct sdcardfs_inode_data *data) { struct cred *cred; const struct cred *old_cred; @@ -33,10 +34,10 @@ const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, struct sdcardfs_ if (!cred) return NULL; - if (info->under_obb) + if (data->under_obb) uid = AID_MEDIA_OBB; else - uid = multiuser_get_uid(info->userid, sbi->options.fs_low_uid); + uid = multiuser_get_uid(data->userid, sbi->options.fs_low_uid); cred->fsuid = make_kuid(&init_user_ns, uid); cred->fsgid = make_kgid(&init_user_ns, sbi->options.fs_low_gid); @@ -96,7 +97,8 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out; - err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path, SDCARDFS_I(dir)->userid); + err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path, + SDCARDFS_I(dir)->data->userid); if (err) goto out; fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir)); @@ -267,7 +269,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct path lower_path; struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); const struct cred *saved_cred = NULL; - struct sdcardfs_inode_info *pi = SDCARDFS_I(dir); + struct sdcardfs_inode_data *pd = SDCARDFS_I(dir)->data; int touch_err = 0; struct fs_struct *saved_fs; struct fs_struct *copied_fs; @@ -336,7 +338,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode make_nomedia_in_obb = 1; } - err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path, pi->userid); + err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path, pd->userid); if (err) { unlock_dir(lower_parent_dentry); goto out; @@ -349,12 +351,13 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode fixup_lower_ownership(dentry, dentry->d_name.name); unlock_dir(lower_parent_dentry); if ((!sbi->options.multiuser) && (qstr_case_eq(&dentry->d_name, &q_obb)) - && (pi->perm == PERM_ANDROID) && (pi->userid == 0)) + && (pd->perm == PERM_ANDROID) && (pd->userid == 0)) make_nomedia_in_obb = 1; /* When creating /Android/data and /Android/obb, mark them as .nomedia */ if (make_nomedia_in_obb || - ((pi->perm == PERM_ANDROID) && (qstr_case_eq(&dentry->d_name, &q_data)))) { + ((pd->perm == PERM_ANDROID) + && (qstr_case_eq(&dentry->d_name, &q_data)))) { REVERT_CRED(saved_cred); OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(d_inode(dentry))); set_fs_pwd(current->fs, &lower_path); @@ -616,7 +619,7 @@ static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int ma { int err; struct inode tmp; - struct inode *top = grab_top(SDCARDFS_I(inode)); + struct sdcardfs_inode_data *top = top_data_get(SDCARDFS_I(inode)); if (!top) return -EINVAL; @@ -633,10 +636,11 @@ static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int ma * locks must be dealt with to avoid undefined behavior. 
*/ copy_attrs(&tmp, inode); - tmp.i_uid = make_kuid(&init_user_ns, SDCARDFS_I(top)->d_uid); - tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, SDCARDFS_I(top))); - tmp.i_mode = (inode->i_mode & S_IFMT) | get_mode(mnt, SDCARDFS_I(top)); - release_top(SDCARDFS_I(inode)); + tmp.i_uid = make_kuid(&init_user_ns, top->d_uid); + tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, top)); + tmp.i_mode = (inode->i_mode & S_IFMT) + | get_mode(mnt, SDCARDFS_I(inode), top); + data_put(top); tmp.i_sb = inode->i_sb; if (IS_POSIXACL(inode)) pr_warn("%s: This may be undefined behavior...\n", __func__); @@ -687,11 +691,11 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct struct iattr lower_ia; struct dentry *parent; struct inode tmp; - struct inode *top; + struct sdcardfs_inode_data *top; const struct cred *saved_cred = NULL; inode = d_inode(dentry); - top = grab_top(SDCARDFS_I(inode)); + top = top_data_get(SDCARDFS_I(inode)); if (!top) return -EINVAL; @@ -709,11 +713,12 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct * */ copy_attrs(&tmp, inode); - tmp.i_uid = make_kuid(&init_user_ns, SDCARDFS_I(top)->d_uid); - tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, SDCARDFS_I(top))); - tmp.i_mode = (inode->i_mode & S_IFMT) | get_mode(mnt, SDCARDFS_I(top)); + tmp.i_uid = make_kuid(&init_user_ns, top->d_uid); + tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, top)); + tmp.i_mode = (inode->i_mode & S_IFMT) + | get_mode(mnt, SDCARDFS_I(inode), top); tmp.i_size = i_size_read(inode); - release_top(SDCARDFS_I(inode)); + data_put(top); tmp.i_sb = inode->i_sb; /* @@ -815,17 +820,17 @@ static int sdcardfs_fillattr(struct vfsmount *mnt, struct inode *inode, struct kstat *stat) { struct sdcardfs_inode_info *info = SDCARDFS_I(inode); - struct inode *top = grab_top(info); + struct sdcardfs_inode_data *top = top_data_get(info); if (!top) return -EINVAL; stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; - stat->mode = (inode->i_mode & S_IFMT) | get_mode(mnt, SDCARDFS_I(top)); + stat->mode = (inode->i_mode & S_IFMT) | get_mode(mnt, info, top); stat->nlink = inode->i_nlink; - stat->uid = make_kuid(&init_user_ns, SDCARDFS_I(top)->d_uid); - stat->gid = make_kgid(&init_user_ns, get_gid(mnt, SDCARDFS_I(top))); + stat->uid = make_kuid(&init_user_ns, top->d_uid); + stat->gid = make_kgid(&init_user_ns, get_gid(mnt, top)); stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode->i_atime; @@ -833,7 +838,7 @@ static int sdcardfs_fillattr(struct vfsmount *mnt, stat->ctime = inode->i_ctime; stat->blksize = (1 << inode->i_blkbits); stat->blocks = inode->i_blocks; - release_top(info); + data_put(top); return 0; } diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 509d5fbcb472..00ae21151f52 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -71,7 +71,7 @@ struct inode_data { static int sdcardfs_inode_test(struct inode *inode, void *candidate_data/*void *candidate_lower_inode*/) { struct inode *current_lower_inode = sdcardfs_lower_inode(inode); - userid_t current_userid = SDCARDFS_I(inode)->userid; + userid_t current_userid = SDCARDFS_I(inode)->data->userid; if (current_lower_inode == ((struct inode_data *)candidate_data)->lower_inode && current_userid == ((struct inode_data *)candidate_data)->id) @@ -438,7 +438,8 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - ret = __sdcardfs_lookup(dentry, flags, &lower_parent_path, SDCARDFS_I(dir)->userid); + ret = __sdcardfs_lookup(dentry, 
flags, &lower_parent_path, + SDCARDFS_I(dir)->data->userid); if (IS_ERR(ret)) goto out; if (ret) diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 953d2156d2e9..3c5b51d49d21 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -327,13 +327,13 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, mutex_lock(&sdcardfs_super_list_lock); if (sb_info->options.multiuser) { setup_derived_state(d_inode(sb->s_root), PERM_PRE_ROOT, - sb_info->options.fs_user_id, AID_ROOT, - false, d_inode(sb->s_root)); + sb_info->options.fs_user_id, AID_ROOT, + false, SDCARDFS_I(d_inode(sb->s_root))->data); snprintf(sb_info->obbpath_s, PATH_MAX, "%s/obb", dev_name); } else { setup_derived_state(d_inode(sb->s_root), PERM_ROOT, - sb_info->options.fs_user_id, AID_ROOT, - false, d_inode(sb->s_root)); + sb_info->options.fs_user_id, AID_ROOT, + false, SDCARDFS_I(d_inode(sb->s_root))->data); snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name); } fixup_tmp_permissions(d_inode(sb->s_root)); diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 89196e31073e..8495474258d5 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -156,7 +156,7 @@ int check_caller_access_to_name(struct inode *parent_node, const struct qstr *na struct qstr q_android_secure = QSTR_LITERAL("android_secure"); /* Always block security-sensitive files at root */ - if (parent_node && SDCARDFS_I(parent_node)->perm == PERM_ROOT) { + if (parent_node && SDCARDFS_I(parent_node)->data->perm == PERM_ROOT) { if (qstr_case_eq(name, &q_autorun) || qstr_case_eq(name, &q__android_secure) || qstr_case_eq(name, &q_android_secure)) { diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 2b67b9a8ef9f..31d37d7da4f9 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -81,7 +82,8 @@ */ #define fixup_tmp_permissions(x) \ do { \ - (x)->i_uid = make_kuid(&init_user_ns, SDCARDFS_I(x)->d_uid); \ + (x)->i_uid = make_kuid(&init_user_ns, \ + SDCARDFS_I(x)->data->d_uid); \ (x)->i_gid = make_kgid(&init_user_ns, AID_SDCARD_RW); \ (x)->i_mode = ((x)->i_mode & S_IFMT) | 0775;\ } while (0) @@ -97,16 +99,16 @@ */ #define OVERRIDE_CRED(sdcardfs_sbi, saved_cred, info) \ do { \ - saved_cred = override_fsids(sdcardfs_sbi, info); \ - if (!saved_cred) \ - return -ENOMEM; \ + saved_cred = override_fsids(sdcardfs_sbi, info->data); \ + if (!saved_cred) \ + return -ENOMEM; \ } while (0) #define OVERRIDE_CRED_PTR(sdcardfs_sbi, saved_cred, info) \ do { \ - saved_cred = override_fsids(sdcardfs_sbi, info); \ - if (!saved_cred) \ - return ERR_PTR(-ENOMEM); \ + saved_cred = override_fsids(sdcardfs_sbi, info->data); \ + if (!saved_cred) \ + return ERR_PTR(-ENOMEM); \ } while (0) #define REVERT_CRED(saved_cred) revert_fsids(saved_cred) @@ -142,9 +144,11 @@ typedef enum { struct sdcardfs_sb_info; struct sdcardfs_mount_options; struct sdcardfs_inode_info; +struct sdcardfs_inode_data; /* Do not directly use this function. Use OVERRIDE_CRED() instead. */ -const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, struct sdcardfs_inode_info *info); +const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, + struct sdcardfs_inode_data *data); /* Do not directly use this function, use REVERT_CRED() instead. 
*/ void revert_fsids(const struct cred *old_cred); @@ -178,18 +182,26 @@ struct sdcardfs_file_info { const struct vm_operations_struct *lower_vm_ops; }; -/* sdcardfs inode data in memory */ -struct sdcardfs_inode_info { - struct inode *lower_inode; - /* state derived based on current position in hierachy */ +struct sdcardfs_inode_data { + struct kref refcount; + bool abandoned; + perm_t perm; userid_t userid; uid_t d_uid; bool under_android; bool under_cache; bool under_obb; +}; + +/* sdcardfs inode data in memory */ +struct sdcardfs_inode_info { + struct inode *lower_inode; + /* state derived based on current position in hierarchy */ + struct sdcardfs_inode_data *data; + /* top folder for ownership */ - struct inode *top; + struct sdcardfs_inode_data *top_data; struct inode vfs_inode; }; @@ -351,39 +363,56 @@ SDCARDFS_DENT_FUNC(orig_path) static inline bool sbinfo_has_sdcard_magic(struct sdcardfs_sb_info *sbinfo) { - return sbinfo && sbinfo->sb && sbinfo->sb->s_magic == SDCARDFS_SUPER_MAGIC; + return sbinfo && sbinfo->sb + && sbinfo->sb->s_magic == SDCARDFS_SUPER_MAGIC; } -/* grab a refererence if we aren't linking to ourself */ -static inline void set_top(struct sdcardfs_inode_info *info, struct inode *top) +static inline struct sdcardfs_inode_data *data_get( + struct sdcardfs_inode_data *data) { - struct inode *old_top = NULL; - - BUG_ON(IS_ERR_OR_NULL(top)); - if (info->top && info->top != &info->vfs_inode) - old_top = info->top; - if (top != &info->vfs_inode) - igrab(top); - info->top = top; - iput(old_top); + if (data) + kref_get(&data->refcount); + return data; } -static inline struct inode *grab_top(struct sdcardfs_inode_info *info) +static inline struct sdcardfs_inode_data *top_data_get( + struct sdcardfs_inode_info *info) { - struct inode *top = info->top; + return data_get(info->top_data); +} + +extern void data_release(struct kref *ref); + +static inline void data_put(struct sdcardfs_inode_data *data) +{ + kref_put(&data->refcount, data_release); +} + +static inline void release_own_data(struct sdcardfs_inode_info *info) +{ + /* + * This happens exactly once per inode. At this point, the inode that + * originally held this data is about to be freed, and all references + * to it are held as a top value, and will likely be released soon. 
+ */ + info->data->abandoned = true; + data_put(info->data); +} + +static inline void set_top(struct sdcardfs_inode_info *info, + struct sdcardfs_inode_data *top) +{ + struct sdcardfs_inode_data *old_top = info->top_data; if (top) - return igrab(top); - else - return NULL; + data_get(top); + info->top_data = top; + if (old_top) + data_put(old_top); } -static inline void release_top(struct sdcardfs_inode_info *info) -{ - iput(info->top); -} - -static inline int get_gid(struct vfsmount *mnt, struct sdcardfs_inode_info *info) +static inline int get_gid(struct vfsmount *mnt, + struct sdcardfs_inode_data *data) { struct sdcardfs_vfsmount_options *opts = mnt->data; @@ -396,10 +425,12 @@ static inline int get_gid(struct vfsmount *mnt, struct sdcardfs_inode_info *info */ return AID_SDCARD_RW; else - return multiuser_get_uid(info->userid, opts->gid); + return multiuser_get_uid(data->userid, opts->gid); } -static inline int get_mode(struct vfsmount *mnt, struct sdcardfs_inode_info *info) +static inline int get_mode(struct vfsmount *mnt, + struct sdcardfs_inode_info *info, + struct sdcardfs_inode_data *data) { int owner_mode; int filtered_mode; @@ -407,12 +438,12 @@ static inline int get_mode(struct vfsmount *mnt, struct sdcardfs_inode_info *inf int visible_mode = 0775 & ~opts->mask; - if (info->perm == PERM_PRE_ROOT) { + if (data->perm == PERM_PRE_ROOT) { /* Top of multi-user view should always be visible to ensure * secondary users can traverse inside. */ visible_mode = 0711; - } else if (info->under_android) { + } else if (data->under_android) { /* Block "other" access to Android directories, since only apps * belonging to a specific user should be in there; we still * leave +x open for the default view. @@ -481,8 +512,9 @@ struct limit_search { userid_t userid; }; -extern void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, - uid_t uid, bool under_android, struct inode *top); +extern void setup_derived_state(struct inode *inode, perm_t perm, + userid_t userid, uid_t uid, bool under_android, + struct sdcardfs_inode_data *top); extern void get_derived_permission(struct dentry *parent, struct dentry *dentry); extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const struct qstr *name); extern void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit); @@ -601,7 +633,7 @@ static inline void sdcardfs_copy_and_fix_attrs(struct inode *dest, const struct { dest->i_mode = (src->i_mode & S_IFMT) | S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH; /* 0775 */ - dest->i_uid = make_kuid(&init_user_ns, SDCARDFS_I(dest)->d_uid); + dest->i_uid = make_kuid(&init_user_ns, SDCARDFS_I(dest)->data->d_uid); dest->i_gid = make_kgid(&init_user_ns, AID_SDCARD_RW); dest->i_rdev = src->i_rdev; dest->i_atime = src->i_atime; diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index 8a9c9c7adca2..7f4539b4b249 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -26,6 +26,23 @@ */ static struct kmem_cache *sdcardfs_inode_cachep; +/* + * To support the top references, we must track some data separately. + * An sdcardfs_inode_info always has a reference to its data, and once set up, + * also has a reference to its top. The top may be itself, in which case it + * holds two references to its data. When top is changed, it takes a ref to the + * new data and then drops the ref to the old data. 
+ */ +static struct kmem_cache *sdcardfs_inode_data_cachep; + +void data_release(struct kref *ref) +{ + struct sdcardfs_inode_data *data = + container_of(ref, struct sdcardfs_inode_data, refcount); + + kmem_cache_free(sdcardfs_inode_data_cachep, data); +} + /* final actions when unmounting a file system */ static void sdcardfs_put_super(struct super_block *sb) { @@ -166,6 +183,7 @@ static void sdcardfs_evict_inode(struct inode *inode) struct inode *lower_inode; truncate_inode_pages(&inode->i_data, 0); + set_top(SDCARDFS_I(inode), NULL); clear_inode(inode); /* * Decrement a reference to a lower_inode, which was incremented @@ -173,13 +191,13 @@ static void sdcardfs_evict_inode(struct inode *inode) */ lower_inode = sdcardfs_lower_inode(inode); sdcardfs_set_lower_inode(inode, NULL); - set_top(SDCARDFS_I(inode), inode); iput(lower_inode); } static struct inode *sdcardfs_alloc_inode(struct super_block *sb) { struct sdcardfs_inode_info *i; + struct sdcardfs_inode_data *d; i = kmem_cache_alloc(sdcardfs_inode_cachep, GFP_KERNEL); if (!i) @@ -188,6 +206,16 @@ static struct inode *sdcardfs_alloc_inode(struct super_block *sb) /* memset everything up to the inode to 0 */ memset(i, 0, offsetof(struct sdcardfs_inode_info, vfs_inode)); + d = kmem_cache_alloc(sdcardfs_inode_data_cachep, + GFP_KERNEL | __GFP_ZERO); + if (!d) { + kmem_cache_free(sdcardfs_inode_cachep, i); + return NULL; + } + + i->data = d; + kref_init(&d->refcount); + i->vfs_inode.i_version = 1; return &i->vfs_inode; } @@ -196,6 +224,7 @@ static void i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); + release_own_data(SDCARDFS_I(inode)); kmem_cache_free(sdcardfs_inode_cachep, SDCARDFS_I(inode)); } @@ -214,20 +243,30 @@ static void init_once(void *obj) int sdcardfs_init_inode_cache(void) { - int err = 0; - sdcardfs_inode_cachep = kmem_cache_create("sdcardfs_inode_cache", sizeof(struct sdcardfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT, init_once); + if (!sdcardfs_inode_cachep) - err = -ENOMEM; - return err; + return -ENOMEM; + + sdcardfs_inode_data_cachep = + kmem_cache_create("sdcardfs_inode_data_cache", + sizeof(struct sdcardfs_inode_data), 0, + SLAB_RECLAIM_ACCOUNT, NULL); + if (!sdcardfs_inode_data_cachep) { + kmem_cache_destroy(sdcardfs_inode_cachep); + return -ENOMEM; + } + + return 0; } /* sdcardfs inode cache destructor */ void sdcardfs_destroy_inode_cache(void) { + kmem_cache_destroy(sdcardfs_inode_data_cachep); kmem_cache_destroy(sdcardfs_inode_cachep); } From e5272d4c20b3270520a7f860cbe4d85da3728507 Mon Sep 17 00:00:00 2001 From: David Zeuthen Date: Wed, 10 May 2017 15:12:19 -0400 Subject: [PATCH 0802/3220] ANDROID: AVB: Only invalidate vbmeta when told to do so. When using AVB, the dm-verity error handling mode is now customizable by the bootloader. This CL is for the kernel-side support: it makes the AVB error handler invalidate the vbmeta partition only if androidboot.vbmeta.invalidate_on_error is set to "yes". Bug: 38157502 Test: Manually tested all dm-verity error modes on a UEFI-based bootloader. Change-Id: If36b8a5be9ffd6120e7e0c162843e732eba2ce26 Signed-off-by: David Zeuthen --- drivers/md/dm-verity-avb.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-avb.c b/drivers/md/dm-verity-avb.c index 88487346c4c6..a0ee31e0dd09 100644 --- a/drivers/md/dm-verity-avb.c +++ b/drivers/md/dm-verity-avb.c @@ -12,8 +12,9 @@ #define DM_MSG_PREFIX "verity-avb" -/* Set via module parameter. */ +/* Set via module parameters.
 */ static char avb_vbmeta_device[64]; +static char avb_invalidate_on_error[4]; static void invalidate_vbmeta_endio(struct bio *bio) { @@ -175,6 +176,11 @@ void dm_verity_avb_error_handler(void) DMINFO("AVB error handler called for %s", avb_vbmeta_device); + if (strcmp(avb_invalidate_on_error, "yes") != 0) { + DMINFO("Not configured to invalidate"); + return; + } + if (avb_vbmeta_device[0] == '\0') { DMERR("avb_vbmeta_device parameter not set"); goto fail_no_dev; @@ -215,3 +221,5 @@ MODULE_LICENSE("GPL"); #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "androidboot.vbmeta." module_param_string(device, avb_vbmeta_device, sizeof(avb_vbmeta_device), 0); +module_param_string(invalidate_on_error, avb_invalidate_on_error, + sizeof(avb_invalidate_on_error), 0); From a533221d29ec146359e20edabc98e1a00098a106 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 22 May 2017 13:23:56 -0700 Subject: [PATCH 0803/3220] ANDROID: sdcardfs: Check for NULL in revalidate If the inode is in the process of being evicted, the top value may be NULL. Signed-off-by: Daniel Rosenberg Bug: 38502532 Change-Id: I0b9d04aab621e0398d44d1c5dc53293106aa5f89 --- fs/sdcardfs/dentry.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index 83ae9103a0f6..13da7e5245bd 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -109,14 +109,16 @@ static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags) goto out; /* If our top's inode is gone, we may be out of date */ - inode = d_inode(dentry); + inode = igrab(d_inode(dentry)); if (inode) { data = top_data_get(SDCARDFS_I(inode)); - if (data->abandoned) { + if (!data || data->abandoned) { d_drop(dentry); err = 0; } - data_put(data); + if (data) + data_put(data); + iput(inode); } out: From 815c6db10b0741ba10e6c7d3e7fcdc0777bc8918 Mon Sep 17 00:00:00 2001 From: David Zeuthen Date: Mon, 22 May 2017 17:49:40 -0400 Subject: [PATCH 0804/3220] ANDROID: AVB: Fix invalidate_vbmeta_submit(). On some boards with newer kernels the invalidate_vbmeta_submit() function likely failed because it was using a hard-coded size of 512 bytes. Use PAGE_SIZE and also use bio_add_page(). Also print out if the I/O actually fails. Additionally, print major:minor of the device we're acting on, since another subset of boards gets this wrong by passing the wrong GUID to libavb in the bootloader. Having this information printed out makes it a lot easier to pinpoint such mistakes. Test: Manually tested.
Bug: 38451312 Change-Id: Ib58548953a3375a3c79acf74aa745be9420b8216 Signed-off-by: David Zeuthen --- drivers/md/dm-verity-avb.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-verity-avb.c b/drivers/md/dm-verity-avb.c index a0ee31e0dd09..7486dd7fb16d 100644 --- a/drivers/md/dm-verity-avb.c +++ b/drivers/md/dm-verity-avb.c @@ -18,6 +18,8 @@ static char avb_invalidate_on_error[4]; static void invalidate_vbmeta_endio(struct bio *bio) { + if (bio->bi_error) + DMERR("invalidate_vbmeta_endio: error %d", bio->bi_error); complete(bio->bi_private); } @@ -31,20 +33,17 @@ static int invalidate_vbmeta_submit(struct bio *bio, bio->bi_private = &wait; bio->bi_end_io = invalidate_vbmeta_endio; bio->bi_bdev = bdev; + bio->bi_rw = rw; bio->bi_iter.bi_sector = 0; if (access_last_sector) { sector_t last_sector = (i_size_read(bdev->bd_inode)>>SECTOR_SHIFT) - 1; bio->bi_iter.bi_sector = last_sector; } - bio->bi_vcnt = 1; - bio->bi_iter.bi_idx = 0; - bio->bi_iter.bi_size = 512; - bio->bi_iter.bi_bvec_done = 0; - bio->bi_rw = rw; - bio->bi_io_vec[0].bv_page = page; - bio->bi_io_vec[0].bv_len = 512; - bio->bi_io_vec[0].bv_offset = 0; + if (!bio_add_page(bio, page, PAGE_SIZE, 0)) { + DMERR("invalidate_vbmeta_submit: bio_add_page error"); + return -EIO; + } submit_bio(rw, bio); /* Wait up to 2 seconds for completion or fail. */ @@ -66,6 +65,9 @@ static int invalidate_vbmeta(dev_t vbmeta_devt) int rw = REQ_SYNC | REQ_SOFTBARRIER | REQ_NOIDLE; int access_last_sector = 0; + DMINFO("invalidate_vbmeta: acting on device %d:%d", + MAJOR(vbmeta_devt), MINOR(vbmeta_devt)); + /* First we open the device for reading. */ dev_mode = FMODE_READ | FMODE_EXCL; bdev = blkdev_get_by_dev(vbmeta_devt, dev_mode, From f0893c7ddba2580e25ce9386ec7f4569c9a8b5fb Mon Sep 17 00:00:00 2001 From: David Zeuthen Date: Tue, 23 May 2017 17:45:39 -0400 Subject: [PATCH 0805/3220] ANDROID: AVB: Fix linter errors. Various other kernel trees complained about style problems with dm-verity-avb.c, specifically some lines being wider than 80 characters and spaces being used instead of tabs. With these changes checkpatch.pl is happy: $ scripts/checkpatch.pl --file drivers/md/dm-verity-avb.c total: 0 errors, 0 warnings, 229 lines checked drivers/md/dm-verity-avb.c has no obvious style problems and is ready for submission. Bug: None Test: Compiles. Change-Id: I08913adf61c5bf2b0f78f7d9e18dbe93feaba9f7 Signed-off-by: David Zeuthen --- drivers/md/dm-verity-avb.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-verity-avb.c b/drivers/md/dm-verity-avb.c index 7486dd7fb16d..727aacbb1480 100644 --- a/drivers/md/dm-verity-avb.c +++ b/drivers/md/dm-verity-avb.c @@ -37,7 +37,9 @@ static int invalidate_vbmeta_submit(struct bio *bio, bio->bi_iter.bi_sector = 0; if (access_last_sector) { - sector_t last_sector = (i_size_read(bdev->bd_inode)>>SECTOR_SHIFT) - 1; + sector_t last_sector; + + last_sector = (i_size_read(bdev->bd_inode)>>SECTOR_SHIFT) - 1; bio->bi_iter.bi_sector = last_sector; } if (!bio_add_page(bio, page, PAGE_SIZE, 0)) { @@ -118,7 +120,7 @@ static int invalidate_vbmeta(dev_t vbmeta_devt) goto failed_to_submit_read; } if (memcmp("AVBf", page_address(page) + offset, 4) != 0) { - DMERR("invalidate_vbmeta called on non-vbmeta partition"); + DMERR("invalidate_vbmeta on non-vbmeta partition"); ret = -EINVAL; goto invalid_header; } @@ -224,4 +226,4 @@ MODULE_LICENSE("GPL"); #define MODULE_PARAM_PREFIX "androidboot.vbmeta." 
module_param_string(device, avb_vbmeta_device, sizeof(avb_vbmeta_device), 0); module_param_string(invalidate_on_error, avb_invalidate_on_error, - sizeof(avb_invalidate_on_error), 0); + sizeof(avb_invalidate_on_error), 0); From 200f49cf4a70602a18e34f171eaccba6e561e36a Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Mon, 22 May 2017 12:08:06 -0700 Subject: [PATCH 0806/3220] ANDROID: uid_sys_stats: defer io stats calculation for dead tasks Store the sum of dead-task io stats in uid_entry and defer the uid io calculation until the next uid proc stat change or dumpsys. Bug: 37754877 Change-Id: I970f010a4c841c5ca26d0efc7e027414c3c952e0 Signed-off-by: Jin Qian --- drivers/misc/uid_sys_stats.c | 107 ++++++++++++++--------------------- 1 file changed, 42 insertions(+), 65 deletions(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index ad21276c8d9e..091370f4ea40 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -49,7 +49,8 @@ struct io_stats { #define UID_STATE_TOTAL_CURR 2 #define UID_STATE_TOTAL_LAST 3 -#define UID_STATE_SIZE 4 +#define UID_STATE_DEAD_TASKS 4 +#define UID_STATE_SIZE 5 struct uid_entry { uid_t uid; @@ -214,35 +215,44 @@ static u64 compute_write_bytes(struct task_struct *task) return task->ioac.write_bytes - task->ioac.cancelled_write_bytes; } -static void add_uid_io_curr_stats(struct uid_entry *uid_entry, - struct task_struct *task) +static void add_uid_io_stats(struct uid_entry *uid_entry, + struct task_struct *task, int slot) { - struct io_stats *io_curr = &uid_entry->io[UID_STATE_TOTAL_CURR]; + struct io_stats *io_slot = &uid_entry->io[slot]; - io_curr->read_bytes += task->ioac.read_bytes; - io_curr->write_bytes += compute_write_bytes(task); - io_curr->rchar += task->ioac.rchar; - io_curr->wchar += task->ioac.wchar; - io_curr->fsync += task->ioac.syscfs; + io_slot->read_bytes += task->ioac.read_bytes; + io_slot->write_bytes += compute_write_bytes(task); + io_slot->rchar += task->ioac.rchar; + io_slot->wchar += task->ioac.wchar; + io_slot->fsync += task->ioac.syscfs; } -static void clean_uid_io_last_stats(struct uid_entry *uid_entry, - struct task_struct *task) +static void compute_uid_io_bucket_stats(struct io_stats *io_bucket, + struct io_stats *io_curr, + struct io_stats *io_last, + struct io_stats *io_dead) { - struct io_stats *io_last = &uid_entry->io[UID_STATE_TOTAL_LAST]; + io_bucket->read_bytes += io_curr->read_bytes + io_dead->read_bytes - + io_last->read_bytes; + io_bucket->write_bytes += io_curr->write_bytes + io_dead->write_bytes - + io_last->write_bytes; + io_bucket->rchar += io_curr->rchar + io_dead->rchar - io_last->rchar; + io_bucket->wchar += io_curr->wchar + io_dead->wchar - io_last->wchar; + io_bucket->fsync += io_curr->fsync + io_dead->fsync - io_last->fsync; - io_last->read_bytes -= task->ioac.read_bytes; - io_last->write_bytes -= compute_write_bytes(task); - io_last->rchar -= task->ioac.rchar; - io_last->wchar -= task->ioac.wchar; - io_last->fsync -= task->ioac.syscfs; + io_last->read_bytes = io_curr->read_bytes; + io_last->write_bytes = io_curr->write_bytes; + io_last->rchar = io_curr->rchar; + io_last->wchar = io_curr->wchar; + io_last->fsync = io_curr->fsync; + + memset(io_dead, 0, sizeof(struct io_stats)); } static void update_io_stats_all_locked(void) { struct uid_entry *uid_entry; struct task_struct *task, *temp; - struct io_stats *io_bucket, *io_curr, *io_last; struct user_namespace *user_ns = current_user_ns(); unsigned long bkt; uid_t uid; @@ -257,70 +267,38 @@ static void update_io_stats_all_locked(void)
uid_entry = find_or_register_uid(uid); if (!uid_entry) continue; - add_uid_io_curr_stats(uid_entry, task); + add_uid_io_stats(uid_entry, task, UID_STATE_TOTAL_CURR); } while_each_thread(temp, task); rcu_read_unlock(); hash_for_each(hash_table, bkt, uid_entry, hash) { - io_bucket = &uid_entry->io[uid_entry->state]; - io_curr = &uid_entry->io[UID_STATE_TOTAL_CURR]; - io_last = &uid_entry->io[UID_STATE_TOTAL_LAST]; - - io_bucket->read_bytes += - io_curr->read_bytes - io_last->read_bytes; - io_bucket->write_bytes += - io_curr->write_bytes - io_last->write_bytes; - io_bucket->rchar += io_curr->rchar - io_last->rchar; - io_bucket->wchar += io_curr->wchar - io_last->wchar; - io_bucket->fsync += io_curr->fsync - io_last->fsync; - - io_last->read_bytes = io_curr->read_bytes; - io_last->write_bytes = io_curr->write_bytes; - io_last->rchar = io_curr->rchar; - io_last->wchar = io_curr->wchar; - io_last->fsync = io_curr->fsync; + compute_uid_io_bucket_stats(&uid_entry->io[uid_entry->state], + &uid_entry->io[UID_STATE_TOTAL_CURR], + &uid_entry->io[UID_STATE_TOTAL_LAST], + &uid_entry->io[UID_STATE_DEAD_TASKS]); } } -static void update_io_stats_uid_locked(uid_t target_uid) +static void update_io_stats_uid_locked(struct uid_entry *uid_entry) { - struct uid_entry *uid_entry; struct task_struct *task, *temp; - struct io_stats *io_bucket, *io_curr, *io_last; struct user_namespace *user_ns = current_user_ns(); - uid_entry = find_or_register_uid(target_uid); - if (!uid_entry) - return; - memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, sizeof(struct io_stats)); rcu_read_lock(); do_each_thread(temp, task) { - if (from_kuid_munged(user_ns, task_uid(task)) != target_uid) + if (from_kuid_munged(user_ns, task_uid(task)) != uid_entry->uid) continue; - add_uid_io_curr_stats(uid_entry, task); + add_uid_io_stats(uid_entry, task, UID_STATE_TOTAL_CURR); } while_each_thread(temp, task); rcu_read_unlock(); - io_bucket = &uid_entry->io[uid_entry->state]; - io_curr = &uid_entry->io[UID_STATE_TOTAL_CURR]; - io_last = &uid_entry->io[UID_STATE_TOTAL_LAST]; - - io_bucket->read_bytes += - io_curr->read_bytes - io_last->read_bytes; - io_bucket->write_bytes += - io_curr->write_bytes - io_last->write_bytes; - io_bucket->rchar += io_curr->rchar - io_last->rchar; - io_bucket->wchar += io_curr->wchar - io_last->wchar; - io_bucket->fsync += io_curr->fsync - io_last->fsync; - - io_last->read_bytes = io_curr->read_bytes; - io_last->write_bytes = io_curr->write_bytes; - io_last->rchar = io_curr->rchar; - io_last->wchar = io_curr->wchar; - io_last->fsync = io_curr->fsync; + compute_uid_io_bucket_stats(&uid_entry->io[uid_entry->state], + &uid_entry->io[UID_STATE_TOTAL_CURR], + &uid_entry->io[UID_STATE_TOTAL_LAST], + &uid_entry->io[UID_STATE_DEAD_TASKS]); } static int uid_io_show(struct seq_file *m, void *v) @@ -405,7 +383,7 @@ static ssize_t uid_procstat_write(struct file *file, return count; } - update_io_stats_uid_locked(uid); + update_io_stats_uid_locked(uid_entry); uid_entry->state = state; @@ -443,8 +421,7 @@ static int process_notifier(struct notifier_block *self, uid_entry->utime += utime; uid_entry->stime += stime; - update_io_stats_uid_locked(uid); - clean_uid_io_last_stats(uid_entry, task); + add_uid_io_stats(uid_entry, task, UID_STATE_DEAD_TASKS); exit: rt_mutex_unlock(&uid_lock); From d73d07673edbdbe78e1a7d00e7827ba9bfd86a59 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 29 May 2017 16:38:16 -0700 Subject: [PATCH 0807/3220] ANDROID: mnt: Fix next_descendent next_descendent did not properly handle the case where the 
initial mount had no slaves. In this case, we would look for the next slave, but since we don't have a master, the check for wrapping around to the start of the list will always fail. Instead, we check for this case, and ensure that we end the iteration when we come back to the root. Signed-off-by: Daniel Rosenberg Bug: 62094374 Change-Id: I43dfcee041aa3730cb4b9a1161418974ef84812e --- fs/pnode.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/pnode.c b/fs/pnode.c index b5f97c605d98..e4e428d621e9 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -504,9 +504,14 @@ static struct mount *next_descendent(struct mount *root, struct mount *cur) if (!IS_MNT_NEW(cur) && !list_empty(&cur->mnt_slave_list)) return first_slave(cur); do { - if (cur->mnt_slave.next != &cur->mnt_master->mnt_slave_list) - return next_slave(cur); - cur = cur->mnt_master; + struct mount *master = cur->mnt_master; + + if (!master || cur->mnt_slave.next != &master->mnt_slave_list) { + struct mount *next = next_slave(cur); + + return (next == root) ? NULL : next; + } + cur = master; } while (cur != root); return NULL; } From 7d16e880c62547936b431cde966d17e39e6e92e0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 8 May 2017 10:21:34 -0700 Subject: [PATCH 0808/3220] Revert "ext4: require encryption feature for EXT4_IOC_SET_ENCRYPTION_POLICY" This reverts commit e2968fb8e7980dccc199dac2593ad476db20969f. For various reasons, we've had to start enforcing upstream that ext4 encryption can only be used if the filesystem superblock has the EXT4_FEATURE_INCOMPAT_ENCRYPT flag set, as was the intended design. Unfortunately, Android isn't ready for this quite yet, since its userspace still needs to be updated to set the flag at mkfs time, or else fix it later with tune2fs. It will need some more time to be fixed properly, so for now to avoid breaking some devices, revert the kernel change. Bug: 36231741 Signed-off-by: Eric Biggers Change-Id: I30bd54afb68dbaf9801f8954099dffa90a2f8df1 --- fs/ext4/ioctl.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index c21826be1cb3..3a2594665b44 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -626,9 +626,6 @@ resizefs_out: struct ext4_encryption_policy policy; int err = 0; - if (!ext4_has_feature_encrypt(sb)) - return -EOPNOTSUPP; - if (copy_from_user(&policy, (struct ext4_encryption_policy __user *)arg, sizeof(policy))) { From df6e5af086d3834986af1f6e63f572edde9bc5ee Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 30 May 2017 14:46:26 -0700 Subject: [PATCH 0809/3220] ANDROID: hid: uhid: implement refcount for open and close Fix concurrent open and close activity sending a UHID_CLOSE while some consumers still have the device open. This is a temporary solution implementing reference counts for device open and close calls, pending a facility for this in the HID core, which is likely to appear in the future.
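The mechanics are the classic refcounted open/close: only the 0 -> 1 open forwards UHID_OPEN and only the 1 -> 0 close forwards UHID_CLOSE, both under one mutex. A generic sketch of the pattern (simplified; notify_open()/notify_close() are hypothetical stand-ins for the uhid event queueing):

	static DEFINE_MUTEX(open_mutex);
	static int open_count;

	static int dev_open(void)
	{
		int ret = 0;

		mutex_lock(&open_mutex);
		if (!open_count++) {
			ret = notify_open();	/* hypothetical backend call */
			if (ret)
				open_count--;	/* roll back on failure */
		}
		mutex_unlock(&open_mutex);
		return ret;
	}

	static void dev_close(void)
	{
		mutex_lock(&open_mutex);
		if (!--open_count)
			notify_close();		/* hypothetical backend call */
		mutex_unlock(&open_mutex);
	}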
[toddpoynor@google.com: commit text] Bug: 38448648 Signed-off-by: Todd Poynor Signed-off-by: Dmitry Torokhov Change-Id: I57413e42ec961a960a8ddc4942228df22c730d80 --- drivers/hid/uhid.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index 1a2032c2c1fb..690a9f0fa042 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -28,6 +28,8 @@ #define UHID_NAME "uhid" #define UHID_BUFSIZE 32 +static DEFINE_MUTEX(uhid_open_mutex); + struct uhid_device { struct mutex devlock; bool running; @@ -142,15 +144,26 @@ static void uhid_hid_stop(struct hid_device *hid) static int uhid_hid_open(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; + int retval = 0; - return uhid_queue_event(uhid, UHID_OPEN); + mutex_lock(&uhid_open_mutex); + if (!hid->open++) { + retval = uhid_queue_event(uhid, UHID_OPEN); + if (retval) + hid->open--; + } + mutex_unlock(&uhid_open_mutex); + return retval; } static void uhid_hid_close(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; - uhid_queue_event(uhid, UHID_CLOSE); + mutex_lock(&uhid_open_mutex); + if (!--hid->open) + uhid_queue_event(uhid, UHID_CLOSE); + mutex_unlock(&uhid_open_mutex); } static int uhid_hid_parse(struct hid_device *hid) From 5df19f969e4834548c4cf930c591581e237fccd0 Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Wed, 24 May 2017 10:28:27 +0800 Subject: [PATCH 0810/3220] ANDROID: Kconfig: add depends for UID_SYS_STATS uid_io depends on TASK_XACCT and TASK_IO_ACCOUNTING. Add these dependencies to the Kconfig entry so the code is only built when they are enabled. Change-Id: Ie6bf57ec7c2eceffadf4da0fc2aca001ce10c36e Signed-off-by: Ganesh Mahendran --- drivers/misc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 5847d3be0835..9e649992c177 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -527,7 +527,7 @@ config VEXPRESS_SYSCFG config UID_SYS_STATS bool "Per-UID statistics" - depends on PROFILING + depends on PROFILING && TASK_XACCT && TASK_IO_ACCOUNTING help Per UID based cpu time statistics exported to /proc/uid_cputime Per UID based io statistics exported to /proc/uid_io From f02702dcf231c3258aabd702023286f6c01aaa21 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Fri, 11 Nov 2016 14:04:43 -0800 Subject: [PATCH 0811/3220] sched: backport cpufreq hooks from 4.9-rc4 The scheduler cpufreq hooks are required by the schedutil cpufreq governor.
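A governor consumes these hooks by registering one callback per CPU; a minimal sketch of attach/detach using the API this patch adds (the callback body is a placeholder):

	static void my_update_util(struct update_util_data *data, u64 time,
				   unsigned int flags)
	{
		/* Runs in RCU-sched read-side context: must not sleep. */
		if (flags & SCHED_CPUFREQ_RT_DL)
			return;	/* e.g. request the maximum frequency */
	}

	static DEFINE_PER_CPU(struct update_util_data, my_util_data);

	static void my_gov_start(int cpu)
	{
		cpufreq_add_update_util_hook(cpu, &per_cpu(my_util_data, cpu),
					     my_update_util);
	}

	static void my_gov_stop(int cpu)
	{
		cpufreq_remove_update_util_hook(cpu);
		/* per the API comment: avoid use-after-free of my_util_data */
		synchronize_sched();
	}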
Change-Id: Ied6c46262bb33b7e81bbb3d3d2761124e0c676b7 Signed-off-by: Steve Muckle [trivial cherry-picking fixes] Signed-off-by: Juri Lelli Signed-off-by: Chris Redpath --- include/linux/sched.h | 17 +++++++++++ kernel/sched/Makefile | 1 + kernel/sched/cpufreq.c | 63 +++++++++++++++++++++++++++++++++++++++++ kernel/sched/deadline.c | 3 ++ kernel/sched/fair.c | 59 +++++++++++++++++++++++++++++++++----- kernel/sched/rt.c | 3 ++ kernel/sched/sched.h | 52 ++++++++++++++++++++++++++++++++++ 7 files changed, 191 insertions(+), 7 deletions(-) create mode 100644 kernel/sched/cpufreq.c diff --git a/include/linux/sched.h b/include/linux/sched.h index 79f70e5bae08..bc23d15244db 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3291,4 +3291,21 @@ static inline unsigned long rlimit_max(unsigned int limit) return task_rlimit_max(current, limit); } +#define SCHED_CPUFREQ_RT (1U << 0) +#define SCHED_CPUFREQ_DL (1U << 1) +#define SCHED_CPUFREQ_IOWAIT (1U << 2) + +#define SCHED_CPUFREQ_RT_DL (SCHED_CPUFREQ_RT | SCHED_CPUFREQ_DL) + +#ifdef CONFIG_CPU_FREQ +struct update_util_data { + void (*func)(struct update_util_data *data, u64 time, unsigned int flags); +}; + +void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, + void (*func)(struct update_util_data *data, u64 time, + unsigned int flags)); +void cpufreq_remove_update_util_hook(int cpu); +#endif /* CONFIG_CPU_FREQ */ + #endif diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 623ce4bde0d5..f2d49474aab1 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -21,4 +21,5 @@ obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_SCHED_TUNE) += tune.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c new file mode 100644 index 000000000000..dbc51442ecbc --- /dev/null +++ b/kernel/sched/cpufreq.c @@ -0,0 +1,63 @@ +/* + * Scheduler code and data structures related to cpufreq. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "sched.h" + +DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. + * @cpu: The CPU to set the pointer for. + * @data: New pointer value. + * @func: Callback function to set for the CPU. + * + * Set and publish the update_util_data pointer for the given CPU. + * + * The update_util_data pointer of @cpu is set to @data and the callback + * function pointer in the target struct update_util_data is set to @func. + * That function will be called by cpufreq_update_util() from RCU-sched + * read-side critical sections, so it must not sleep. @data will always be + * passed to it as the first argument which allows the function to get to the + * target update_util_data structure and its container. + * + * The update_util_data pointer of @cpu must be NULL when this function is + * called or it will WARN() and return with no effect. 
+ */ +void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, + void (*func)(struct update_util_data *data, u64 time, + unsigned int flags)) +{ + if (WARN_ON(!data || !func)) + return; + + if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu))) + return; + + data->func = func; + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); +} +EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook); + +/** + * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer. + * @cpu: The CPU to clear the pointer for. + * + * Clear the update_util_data pointer for the given CPU. + * + * Callers must use RCU-sched callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_sched() + * right after this function to avoid use-after-free. + */ +void cpufreq_remove_update_util_hook(int cpu) +{ + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL); +} +EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f10b1cb255b2..9ec7f19b5020 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -753,6 +753,9 @@ static void update_curr_dl(struct rq *rq) if (unlikely((s64)delta_exec <= 0)) return; + /* kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6f353de3f390..baba2d34c34e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2711,6 +2711,29 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +{ + if (&this_rq()->cfs == cfs_rq) { + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). 
+ */ + cpufreq_update_util(rq_of(cfs_rq), 0); + } +} + static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); /* @@ -2731,10 +2754,11 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); } while (0) /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, + bool update_freq) { struct sched_avg *sa = &cfs_rq->avg; - int decayed, removed = 0; + int decayed, removed = 0, removed_util = 0; if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); @@ -2747,6 +2771,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); + removed_util = 1; } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -2761,6 +2786,9 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) if (cfs_rq == &rq_of(cfs_rq)->cfs) trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); + if (update_freq && (decayed || removed_util)) + cfs_rq_util_change(cfs_rq); + return decayed || removed; } @@ -2779,7 +2807,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) update_tg_load_avg(cfs_rq, 0); if (entity_is_task(se)) @@ -2811,6 +2839,8 @@ skip_aging: cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + + cfs_rq_util_change(cfs_rq); } static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -2823,6 +2853,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + + cfs_rq_util_change(cfs_rq); } /* Add the load generated by se into cfs_rq's load average */ @@ -2840,7 +2872,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->curr == se, NULL); } - decayed = update_cfs_rq_load_avg(now, cfs_rq); + decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; @@ -2940,7 +2972,11 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) {} +static inline void update_load_avg(struct sched_entity *se, int update_tg) +{ + cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); +} + static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void @@ -4257,6 +4293,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) int task_wakeup = flags & ENQUEUE_WAKEUP; #endif + /* + * If in_iowait is set, the code below may not trigger any cpufreq + * utilization updates, so do it here explicitly with the IOWAIT flag + * passed. 
+ */ + if (p->in_iowait) + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); + for_each_sched_entity(se) { if (se->on_rq) break; @@ -6923,7 +6967,8 @@ static void update_blocked_averages(int cpu) if (throttled_hierarchy(cfs_rq)) continue; - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, + true)) update_tg_load_avg(cfs_rq, 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -6984,7 +7029,7 @@ static inline void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); raw_spin_unlock_irqrestore(&rq->lock, flags); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 541b8494450e..6bb51d62dca4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1002,6 +1002,9 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec <= 0)) return; + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0386a2cdb008..237b0bcdd304 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2038,3 +2038,55 @@ static inline void account_reset_rq(struct rq *rq) rq->prev_steal_time_rq = 0; #endif } + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @rq: Runqueue to carry out the update for. + * @flags: Update reason flags. + * + * This function is called by the scheduler on the CPU whose utilization is + * being updated. + * + * It can only be called from RCU-sched read-side critical sections. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. 
+ */ +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data) + data->func(data, rq_clock(rq), flags); +} + +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) +{ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_update_util(rq, flags); +} +#else +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} +#endif /* CONFIG_CPU_FREQ */ + +#ifdef arch_scale_freq_capacity +#ifndef arch_scale_freq_invariant +#define arch_scale_freq_invariant() (true) +#endif +#else /* arch_scale_freq_capacity */ +#define arch_scale_freq_invariant() (false) +#endif From 6bc6115c16caeb19d2d946eef8ca00d00c154118 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Fri, 11 Nov 2016 17:14:39 -0800 Subject: [PATCH 0812/3220] BACKPORT: cpufreq: schedutil: New governor based on scheduler utilization data Add a new cpufreq scaling governor, called "schedutil", that uses scheduler-provided CPU utilization information as input for making its decisions. Doing that is possible after commit 34e2c55 (cpufreq: Add mechanism for registering utilization update callbacks) that introduced cpufreq_update_util() called by the scheduler on utilization changes (from CFS) and RT/DL task status updates. In particular, CPU frequency scaling decisions may be based on the utilization data passed to cpufreq_update_util() by CFS. The new governor is relatively simple. The frequency selection formula used by it depends on whether or not the utilization is frequency-invariant. In the frequency-invariant case the new CPU frequency is given by next_freq = 1.25 * max_freq * util / max where util and max are the last two arguments of cpufreq_update_util(). In turn, if util is not frequency-invariant, the maximum frequency in the above formula is replaced with the current frequency of the CPU: next_freq = 1.25 * curr_freq * util / max The coefficient 1.25 corresponds to the frequency tipping point at (util / max) = 0.8. All of the computations are carried out in the utilization update handlers provided by the new governor. One of those handlers is used for cpufreq policies shared between multiple CPUs and the other one is for policies with one CPU only (and therefore it doesn't need to use any extra synchronization means). The governor supports fast frequency switching if that is supported by the cpufreq driver in use and possible for the given policy. In the fast switching case, all operations of the governor take place in its utilization update handlers. If fast switching cannot be used, the frequency switch operations are carried out with the help of a work item which only calls __cpufreq_driver_target() (under a mutex) to trigger a frequency update (to a value already computed beforehand in one of the utilization update handlers). Currently, the governor treats all of the RT and DL tasks as "unknown utilization" and sets the frequency to the allowed maximum when updated from the RT or DL sched classes. That heavy-handed approach should be replaced with something more subtle and specifically targeted at RT and DL tasks. The governor shares some tunables management code with the "ondemand" and "conservative" governors and uses some common definitions from cpufreq_governor.h, but apart from that it is stand-alone.
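For illustration only: the selection rule above is simple integer arithmetic. A minimal sketch of it in plain C (the helper name is illustrative, not the governor's) matches the freq + (freq >> 2) form used by get_next_freq() in the diff below:

        /*
         * Illustrative sketch: computes 1.25 * ref_freq * util / max
         * without floating point. ref_freq is max_freq when the
         * utilization is frequency-invariant, else the current frequency.
         */
        static unsigned int next_freq_sketch(unsigned int ref_freq,
                                             unsigned long util,
                                             unsigned long max)
        {
                return (ref_freq + (ref_freq >> 2)) * util / max;
        }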
Change-Id: I03876e622768e4b3ee4dc28682af7cce771f2f4c Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) (cherry-picked from 9bdcb44e391da5c41b98573bf0305a0e0b1c9569) [ Backport the schedutil cpufreq governor from 4.9. Some cpufreq tunable infrastructure as well as the resolve_freq API is also backported as those are dependencies] Signed-off-by: Steve Muckle [trivial cherry-picking fixes] Signed-off-by: Juri Lelli [fixed default governor machinery] Signed-off-by: Chris Redpath --- drivers/cpufreq/Kconfig | 26 + drivers/cpufreq/Makefile | 2 +- drivers/cpufreq/cpufreq.c | 32 ++ drivers/cpufreq/cpufreq_governor_attr_set.c | 84 +++ include/linux/cpufreq.h | 52 ++ kernel/sched/Makefile | 1 + kernel/sched/cpufreq_schedutil.c | 601 ++++++++++++++++++++ 7 files changed, 797 insertions(+), 1 deletion(-) create mode 100644 drivers/cpufreq/cpufreq_governor_attr_set.c create mode 100644 kernel/sched/cpufreq_schedutil.c diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index d43c401ff190..91b05e2b8799 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -120,6 +120,15 @@ config CPU_FREQ_DEFAULT_GOV_SCHED cpu frequency using CPU utilization estimates from the scheduler. +config CPU_FREQ_DEFAULT_GOV_SCHEDUTIL + bool "schedutil" + depends on SMP + select CPU_FREQ_GOV_SCHEDUTIL + select CPU_FREQ_GOV_PERFORMANCE + help + Use the 'schedutil' CPUFreq governor by default. If unsure, + have a look at the help section of that governor. The fallback + governor will be 'performance'. endchoice config CPU_FREQ_GOV_PERFORMANCE @@ -228,6 +237,23 @@ config CPU_FREQ_GOV_SCHED If in doubt, say N. +config CPU_FREQ_GOV_SCHEDUTIL + bool "'schedutil' cpufreq policy governor" + depends on CPU_FREQ && SMP + select CPU_FREQ_GOV_ATTR_SET + select IRQ_WORK + help + This governor makes decisions based on the utilization data provided + by the scheduler. It sets the CPU frequency to be proportional to + the utilization/capacity ratio coming from the scheduler. If the + utilization is frequency-invariant, the new frequency is also + proportional to the maximum available frequency. If that is not the + case, it is proportional to the current frequency of the CPU. The + frequency tipping point is at utilization/capacity equal to 80% in + both cases. + + If in doubt, say N. + comment "CPU frequency scaling drivers" config CPUFREQ_DT diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index b02ae1463a15..04e6324fd638 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -1,5 +1,5 @@ # CPUfreq core -obj-$(CONFIG_CPU_FREQ) += cpufreq.o freq_table.o +obj-$(CONFIG_CPU_FREQ) += cpufreq.o freq_table.o cpufreq_governor_attr_set.o # CPUfreq stats obj-$(CONFIG_CPU_FREQ_STAT) += cpufreq_stats.o diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 0ceb0a889ebf..56d12ed4f38c 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -531,6 +531,38 @@ void cpufreq_freq_transition_end(struct cpufreq_policy *policy, } EXPORT_SYMBOL_GPL(cpufreq_freq_transition_end); +/** + * cpufreq_driver_resolve_freq - Map a target frequency to a driver-supported + * one. + * @target_freq: target frequency to resolve. + * + * The target to driver frequency mapping is cached in the policy. + * + * Return: Lowest driver-supported frequency greater than or equal to the + * given target_freq, subject to policy (min/max) and driver limitations. 
+ */ +unsigned int cpufreq_driver_resolve_freq(struct cpufreq_policy *policy, + unsigned int target_freq) +{ + target_freq = clamp_val(target_freq, policy->min, policy->max); + policy->cached_target_freq = target_freq; + + if (cpufreq_driver->target_index) { + int idx, rv; + + rv = cpufreq_frequency_table_target(policy, policy->freq_table, + target_freq, + CPUFREQ_RELATION_L, + &idx); + if (rv) + return target_freq; + policy->cached_resolved_idx = idx; + return policy->freq_table[idx].frequency; + } + + return target_freq; +} +EXPORT_SYMBOL_GPL(cpufreq_driver_resolve_freq); /********************************************************************* * SYSFS INTERFACE * diff --git a/drivers/cpufreq/cpufreq_governor_attr_set.c b/drivers/cpufreq/cpufreq_governor_attr_set.c new file mode 100644 index 000000000000..52841f807a7e --- /dev/null +++ b/drivers/cpufreq/cpufreq_governor_attr_set.c @@ -0,0 +1,84 @@ +/* + * Abstract code for CPUFreq governor tunable sysfs attributes. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "cpufreq_governor.h" + +static inline struct gov_attr_set *to_gov_attr_set(struct kobject *kobj) +{ + return container_of(kobj, struct gov_attr_set, kobj); +} + +static inline struct governor_attr *to_gov_attr(struct attribute *attr) +{ + return container_of(attr, struct governor_attr, attr); +} + +static ssize_t governor_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct governor_attr *gattr = to_gov_attr(attr); + + return gattr->show(to_gov_attr_set(kobj), buf); +} + +static ssize_t governor_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct gov_attr_set *attr_set = to_gov_attr_set(kobj); + struct governor_attr *gattr = to_gov_attr(attr); + int ret; + + mutex_lock(&attr_set->update_lock); + ret = attr_set->usage_count ? 
gattr->store(attr_set, buf, count) : -EBUSY; + mutex_unlock(&attr_set->update_lock); + return ret; +} + +const struct sysfs_ops governor_sysfs_ops = { + .show = governor_show, + .store = governor_store, +}; +EXPORT_SYMBOL_GPL(governor_sysfs_ops); + +void gov_attr_set_init(struct gov_attr_set *attr_set, struct list_head *list_node) +{ + INIT_LIST_HEAD(&attr_set->policy_list); + mutex_init(&attr_set->update_lock); + attr_set->usage_count = 1; + list_add(list_node, &attr_set->policy_list); +} +EXPORT_SYMBOL_GPL(gov_attr_set_init); + +void gov_attr_set_get(struct gov_attr_set *attr_set, struct list_head *list_node) +{ + mutex_lock(&attr_set->update_lock); + attr_set->usage_count++; + list_add(list_node, &attr_set->policy_list); + mutex_unlock(&attr_set->update_lock); +} +EXPORT_SYMBOL_GPL(gov_attr_set_get); + +unsigned int gov_attr_set_put(struct gov_attr_set *attr_set, struct list_head *list_node) +{ + unsigned int count; + + mutex_lock(&attr_set->update_lock); + list_del(list_node); + count = --attr_set->usage_count; + mutex_unlock(&attr_set->update_lock); + if (count) + return count; + + kobject_put(&attr_set->kobj); + mutex_destroy(&attr_set->update_lock); + return 0; +} +EXPORT_SYMBOL_GPL(gov_attr_set_put); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 60571292a802..b144e7445477 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -107,6 +107,22 @@ struct cpufreq_policy { */ struct rw_semaphore rwsem; + + /* + * Fast switch flags: + * - fast_switch_possible should be set by the driver if it can + * guarantee that frequency can be changed on any CPU sharing the + * policy and that the change will affect all of the policy CPUs then. + * - fast_switch_enabled is to be set by governors that support fast + * frequency switching with the help of cpufreq_enable_fast_switch(). + */ + bool fast_switch_possible; + bool fast_switch_enabled; + + /* Cached frequency lookup from cpufreq_driver_resolve_freq.
*/ + unsigned int cached_target_freq; + int cached_resolved_idx; + /* Synchronization for frequency transitions */ bool transition_ongoing; /* Tracks transition status */ spinlock_t transition_lock; @@ -471,6 +487,8 @@ int cpufreq_driver_target(struct cpufreq_policy *policy, int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation); +unsigned int cpufreq_driver_resolve_freq(struct cpufreq_policy *policy, + unsigned int target_freq); int cpufreq_register_governor(struct cpufreq_governor *governor); void cpufreq_unregister_governor(struct cpufreq_governor *governor); @@ -502,8 +520,42 @@ extern struct cpufreq_governor cpufreq_gov_interactive; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED) extern struct cpufreq_governor cpufreq_gov_sched; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_sched) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL) +extern struct cpufreq_governor cpufreq_gov_schedutil; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_schedutil) #endif +static inline void cpufreq_policy_apply_limits(struct cpufreq_policy *policy) +{ + if (policy->max < policy->cur) + __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); + else if (policy->min > policy->cur) + __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); +} + +/* Governor attribute set */ +struct gov_attr_set { + struct kobject kobj; + struct list_head policy_list; + struct mutex update_lock; + int usage_count; +}; + +/* sysfs ops for cpufreq governors */ +extern const struct sysfs_ops governor_sysfs_ops; + +void gov_attr_set_init(struct gov_attr_set *attr_set, struct list_head *list_node); +void gov_attr_set_get(struct gov_attr_set *attr_set, struct list_head *list_node); +unsigned int gov_attr_set_put(struct gov_attr_set *attr_set, struct list_head *list_node); + +/* Governor sysfs attribute */ +struct governor_attr { + struct attribute attr; + ssize_t (*show)(struct gov_attr_set *attr_set, char *buf); + ssize_t (*store)(struct gov_attr_set *attr_set, const char *buf, + size_t count); +}; + /********************************************************************* * FREQUENCY TABLE HELPERS * *********************************************************************/ diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index f2d49474aab1..ca0d94096170 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_SCHED_TUNE) += tune.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c new file mode 100644 index 000000000000..7e42a1d6d88d --- /dev/null +++ b/kernel/sched/cpufreq_schedutil.c @@ -0,0 +1,601 @@ +/* + * CPUFreq governor based on scheduler-provided CPU utilization data. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpufreq.h> +#include <linux/slab.h> +#include <trace/events/power.h> + +#include "sched.h" + +/* Stub out fast switch routines present on mainline to reduce the backport + * overhead.
*/ +#define cpufreq_driver_fast_switch(x, y) 0 +#define cpufreq_enable_fast_switch(x) +#define cpufreq_disable_fast_switch(x) + +struct sugov_tunables { + struct gov_attr_set attr_set; + unsigned int rate_limit_us; +}; + +struct sugov_policy { + struct cpufreq_policy *policy; + + struct sugov_tunables *tunables; + struct list_head tunables_hook; + + raw_spinlock_t update_lock; /* For shared policies */ + u64 last_freq_update_time; + s64 freq_update_delay_ns; + unsigned int next_freq; + + /* The next fields are only needed if fast switch cannot be used. */ + struct irq_work irq_work; + struct work_struct work; + struct mutex work_lock; + bool work_in_progress; + + bool need_freq_update; +}; + +struct sugov_cpu { + struct update_util_data update_util; + struct sugov_policy *sg_policy; + + unsigned int cached_raw_freq; + unsigned long iowait_boost; + unsigned long iowait_boost_max; + u64 last_update; + + /* The fields below are only needed when sharing a policy. */ + unsigned long util; + unsigned long max; + unsigned int flags; +}; + +static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); + +/************************ Governor internals ***********************/ + +static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) +{ + s64 delta_ns; + + if (sg_policy->work_in_progress) + return false; + + if (unlikely(sg_policy->need_freq_update)) { + sg_policy->need_freq_update = false; + /* + * This happens when limits change, so forget the previous + * next_freq value and force an update. + */ + sg_policy->next_freq = UINT_MAX; + return true; + } + + delta_ns = time - sg_policy->last_freq_update_time; + return delta_ns >= sg_policy->freq_update_delay_ns; +} + +static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + struct cpufreq_policy *policy = sg_policy->policy; + + sg_policy->last_freq_update_time = time; + + if (policy->fast_switch_enabled) { + if (sg_policy->next_freq == next_freq) { + trace_cpu_frequency(policy->cur, smp_processor_id()); + return; + } + sg_policy->next_freq = next_freq; + next_freq = cpufreq_driver_fast_switch(policy, next_freq); + if (next_freq == CPUFREQ_ENTRY_INVALID) + return; + + policy->cur = next_freq; + trace_cpu_frequency(next_freq, smp_processor_id()); + } else if (sg_policy->next_freq != next_freq) { + sg_policy->next_freq = next_freq; + sg_policy->work_in_progress = true; + irq_work_queue(&sg_policy->irq_work); + } +} + +/** + * get_next_freq - Compute a new frequency for a given cpufreq policy. + * @sg_cpu: schedutil cpu object to compute the new frequency for. + * @util: Current CPU utilization. + * @max: CPU capacity. + * + * If the utilization is frequency-invariant, choose the new frequency to be + * proportional to it, that is + * + * next_freq = C * max_freq * util / max + * + * Otherwise, approximate the would-be frequency-invariant utilization by + * util_raw * (curr_freq / max_freq) which leads to + * + * next_freq = C * curr_freq * util_raw / max + * + * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8. + * + * The lowest driver-supported frequency which is equal or greater than the raw + * next_freq (as calculated above) is returned, subject to policy min/max and + * cpufreq driver limitations. + */ +static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, + unsigned long max) +{ + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; + unsigned int freq = arch_scale_freq_invariant() ? 
+ policy->cpuinfo.max_freq : policy->cur; + + freq = (freq + (freq >> 2)) * util / max; + + if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + return sg_policy->next_freq; + sg_cpu->cached_raw_freq = freq; + return cpufreq_driver_resolve_freq(policy, freq); +} + +static void sugov_get_util(unsigned long *util, unsigned long *max) +{ + struct rq *rq = this_rq(); + unsigned long cfs_max; + + cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id()); + + *util = min(rq->cfs.avg.util_avg, cfs_max); + *max = cfs_max; +} + +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, + unsigned int flags) +{ + if (flags & SCHED_CPUFREQ_IOWAIT) { + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + } else if (sg_cpu->iowait_boost) { + s64 delta_ns = time - sg_cpu->last_update; + + /* Clear iowait_boost if the CPU appears to have been idle. */ + if (delta_ns > TICK_NSEC) + sg_cpu->iowait_boost = 0; + } +} + +static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, + unsigned long *max) +{ + unsigned long boost_util = sg_cpu->iowait_boost; + unsigned long boost_max = sg_cpu->iowait_boost_max; + + if (!boost_util) + return; + + if (*util * boost_max < *max * boost_util) { + *util = boost_util; + *max = boost_max; + } + sg_cpu->iowait_boost >>= 1; +} + +static void sugov_update_single(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; + unsigned long util, max; + unsigned int next_f; + + sugov_set_iowait_boost(sg_cpu, time, flags); + sg_cpu->last_update = time; + + if (!sugov_should_update_freq(sg_policy, time)) + return; + + if (flags & SCHED_CPUFREQ_RT_DL) { + next_f = policy->cpuinfo.max_freq; + } else { + sugov_get_util(&util, &max); + sugov_iowait_boost(sg_cpu, &util, &max); + next_f = get_next_freq(sg_cpu, util, max); + } + sugov_update_commit(sg_policy, time, next_f); +} + +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, + unsigned long util, unsigned long max, + unsigned int flags) +{ + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; + unsigned int max_f = policy->cpuinfo.max_freq; + u64 last_freq_update_time = sg_policy->last_freq_update_time; + unsigned int j; + + if (flags & SCHED_CPUFREQ_RT_DL) + return max_f; + + sugov_iowait_boost(sg_cpu, &util, &max); + + for_each_cpu(j, policy->cpus) { + struct sugov_cpu *j_sg_cpu; + unsigned long j_util, j_max; + s64 delta_ns; + + if (j == smp_processor_id()) + continue; + + j_sg_cpu = &per_cpu(sugov_cpu, j); + /* + * If the CPU utilization was last updated before the previous + * frequency update and the time elapsed between the last update + * of the CPU utilization and the last frequency update is long + * enough, don't take the CPU into account as it probably is + * idle now (and clear iowait_boost for it).
+ */ + delta_ns = last_freq_update_time - j_sg_cpu->last_update; + if (delta_ns > TICK_NSEC) { + j_sg_cpu->iowait_boost = 0; + continue; + } + if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) + return max_f; + + j_util = j_sg_cpu->util; + j_max = j_sg_cpu->max; + if (j_util * max > j_max * util) { + util = j_util; + max = j_max; + } + + sugov_iowait_boost(j_sg_cpu, &util, &max); + } + + return get_next_freq(sg_cpu, util, max); +} + +static void sugov_update_shared(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned long util, max; + unsigned int next_f; + + sugov_get_util(&util, &max); + + raw_spin_lock(&sg_policy->update_lock); + + sg_cpu->util = util; + sg_cpu->max = max; + sg_cpu->flags = flags; + + sugov_set_iowait_boost(sg_cpu, time, flags); + sg_cpu->last_update = time; + + if (sugov_should_update_freq(sg_policy, time)) { + next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); + sugov_update_commit(sg_policy, time, next_f); + } + + raw_spin_unlock(&sg_policy->update_lock); +} + +static void sugov_work(struct work_struct *work) +{ + struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); + + mutex_lock(&sg_policy->work_lock); + __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq, + CPUFREQ_RELATION_L); + mutex_unlock(&sg_policy->work_lock); + + sg_policy->work_in_progress = false; +} + +static void sugov_irq_work(struct irq_work *irq_work) +{ + struct sugov_policy *sg_policy; + + sg_policy = container_of(irq_work, struct sugov_policy, irq_work); + schedule_work_on(smp_processor_id(), &sg_policy->work); +} + +/************************** sysfs interface ************************/ + +static struct sugov_tunables *global_tunables; +static DEFINE_MUTEX(global_tunables_lock); + +static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set) +{ + return container_of(attr_set, struct sugov_tunables, attr_set); +} + +static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->rate_limit_us); +} + +static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, + size_t count) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + struct sugov_policy *sg_policy; + unsigned int rate_limit_us; + + if (kstrtouint(buf, 10, &rate_limit_us)) + return -EINVAL; + + tunables->rate_limit_us = rate_limit_us; + + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) + sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC; + + return count; +} + +static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); + +static struct attribute *sugov_attributes[] = { + &rate_limit_us.attr, + NULL +}; + +static struct kobj_type sugov_tunables_ktype = { + .default_attrs = sugov_attributes, + .sysfs_ops = &governor_sysfs_ops, +}; + +/********************** cpufreq governor interface *********************/ +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL +static +#endif +struct cpufreq_governor cpufreq_gov_schedutil; + +static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy; + + sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL); + if (!sg_policy) + return NULL; + + sg_policy->policy = policy; + init_irq_work(&sg_policy->irq_work, sugov_irq_work); + 
INIT_WORK(&sg_policy->work, sugov_work); + mutex_init(&sg_policy->work_lock); + raw_spin_lock_init(&sg_policy->update_lock); + return sg_policy; +} + +static void sugov_policy_free(struct sugov_policy *sg_policy) +{ + mutex_destroy(&sg_policy->work_lock); + kfree(sg_policy); +} + +static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) +{ + struct sugov_tunables *tunables; + + tunables = kzalloc(sizeof(*tunables), GFP_KERNEL); + if (tunables) { + gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook); + if (!have_governor_per_policy()) + global_tunables = tunables; + } + return tunables; +} + +static void sugov_tunables_free(struct sugov_tunables *tunables) +{ + if (!have_governor_per_policy()) + global_tunables = NULL; + + kfree(tunables); +} + +static int sugov_init(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy; + struct sugov_tunables *tunables; + unsigned int lat; + int ret = 0; + + /* State should be equivalent to EXIT */ + if (policy->governor_data) + return -EBUSY; + + sg_policy = sugov_policy_alloc(policy); + if (!sg_policy) + return -ENOMEM; + + mutex_lock(&global_tunables_lock); + + if (global_tunables) { + if (WARN_ON(have_governor_per_policy())) { + ret = -EINVAL; + goto free_sg_policy; + } + policy->governor_data = sg_policy; + sg_policy->tunables = global_tunables; + + gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook); + goto out; + } + + tunables = sugov_tunables_alloc(sg_policy); + if (!tunables) { + ret = -ENOMEM; + goto free_sg_policy; + } + + tunables->rate_limit_us = 20 * USEC_PER_MSEC; + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; + if (lat) + tunables->rate_limit_us *= lat; + + policy->governor_data = sg_policy; + sg_policy->tunables = tunables; + + ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype, + get_governor_parent_kobj(policy), "%s", + cpufreq_gov_schedutil.name); + if (ret) + goto fail; + + out: + mutex_unlock(&global_tunables_lock); + + cpufreq_enable_fast_switch(policy); + return 0; + + fail: + policy->governor_data = NULL; + sugov_tunables_free(tunables); + + free_sg_policy: + mutex_unlock(&global_tunables_lock); + + sugov_policy_free(sg_policy); + pr_err("initialization failed (error %d)\n", ret); + return ret; +} + +static int sugov_exit(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + struct sugov_tunables *tunables = sg_policy->tunables; + unsigned int count; + + cpufreq_disable_fast_switch(policy); + + mutex_lock(&global_tunables_lock); + + count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); + policy->governor_data = NULL; + if (!count) + sugov_tunables_free(tunables); + + mutex_unlock(&global_tunables_lock); + + sugov_policy_free(sg_policy); + + return 0; +} + +static int sugov_start(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + unsigned int cpu; + + sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; + sg_policy->last_freq_update_time = 0; + sg_policy->next_freq = UINT_MAX; + sg_policy->work_in_progress = false; + sg_policy->need_freq_update = false; + + for_each_cpu(cpu, policy->cpus) { + struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + + sg_cpu->sg_policy = sg_policy; + if (policy_is_shared(policy)) { + sg_cpu->util = 0; + sg_cpu->max = 0; + sg_cpu->flags = SCHED_CPUFREQ_RT; + sg_cpu->last_update = 0; + sg_cpu->cached_raw_freq = 0; + sg_cpu->iowait_boost = 0; + 
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + sugov_update_shared); + } else { + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + sugov_update_single); + } + } + return 0; +} + +static int sugov_stop(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + unsigned int cpu; + + for_each_cpu(cpu, policy->cpus) + cpufreq_remove_update_util_hook(cpu); + + synchronize_sched(); + + irq_work_sync(&sg_policy->irq_work); + cancel_work_sync(&sg_policy->work); + + return 0; +} + +static int sugov_limits(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + + if (!policy->fast_switch_enabled) { + mutex_lock(&sg_policy->work_lock); + cpufreq_policy_apply_limits(policy); + mutex_unlock(&sg_policy->work_lock); + } + + sg_policy->need_freq_update = true; + + return 0; +} + +static int cpufreq_schedutil_cb(struct cpufreq_policy *policy, + unsigned int event) +{ + switch(event) { + case CPUFREQ_GOV_POLICY_INIT: + return sugov_init(policy); + case CPUFREQ_GOV_POLICY_EXIT: + return sugov_exit(policy); + case CPUFREQ_GOV_START: + return sugov_start(policy); + case CPUFREQ_GOV_STOP: + return sugov_stop(policy); + case CPUFREQ_GOV_LIMITS: + return sugov_limits(policy); + default: + BUG(); + } +} + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL +static +#endif +struct cpufreq_governor cpufreq_gov_schedutil = { + .name = "schedutil", + .governor = cpufreq_schedutil_cb, + .owner = THIS_MODULE, +}; + +static int __init sugov_register(void) +{ + return cpufreq_register_governor(&cpufreq_gov_schedutil); +} +fs_initcall(sugov_register); From ca7b7d3c9995a12d57a7e7df6bb567a63b6cad00 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Mon, 24 Oct 2016 18:22:19 -0700 Subject: [PATCH 0813/3220] sched/cpufreq: fix tunables for schedfreq governor The schedfreq governor does not currently handle cpufreq drivers which use a global set of tunables (!have_governor_per_policy). For example on x86 and using the acpi cpufreq driver, doing this cat /sys/devices/system/cpu/cpufreq/sched/up_throttle_nsec will result in a bad pointer access. Update the tunable code using the upstream schedutil tunable code by Rafael Wysocki as a guide. Includes a partial backport of the reorganized cpufreq tunable infrastructure. 
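For orientation, the lifecycle of the shared tunables object boils down to the refcounting sketched below, a simplified rendering of the diff that follows (locking and error paths elided; gd is the governor's per-policy data):

        /* First user allocates the tunables; publish them globally when
         * the driver has one set of tunables for all policies. */
        if (!global_tunables) {
                gd->tunables = kzalloc(sizeof(*gd->tunables), GFP_KERNEL);
                gov_attr_set_init(&gd->tunables->attr_set, &gd->tunables_hook);
                if (!have_governor_per_policy())
                        global_tunables = gd->tunables;
        } else {
                gd->tunables = global_tunables;
                gov_attr_set_get(&global_tunables->attr_set, &gd->tunables_hook);
        }

        /* On exit, the last user frees the tunables. */
        if (!gov_attr_set_put(&gd->tunables->attr_set, &gd->tunables_hook))
                kfree(gd->tunables);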
Change-Id: I7e6f8de1dac297077ad43f37dd2f6ddbfe921c98 Signed-off-by: Steve Muckle [fixed cherry-pick issue] Signed-off-by: Juri Lelli [fixed cherry-pick issue] Signed-off-by: Thierry Strudel --- kernel/sched/cpufreq_sched.c | 218 +++++++++++++++++++---------------- 1 file changed, 116 insertions(+), 102 deletions(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index d751bc2d0d6e..f10d9f7d6d07 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -32,6 +32,12 @@ static struct cpufreq_governor cpufreq_gov_sched; static DEFINE_PER_CPU(unsigned long, enabled); DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); +struct gov_tunables { + struct gov_attr_set attr_set; + unsigned int up_throttle_nsec; + unsigned int down_throttle_nsec; +}; + /** * gov_data - per-policy data internal to the governor * @up_throttle: next throttling period expiry if increasing OPP @@ -53,8 +59,8 @@ DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); struct gov_data { ktime_t up_throttle; ktime_t down_throttle; - unsigned int up_throttle_nsec; - unsigned int down_throttle_nsec; + struct gov_tunables *tunables; + struct list_head tunables_hook; struct task_struct *task; struct irq_work irq_work; unsigned int requested_freq; @@ -71,8 +77,10 @@ static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); - gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec); - gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec); + gd->up_throttle = ktime_add_ns(ktime_get(), + gd->tunables->up_throttle_nsec); + gd->down_throttle = ktime_add_ns(ktime_get(), + gd->tunables->down_throttle_nsec); up_write(&policy->rwsem); } @@ -262,12 +270,70 @@ static inline void clear_sched_freq(void) static_key_slow_dec(&__sched_freq); } -static struct attribute_group sched_attr_group_gov_pol; -static struct attribute_group *get_sysfs_attr(void) +/* Tunables */ +static struct gov_tunables *global_tunables; + +static inline struct gov_tunables *to_tunables(struct gov_attr_set *attr_set) { - return &sched_attr_group_gov_pol; + return container_of(attr_set, struct gov_tunables, attr_set); } +static ssize_t up_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf) +{ + struct gov_tunables *tunables = to_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->up_throttle_nsec); +} + +static ssize_t up_throttle_nsec_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct gov_tunables *tunables = to_tunables(attr_set); + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + tunables->up_throttle_nsec = val; + return count; +} + +static ssize_t down_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf) +{ + struct gov_tunables *tunables = to_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->down_throttle_nsec); +} + +static ssize_t down_throttle_nsec_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct gov_tunables *tunables = to_tunables(attr_set); + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + tunables->down_throttle_nsec = val; + return count; +} + +static struct governor_attr up_throttle_nsec = __ATTR_RW(up_throttle_nsec); +static struct governor_attr down_throttle_nsec = __ATTR_RW(down_throttle_nsec); + +static struct attribute *schedfreq_attributes[] = { + &up_throttle_nsec.attr, + 
&down_throttle_nsec.attr, + NULL +}; + +static struct kobj_type tunables_ktype = { + .default_attrs = schedfreq_attributes, + .sysfs_ops = &governor_sysfs_ops, +}; + static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) { struct gov_data *gd; @@ -282,17 +348,39 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) if (!gd) return -ENOMEM; - gd->up_throttle_nsec = policy->cpuinfo.transition_latency ? - policy->cpuinfo.transition_latency : - THROTTLE_UP_NSEC; - gd->down_throttle_nsec = THROTTLE_DOWN_NSEC; - pr_debug("%s: throttle threshold = %u [ns]\n", - __func__, gd->up_throttle_nsec); + policy->governor_data = gd; - rc = sysfs_create_group(&policy->kobj, get_sysfs_attr()); - if (rc) { - pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc); - goto err; + if (!global_tunables) { + gd->tunables = kzalloc(sizeof(*gd->tunables), GFP_KERNEL); + if (!gd->tunables) + goto free_gd; + + gd->tunables->up_throttle_nsec = + policy->cpuinfo.transition_latency ? + policy->cpuinfo.transition_latency : + THROTTLE_UP_NSEC; + gd->tunables->down_throttle_nsec = + THROTTLE_DOWN_NSEC; + + rc = kobject_init_and_add(&gd->tunables->attr_set.kobj, + &tunables_ktype, + get_governor_parent_kobj(policy), + "%s", cpufreq_gov_sched.name); + if (rc) + goto free_tunables; + + gov_attr_set_init(&gd->tunables->attr_set, + &gd->tunables_hook); + + pr_debug("%s: throttle_threshold = %u [ns]\n", + __func__, gd->tunables->up_throttle_nsec); + + if (!have_governor_per_policy()) + global_tunables = gd->tunables; + } else { + gd->tunables = global_tunables; + gov_attr_set_get(&global_tunables->attr_set, + &gd->tunables_hook); } policy->governor_data = gd; @@ -304,7 +392,7 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) if (IS_ERR_OR_NULL(gd->task)) { pr_err("%s: failed to create kschedfreq thread\n", __func__); - goto err; + goto free_tunables; } get_task_struct(gd->task); kthread_bind_mask(gd->task, policy->related_cpus); @@ -316,7 +404,9 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) return 0; -err: +free_tunables: + kfree(gd->tunables); +free_gd: policy->governor_data = NULL; kfree(gd); return -ENOMEM; @@ -324,6 +414,7 @@ err: static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) { + unsigned int count; struct gov_data *gd = policy->governor_data; clear_sched_freq(); @@ -332,7 +423,12 @@ static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) put_task_struct(gd->task); } - sysfs_remove_group(&policy->kobj, get_sysfs_attr()); + count = gov_attr_set_put(&gd->tunables->attr_set, &gd->tunables_hook); + if (!count) { + if (!have_governor_per_policy()) + global_tunables = NULL; + kfree(gd->tunables); + } policy->governor_data = NULL; @@ -394,88 +490,6 @@ static int cpufreq_sched_setup(struct cpufreq_policy *policy, return 0; } -/* Tunables */ -static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf) -{ - return sprintf(buf, "%u\n", gd->up_throttle_nsec); -} - -static ssize_t store_up_throttle_nsec(struct gov_data *gd, - const char *buf, size_t count) -{ - int ret; - long unsigned int val; - - ret = kstrtoul(buf, 0, &val); - if (ret < 0) - return ret; - gd->up_throttle_nsec = val; - return count; -} - -static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf) -{ - return sprintf(buf, "%u\n", gd->down_throttle_nsec); -} - -static ssize_t store_down_throttle_nsec(struct gov_data *gd, - const char *buf, size_t count) -{ - int ret; - long unsigned int val; - - ret = kstrtoul(buf, 0, &val); - if 
(ret < 0) - return ret; - gd->down_throttle_nsec = val; - return count; -} - -/* - * Create show/store routines - * - sys: One governor instance for complete SYSTEM - * - pol: One governor instance per struct cpufreq_policy - */ -#define show_gov_pol_sys(file_name) \ -static ssize_t show_##file_name##_gov_pol \ -(struct cpufreq_policy *policy, char *buf) \ -{ \ - return show_##file_name(policy->governor_data, buf); \ -} - -#define store_gov_pol_sys(file_name) \ -static ssize_t store_##file_name##_gov_pol \ -(struct cpufreq_policy *policy, const char *buf, size_t count) \ -{ \ - return store_##file_name(policy->governor_data, buf, count); \ -} - -#define gov_pol_attr_rw(_name) \ - static struct freq_attr _name##_gov_pol = \ - __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol) - -#define show_store_gov_pol_sys(file_name) \ - show_gov_pol_sys(file_name); \ - store_gov_pol_sys(file_name) -#define tunable_handlers(file_name) \ - show_gov_pol_sys(file_name); \ - store_gov_pol_sys(file_name); \ - gov_pol_attr_rw(file_name) - -tunable_handlers(down_throttle_nsec); -tunable_handlers(up_throttle_nsec); - -/* Per policy governor instance */ -static struct attribute *sched_attributes_gov_pol[] = { - &up_throttle_nsec_gov_pol.attr, - &down_throttle_nsec_gov_pol.attr, - NULL, -}; - -static struct attribute_group sched_attr_group_gov_pol = { - .attrs = sched_attributes_gov_pol, - .name = "sched", -}; #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED static From 78213729a7d12af48a7953dd0c69a8d9b1b39e44 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:43 -0700 Subject: [PATCH 0814/3220] BACKPORT: kthread: allow to cancel kthread work We are going to use kthread workers more widely and sometimes we will need to make sure that the work is neither pending nor running. This patch implements cancel_*_sync() operations as inspired by workqueues. Well, we are synchronized against the other operations via the worker lock, we use del_timer_sync() and a counter to count parallel cancel operations. Therefore the implementation might be easier. First, we check if a worker is assigned. If not, the work has never been queued after it was initialized. Second, we take the worker lock. It must be the right one. The work must not be assigned to another worker unless it is initialized in between. Third, we try to cancel the timer when it exists. The timer is deleted synchronously to make sure that the timer callback is not running. We need to temporarily release the worker->lock to avoid a possible deadlock with the callback. In the meantime, we set work->canceling counter to avoid any queuing. Fourth, we try to remove the work from a worker list. It might be the list of either normal or delayed works. Fifth, if the work is running, we call kthread_flush_work(). It might take an arbitrary time. We need to release the worker->lock again. In the meantime, we again block any queuing by the canceling counter. As already mentioned, the check for a pending kthread work is done under a lock. Compared with workqueues, we do not need to fight for a single PENDING bit to block other operations. Therefore we do not suffer from the thundering storm problem and all parallel canceling jobs might use kthread_flush_work(). Any queuing is blocked until the counter reaches zero.
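For illustration, a minimal hypothetical user of the new API (the names below are examples only, not part of the patch, using the 3.18 queue_kthread_work() interface targeted here):

        static struct kthread_worker my_worker;
        static struct kthread_work my_work;

        /* A work function that re-queues itself. */
        static void my_work_fn(struct kthread_work *work)
        {
                /* ... one unit of work ... */
                queue_kthread_work(&my_worker, work);
        }

        static void my_shutdown(void)
        {
                /*
                 * Blocks further queuing via the canceling counter and
                 * waits for a running instance to finish, even though
                 * the work re-queues itself.
                 */
                kthread_cancel_work_sync(&my_work);
        }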
Change-Id: I8a8ece0f93c828f311d0ad5c88d80db2388e4808 Link: http://lkml.kernel.org/r/1470754545-17632-10-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Acked-by: Tejun Heo Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry-picked from 37be45d49dec2a411e29d50c9597cfe8184b5645) [major changes to the original patch while cherry-picking; only rebased the sync variant] Signed-off-by: Juri Lelli --- include/linux/kthread.h | 4 ++ kernel/kthread.c | 96 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 99 insertions(+), 1 deletion(-) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index e691b6a23f72..4289343ba1f9 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -75,6 +75,8 @@ struct kthread_work { struct list_head node; kthread_work_func_t func; struct kthread_worker *worker; + /* Number of canceling calls that are running at the moment. */ + int canceling; }; #define KTHREAD_WORKER_INIT(worker) { \ @@ -129,4 +131,6 @@ bool queue_kthread_work(struct kthread_worker *worker, void flush_kthread_work(struct kthread_work *work); void flush_kthread_worker(struct kthread_worker *worker); +bool kthread_cancel_work_sync(struct kthread_work *work); + #endif /* _LINUX_KTHREAD_H */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 850b255649a2..698b8dec3074 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -604,6 +604,19 @@ repeat: } EXPORT_SYMBOL_GPL(kthread_worker_fn); +/* + * Returns true when the work could not be queued at the moment. + * It happens when it is already pending in a worker list + * or when it is being cancelled. + */ +static inline bool queuing_blocked(struct kthread_worker *worker, + struct kthread_work *work) +{ + lockdep_assert_held(&worker->lock); + + return !list_empty(&work->node) || work->canceling; +} + /* insert @work before @pos in @worker */ static void insert_kthread_work(struct kthread_worker *worker, struct kthread_work *work, @@ -633,7 +646,7 @@ bool queue_kthread_work(struct kthread_worker *worker, unsigned long flags; spin_lock_irqsave(&worker->lock, flags); - if (list_empty(&work->node)) { + if (!queuing_blocked(worker, work)) { insert_kthread_work(worker, work, &worker->work_list); ret = true; } @@ -694,6 +707,87 @@ retry: } EXPORT_SYMBOL_GPL(flush_kthread_work); +/* + * This function removes the work from the worker queue. Also it makes sure + * that it won't get queued later via the delayed work's timer. + * + * The work might still be in use when this function finishes. See the + * current_work processed by the worker. + * + * Return: %true if @work was pending and successfully canceled, + * %false if @work was not pending + */ +static bool __kthread_cancel_work(struct kthread_work *work, + unsigned long *flags) +{ + /* + * Try to remove the work from a worker list. It might either + * be from worker->work_list or from worker->delayed_work_list. + */ + if (!list_empty(&work->node)) { + list_del_init(&work->node); + return true; + } + + return false; +} + +static bool __kthread_cancel_work_sync(struct kthread_work *work) +{ + struct kthread_worker *worker = work->worker; + unsigned long flags; + int ret = false; + + if (!worker) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + /* Work must not be used with >1 worker, see kthread_queue_work().
*/ + WARN_ON_ONCE(work->worker != worker); + + ret = __kthread_cancel_work(work, &flags); + + if (worker->current_work != work) + goto out_fast; + + /* + * The work is in progress and we need to wait with the lock released. + * In the meantime, block any queuing by setting the canceling counter. + */ + work->canceling++; + spin_unlock_irqrestore(&worker->lock, flags); + flush_kthread_work(work); + spin_lock_irqsave(&worker->lock, flags); + work->canceling--; + +out_fast: + spin_unlock_irqrestore(&worker->lock, flags); +out: + return ret; +} + +/** + * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish + * @work: the kthread work to cancel + * + * Cancel @work and wait for its execution to finish. This function + * can be used even if the work re-queues itself. On return from this + * function, @work is guaranteed to be not pending or executing on any CPU. + * + * kthread_cancel_work_sync(&delayed_work->work) must not be used for + * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. + * + * The caller must ensure that the worker on which @work was last + * queued can't be destroyed before this function returns. + * + * Return: %true if @work was pending, %false otherwise. + */ +bool kthread_cancel_work_sync(struct kthread_work *work) +{ + return __kthread_cancel_work_sync(work); +} +EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); + /** * flush_kthread_worker - flush all current works on a kthread_worker * @worker: worker to flush From e2aa75a4c7812700d41ee6b45d3d85e0773805bc Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 13:53:22 +0530 Subject: [PATCH 0815/3220] cpufreq: schedutil: move slow path from workqueue to SCHED_FIFO task If slow path frequency changes are conducted in a SCHED_OTHER context then they may be delayed for some amount of time, including indefinitely, when real time or deadline activity is taking place. Move the slow path to a real time kernel thread. In the future the thread should be made SCHED_DEADLINE. The RT priority is arbitrarily set to 50 for now. Hackbench results on ARM Exynos, dual core A15 platform for 10 iterations:

$ hackbench -s 100 -l 100 -g 10 -f 20

Before		After
---------------------------------
1.808		1.603
1.847		1.251
2.229		1.590
1.952		1.600
1.947		1.257
1.925		1.627
2.694		1.620
1.258		1.621
1.919		1.632
1.250		1.240

Average: 1.8829	1.5041

Based on initial work by Steve Muckle. Change-Id: I8f53037e94f353960c6d10abf07822d671631ef7 Signed-off-by: Steve Muckle Signed-off-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) Signed-off-by: Rafael J. Wysocki (cherry picked from 02a7b1ee3baa) [adapt to the 3.18 kthread interface] Signed-off-by: Juri Lelli --- kernel/sched/cpufreq_schedutil.c | 86 +++++++++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 8 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7e42a1d6d88d..784142ef9924 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cpufreq.h> +#include <linux/kthread.h> #include <linux/slab.h> #include <trace/events/power.h> @@ -22,6 +23,7 @@ #define cpufreq_driver_fast_switch(x, y) 0 #define cpufreq_enable_fast_switch(x) #define cpufreq_disable_fast_switch(x) +#define SUGOV_KTHREAD_PRIORITY 50 struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; }; struct sugov_policy { struct cpufreq_policy *policy; struct sugov_tunables *tunables; struct list_head tunables_hook; raw_spinlock_t update_lock; /* For shared policies */ u64 last_freq_update_time; s64 freq_update_delay_ns; unsigned int next_freq; /* The next fields are only needed if fast switch cannot be used.
*/ struct irq_work irq_work; - struct work_struct work; + struct kthread_work work; struct mutex work_lock; + struct kthread_worker worker; + struct task_struct *thread; bool work_in_progress; bool need_freq_update; @@ -297,7 +301,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, raw_spin_unlock(&sg_policy->update_lock); } -static void sugov_work(struct work_struct *work) +static void sugov_work(struct kthread_work *work) { struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); @@ -314,7 +318,21 @@ static void sugov_irq_work(struct irq_work *irq_work) struct sugov_policy *sg_policy; sg_policy = container_of(irq_work, struct sugov_policy, irq_work); - schedule_work_on(smp_processor_id(), &sg_policy->work); + + /* + * For Real Time and Deadline tasks, schedutil governor shoots the + * frequency to maximum. And special care must be taken to ensure that + * this kthread doesn't result in that. + * + * This is (mostly) guaranteed by the work_in_progress flag. The flag is + * updated only at the end of the sugov_work() and before that schedutil + * rejects all other frequency scaling requests. + * + * Though there is a very rare case where the RT thread yields right + * after the work_in_progress flag is cleared. The effects of that are + * neglected for now. + */ + queue_kthread_work(&sg_policy->worker, &sg_policy->work); } /************************** sysfs interface ************************/ @@ -380,7 +398,6 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) sg_policy->policy = policy; init_irq_work(&sg_policy->irq_work, sugov_irq_work); - INIT_WORK(&sg_policy->work, sugov_work); mutex_init(&sg_policy->work_lock); raw_spin_lock_init(&sg_policy->update_lock); return sg_policy; @@ -392,6 +409,51 @@ static void sugov_policy_free(struct sugov_policy *sg_policy) kfree(sg_policy); } +static int sugov_kthread_create(struct sugov_policy *sg_policy) +{ + struct task_struct *thread; + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; + struct cpufreq_policy *policy = sg_policy->policy; + int ret; + + /* kthread only required for slow path */ + if (policy->fast_switch_enabled) + return 0; + + init_kthread_work(&sg_policy->work, sugov_work); + init_kthread_worker(&sg_policy->worker); + thread = kthread_create(kthread_worker_fn, &sg_policy->worker, + "sugov:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR(thread)) { + pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread)); + return PTR_ERR(thread); + } + + ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + return ret; + } + + sg_policy->thread = thread; + kthread_bind_mask(thread, policy->related_cpus); + wake_up_process(thread); + + return 0; +} + +static void sugov_kthread_stop(struct sugov_policy *sg_policy) +{ + /* kthread only required for slow path */ + if (sg_policy->policy->fast_switch_enabled) + return; + + flush_kthread_worker(&sg_policy->worker); + kthread_stop(sg_policy->thread); +} + static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) { struct sugov_tunables *tunables; @@ -428,12 +490,16 @@ static int sugov_init(struct cpufreq_policy *policy) if (!sg_policy) return -ENOMEM; + ret = sugov_kthread_create(sg_policy); + if (ret) + goto free_sg_policy; + mutex_lock(&global_tunables_lock); if (global_tunables) { if (WARN_ON(have_governor_per_policy())) { ret = -EINVAL; - goto free_sg_policy; + goto 
stop_kthread; } policy->governor_data = sg_policy; sg_policy->tunables = global_tunables; @@ -445,7 +511,7 @@ static int sugov_init(struct cpufreq_policy *policy) tunables = sugov_tunables_alloc(sg_policy); if (!tunables) { ret = -ENOMEM; - goto free_sg_policy; + goto stop_kthread; } tunables->rate_limit_us = 20 * USEC_PER_MSEC; @@ -472,7 +538,10 @@ static int sugov_init(struct cpufreq_policy *policy) policy->governor_data = NULL; sugov_tunables_free(tunables); - free_sg_policy: +stop_kthread: + sugov_kthread_stop(sg_policy); + +free_sg_policy: mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); @@ -497,6 +566,7 @@ static int sugov_exit(struct cpufreq_policy *policy) mutex_unlock(&global_tunables_lock); + sugov_kthread_stop(sg_policy); sugov_policy_free(sg_policy); return 0; @@ -546,7 +616,7 @@ static int sugov_stop(struct cpufreq_policy *policy) synchronize_sched(); irq_work_sync(&sg_policy->irq_work); - cancel_work_sync(&sg_policy->work); + kthread_cancel_work_sync(&sg_policy->work); return 0; } From e5da6c11b20544e7afd11b252ee94721caf1f740 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Thu, 25 Aug 2016 15:59:17 -0700 Subject: [PATCH 0816/3220] sched: cpufreq: use rt_avg as estimate of required RT CPU capacity A policy of going to fmax on any RT activity will be detrimental for power on many platforms. Often RT accounts for only a small amount of CPU activity so sending the CPU frequency to fmax is overkill. Worse still, some platforms may not be able to even complete the CPU frequency change before the RT activity has already completed. Cpufreq governors have not treated RT activity this way in the past so it is not part of the expected semantics of the RT scheduling class. The DL class offers guarantees about task completion and could be used for this purpose. Modify the schedutil algorithm to instead use rt_avg as an estimate of RT utilization of the CPU. Based on previous work by Vincent Guittot . 
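In code terms, the estimate amounts to scaling rq->rt_avg by the CPU's capacity over the averaging period and adding it to the CFS utilization; the lines below restate the sugov_get_util() hunk from the diff that follows (declarations elided):

        /* Scale rt_avg into capacity units and add it to the CFS signal. */
        sched_avg_update(rq);
        delta = time - rq->age_stamp;
        if (unlikely(delta < 0))
                delta = 0;
        rt = div64_u64(rq->rt_avg, sched_avg_period() + delta);
        rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT;
        util = min(rq->cfs.avg.util_avg + rt, max_cap);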
Change-Id: I1ed605a3e2512a94d34217a8e57c3fd97cca60be Signed-off-by: Steve Muckle --- include/linux/sched.h | 2 -- kernel/sched/cpufreq_schedutil.c | 33 ++++++++++++++++++++------------ 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index bc23d15244db..dc3da585bdad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3295,8 +3295,6 @@ static inline unsigned long rlimit_max(unsigned int limit) #define SCHED_CPUFREQ_DL (1U << 1) #define SCHED_CPUFREQ_IOWAIT (1U << 2) -#define SCHED_CPUFREQ_RT_DL (SCHED_CPUFREQ_RT | SCHED_CPUFREQ_DL) - #ifdef CONFIG_CPU_FREQ struct update_util_data { void (*func)(struct update_util_data *data, u64 time, unsigned int flags); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 784142ef9924..033da8f815da 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -156,15 +156,24 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, return cpufreq_driver_resolve_freq(policy, freq); } -static void sugov_get_util(unsigned long *util, unsigned long *max) +static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time) { - struct rq *rq = this_rq(); - unsigned long cfs_max; + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + unsigned long max_cap, rt; + s64 delta; - cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id()); + max_cap = arch_scale_cpu_capacity(NULL, cpu); - *util = min(rq->cfs.avg.util_avg, cfs_max); - *max = cfs_max; + sched_avg_update(rq); + delta = time - rq->age_stamp; + if (unlikely(delta < 0)) + delta = 0; + rt = div64_u64(rq->rt_avg, sched_avg_period() + delta); + rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT; + + *util = min(rq->cfs.avg.util_avg + rt, max_cap); + *max = max_cap; } static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, @@ -212,10 +221,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; - if (flags & SCHED_CPUFREQ_RT_DL) { + if (flags & SCHED_CPUFREQ_DL) { next_f = policy->cpuinfo.max_freq; } else { - sugov_get_util(&util, &max); + sugov_get_util(&util, &max, time); sugov_iowait_boost(sg_cpu, &util, &max); next_f = get_next_freq(sg_cpu, util, max); } @@ -232,7 +241,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 last_freq_update_time = sg_policy->last_freq_update_time; unsigned int j; - if (flags & SCHED_CPUFREQ_RT_DL) + if (flags & SCHED_CPUFREQ_DL) return max_f; sugov_iowait_boost(sg_cpu, &util, &max); @@ -258,7 +267,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, j_sg_cpu->iowait_boost = 0; continue; } - if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) + if (j_sg_cpu->flags & SCHED_CPUFREQ_DL) return max_f; j_util = j_sg_cpu->util; @@ -282,7 +291,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, unsigned long util, max; unsigned int next_f; - sugov_get_util(&util, &max); + sugov_get_util(&util, &max, time); raw_spin_lock(&sg_policy->update_lock); @@ -590,7 +599,7 @@ static int sugov_start(struct cpufreq_policy *policy) if (policy_is_shared(policy)) { sg_cpu->util = 0; sg_cpu->max = 0; - sg_cpu->flags = SCHED_CPUFREQ_RT; + sg_cpu->flags = SCHED_CPUFREQ_DL; sg_cpu->last_update = 0; sg_cpu->cached_raw_freq = 0; sg_cpu->iowait_boost = 0; From f71d9f01c6fc165ba38cdab6cbb2e4443bd7e458 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 14 Dec 2016 16:10:10 +0000 
Subject: [PATCH 0817/3220] sched/cpufreq: make schedutil use WALT signal If WALT is available and enabled, make the schedutil governor use its utilization signal. Change-Id: I92bc37989447a76616e9bcc4e9e8616774fb9925 Signed-off-by: Juri Lelli [we need to use boosted_cpu_util for schedutil, so make it not static] Signed-off-by: Chris Redpath --- kernel/sched/cpufreq_schedutil.c | 9 +++++++++ kernel/sched/fair.c | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 033da8f815da..e9a9ee98daf7 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -17,6 +17,11 @@ #include #include "sched.h" +#include "tune.h" + +#ifdef CONFIG_SCHED_WALT +unsigned long boosted_cpu_util(int cpu); +#endif /* Stub out fast switch routines present on mainline to reduce the backport * overhead. */ @@ -173,6 +178,10 @@ static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time) rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT; *util = min(rq->cfs.avg.util_avg + rt, max_cap); +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + *util = boosted_cpu_util(cpu); +#endif *max = max_cap; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index baba2d34c34e..abe49df1e6f0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4258,7 +4258,7 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static bool cpu_overutilized(int cpu); -static inline unsigned long boosted_cpu_util(int cpu); +unsigned long boosted_cpu_util(int cpu); #else #define boosted_cpu_util(cpu) cpu_util(cpu) #endif @@ -5478,7 +5478,7 @@ schedtune_task_margin(struct task_struct *task) #endif /* CONFIG_SCHED_TUNE */ -static inline unsigned long +unsigned long boosted_cpu_util(int cpu) { unsigned long util = cpu_util(cpu); From 3c5c4e972472fc527d6b027fcdfa4d88444bff52 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 30 Nov 2016 11:09:42 +0000 Subject: [PATCH 0818/3220] trace/sched: add rq utilization signal for WALT It is useful to be able to check current capacity against the rq utilization signal generated by WALT (for example, to check how a cpufreq governor is behaving). Add the rq utilization signal (same scale as capacity) to the walt_update_task_ravg tracepoint.
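For reference, the new field is derived as in this sketch (mirroring the two lines added in the diff below); prev_runnable_sum is in units of window time, so the shift and divide rescale it into the 0..SCHED_LOAD_SCALE capacity range:

	u64 util = rq->prev_runnable_sum << SCHED_LOAD_SHIFT;

	do_div(util, walt_ravg_window);
	/* util is now directly comparable against capacity values */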
Change-Id: I9aae3884a741d23ac494bef80d2303f107f135ce Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c18d8c89bd12..1d758d18a1b7 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -984,6 +984,7 @@ TRACE_EVENT(walt_update_task_ravg, __field( int, cpu ) __field( u64, cs ) __field( u64, ps ) + __field(unsigned long, util ) __field( u32, curr_window ) __field( u32, prev_window ) __field( u64, nt_cs ) @@ -1008,6 +1009,8 @@ TRACE_EVENT(walt_update_task_ravg, __entry->irqtime = irqtime; __entry->cs = rq->curr_runnable_sum; __entry->ps = rq->prev_runnable_sum; + __entry->util = rq->prev_runnable_sum << SCHED_LOAD_SHIFT; + do_div(__entry->util, walt_ravg_window); __entry->curr_window = p->ravg.curr_window; __entry->prev_window = p->ravg.prev_window; __entry->nt_cs = rq->nt_curr_runnable_sum; @@ -1016,16 +1019,15 @@ TRACE_EVENT(walt_update_task_ravg, ), TP_printk("wc %llu ws %llu delta %llu event %d cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu" - " cs %llu ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u" + " cs %llu ps %llu util %lu cur_window %u prev_window %u active_wins %u" , __entry->wallclock, __entry->win_start, __entry->delta, __entry->evt, __entry->cpu, __entry->cur_freq, __entry->cur_pid, __entry->pid, __entry->comm, __entry->mark_start, __entry->delta_m, __entry->demand, __entry->sum, __entry->irqtime, - __entry->cs, __entry->ps, + __entry->cs, __entry->ps, __entry->util, __entry->curr_window, __entry->prev_window, - __entry->nt_cs, __entry->nt_ps, __entry->active_windows ) ); From 51b20b214f0e6a2b52bae9f9e748c87b1820f9b2 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Thu, 17 Nov 2016 10:48:45 +0530 Subject: [PATCH 0819/3220] cpufreq: schedutil: add up/down frequency transition rate limits The rate-limit tunable in the schedutil governor applies to transitions to both lower and higher frequencies. On several platforms it is not the ideal tunable though, as it is difficult to get the best power/performance figures using the same limit in both directions. It is common on mobile platforms with demanding user interfaces to want to increase frequency rapidly, for example, but decrease it slowly. One example is a case where we have short busy periods followed by similar or longer idle periods. If we keep the rate-limit high enough, we will not go to higher frequencies soon enough. On the other hand, if we keep it too low, we will have too many frequency transitions, as we will always reduce the frequency after the busy period. It would be very useful if we could set a low rate-limit while increasing the frequency (so that we can respond to short busy periods quickly) and a high rate-limit while decreasing frequency (so that we don't reduce the frequency immediately after a short busy period, which may also avoid frequency transitions before the next busy period). Implement separate up/down transition rate limits. Note that the governor avoids frequency recalculations for a period equal to the minimum of the up and down rate-limits. A global mutex is also defined to protect updates to min_rate_limit_us via two separate sysfs files. Note that this doesn't change the behavior of the schedutil governor for platforms which wish to keep the same values for both up and down rate limits. This was tested with rt-app [1] on an ARM Exynos dual-A15 platform.
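In sketch form, the gating added here works as follows (the full version is sugov_up_down_rate_limit() in the diff below); the test setup is described next:

	delta_ns = time - sg_policy->last_freq_update_time;

	/* rising transitions must wait for the up limit ... */
	if (next_freq > sg_policy->next_freq &&
	    delta_ns < sg_policy->up_rate_delay_ns)
		return true;	/* rate limited */

	/* ... falling transitions for the down limit */
	if (next_freq < sg_policy->next_freq &&
	    delta_ns < sg_policy->down_rate_delay_ns)
		return true;	/* rate limited */

	return false;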
Testcase: Run a SCHED_OTHER thread on CPU0 which will emulate workload for X ms of busy period out of the total period of Y ms, i.e. Y - X ms of idle period. The values of X/Y taken were: 20/40, 20/50, 20/70, i.e. idle periods of 20, 30 and 50 ms respectively. These were tested against values of up/down rate limits as: 10/10 ms and 10/40 ms. For every test we noticed a performance increase of 5-10% with the schedutil governor, which was very much expected. [Viresh]: Simplified user interface and introduced min_rate_limit_us + mutex, rewrote commit log and included test results. [1] https://github.com/scheduler-tools/rt-app/ Change-Id: I18720a83855b196b8e21dcdc8deae79131635b84 Signed-off-by: Steve Muckle Signed-off-by: Viresh Kumar (applied from https://marc.info/?l=linux-kernel&m=147936011103832&w=2) [trivial adaptations] Signed-off-by: Juri Lelli --- kernel/sched/cpufreq_schedutil.c | 107 ++++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 16 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e9a9ee98daf7..5c7c52077ad1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -28,11 +28,13 @@ unsigned long boosted_cpu_util(int cpu); #define cpufreq_driver_fast_switch(x, y) 0 #define cpufreq_enable_fast_switch(x) #define cpufreq_disable_fast_switch(x) +#define LATENCY_MULTIPLIER (1000) #define SUGOV_KTHREAD_PRIORITY 50 struct sugov_tunables { struct gov_attr_set attr_set; - unsigned int rate_limit_us; + unsigned int up_rate_limit_us; + unsigned int down_rate_limit_us; }; struct sugov_policy { @@ -43,7 +45,9 @@ struct sugov_policy { raw_spinlock_t update_lock; /* For shared policies */ u64 last_freq_update_time; - s64 freq_update_delay_ns; + s64 min_rate_limit_ns; + s64 up_rate_delay_ns; + s64 down_rate_delay_ns; unsigned int next_freq; /* The next fields are only needed if fast switch cannot be used.
*/ @@ -94,7 +98,27 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) } delta_ns = time - sg_policy->last_freq_update_time; - return delta_ns >= sg_policy->freq_update_delay_ns; + + /* No need to recalculate next freq for min_rate_limit_us at least */ + return delta_ns >= sg_policy->min_rate_limit_ns; +} + +static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + s64 delta_ns; + + delta_ns = time - sg_policy->last_freq_update_time; + + if (next_freq > sg_policy->next_freq && + delta_ns < sg_policy->up_rate_delay_ns) + return true; + + if (next_freq < sg_policy->next_freq && + delta_ns < sg_policy->down_rate_delay_ns) + return true; + + return false; } static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, @@ -102,6 +126,9 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, { struct cpufreq_policy *policy = sg_policy->policy; + if (sugov_up_down_rate_limit(sg_policy, time, next_freq)) + return; + sg_policy->last_freq_update_time = time; if (policy->fast_switch_enabled) { @@ -363,15 +390,32 @@ static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr return container_of(attr_set, struct sugov_tunables, attr_set); } -static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) +static DEFINE_MUTEX(min_rate_lock); + +static void update_min_rate_limit_us(struct sugov_policy *sg_policy) +{ + mutex_lock(&min_rate_lock); + sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns, + sg_policy->down_rate_delay_ns); + mutex_unlock(&min_rate_lock); +} + +static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) { struct sugov_tunables *tunables = to_sugov_tunables(attr_set); - return sprintf(buf, "%u\n", tunables->rate_limit_us); + return sprintf(buf, "%u\n", tunables->up_rate_limit_us); } -static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, - size_t count) +static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->down_rate_limit_us); +} + +static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) { struct sugov_tunables *tunables = to_sugov_tunables(attr_set); struct sugov_policy *sg_policy; @@ -380,18 +424,42 @@ static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *bu if (kstrtouint(buf, 10, &rate_limit_us)) return -EINVAL; - tunables->rate_limit_us = rate_limit_us; + tunables->up_rate_limit_us = rate_limit_us; - list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) - sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC; + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) { + sg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC; + update_min_rate_limit_us(sg_policy); + } return count; } -static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); +static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + struct sugov_policy *sg_policy; + unsigned int rate_limit_us; + + if (kstrtouint(buf, 10, &rate_limit_us)) + return -EINVAL; + + tunables->down_rate_limit_us = rate_limit_us; + + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) { + sg_policy->down_rate_delay_ns = rate_limit_us * 
NSEC_PER_USEC; + update_min_rate_limit_us(sg_policy); + } + + return count; +} + +static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us); +static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us); static struct attribute *sugov_attributes[] = { - &rate_limit_us.attr, + &up_rate_limit_us.attr, + &down_rate_limit_us.attr, NULL }; @@ -532,10 +600,13 @@ static int sugov_init(struct cpufreq_policy *policy) goto stop_kthread; } - tunables->rate_limit_us = 20 * USEC_PER_MSEC; + tunables->up_rate_limit_us = LATENCY_MULTIPLIER; + tunables->down_rate_limit_us = LATENCY_MULTIPLIER; lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; - if (lat) - tunables->rate_limit_us *= lat; + if (lat) { + tunables->up_rate_limit_us *= lat; + tunables->down_rate_limit_us *= lat; + } policy->governor_data = sg_policy; sg_policy->tunables = tunables; @@ -595,7 +666,11 @@ static int sugov_start(struct cpufreq_policy *policy) struct sugov_policy *sg_policy = policy->governor_data; unsigned int cpu; - sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; + sg_policy->up_rate_delay_ns = + sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC; + sg_policy->down_rate_delay_ns = + sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC; + update_min_rate_limit_us(sg_policy); sg_policy->last_freq_update_time = 0; sg_policy->next_freq = UINT_MAX; sg_policy->work_in_progress = false; From 5c015afebd4dd130ba86c12e8dabceab0f7f3d15 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 21 Feb 2017 10:15:18 +0530 Subject: [PATCH 0820/3220] FROM-LIST: cpufreq: schedutil: Redefine the rate_limit_us tunable The rate_limit_us tunable is intended to reduce the possible overhead from running the schedutil governor. However, that overhead can be divided into two separate parts: the governor computations and the invocation of the scaling driver to set the CPU frequency. The latter is where the real overhead comes from. The former is much less expensive in terms of execution time and running it every time the governor callback is invoked by the scheduler, after rate_limit_us interval has passed since the last frequency update, would not be a problem. For this reason, redefine the rate_limit_us tunable so that it means the minimum time that has to pass between two consecutive invocations of the scaling driver by the schedutil governor (to set the CPU frequency). Change-Id: Iced64116b826c25441ef537c27a3dabfcf81919e Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki [pulled from linux-pm linux-next https://patchwork.kernel.org/patch/9583949/ ] Signed-off-by: Chris Redpath --- kernel/sched/cpufreq_schedutil.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5c7c52077ad1..191ba36a9eeb 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -129,14 +129,13 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, if (sugov_up_down_rate_limit(sg_policy, time, next_freq)) return; - sg_policy->last_freq_update_time = time; - if (policy->fast_switch_enabled) { if (sg_policy->next_freq == next_freq) { trace_cpu_frequency(policy->cur, smp_processor_id()); return; } sg_policy->next_freq = next_freq; + sg_policy->last_freq_update_time = time; next_freq = cpufreq_driver_fast_switch(policy, next_freq); if (next_freq == CPUFREQ_ENTRY_INVALID) return; @@ -145,6 +144,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, trace_cpu_frequency(next_freq, smp_processor_id()); } else if (sg_policy->next_freq != next_freq) { sg_policy->next_freq = next_freq; + sg_policy->last_freq_update_time = time; sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); } From bd6ff3505f2f9c10ab6fdebc70378f5549b0323b Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sun, 4 Dec 2016 17:47:53 +0000 Subject: [PATCH 0821/3220] Revert "WIP: sched: Consider spare cpu capacity at task wake-up" This reverts commit 75a9695b619741019363f889c99c97c7bb823797. Change-Id: I846b21f2bdeb0b0ca30ad65683564ed07a429428 Signed-off-by: Dietmar Eggemann [ minor merge changes ] Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index abe49df1e6f0..12bb2ae026f5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5509,10 +5509,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *fit_group = NULL, *spare_group = NULL; + struct sched_group *fit_group = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; unsigned long fit_capacity = ULONG_MAX; - unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -5520,7 +5519,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load, spare_capacity; + unsigned long load, avg_load; int local_group; int i; @@ -5552,16 +5551,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, fit_capacity = capacity_of(i); fit_group = group; } - - /* - * Look for group which has most spare capacity on a - * single cpu. 
- */ - spare_capacity = capacity_of(i) - cpu_util(i); - if (spare_capacity > max_spare_capacity) { - max_spare_capacity = spare_capacity; - spare_group = group; - } } /* Adjust by relative CPU capacity of the group */ @@ -5578,9 +5567,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (fit_group) return fit_group; - if (spare_group) - return spare_group; - if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; From cb88574a6866aaf09ff172d30f2512bd4a0a66ba Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 25 Jan 2017 10:03:36 +0000 Subject: [PATCH 0822/3220] Partial Revert: "WIP: sched: Add cpu capacity awareness to wakeup balancing" Revert the changes in find_idlest_cpu() and find_idlest_group(). Keep the infrastructure bits which are used in following EAS patches. Change-Id: Id516ca5f3e51b9a13db1ebb8de2df3aa25f9679b Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 34 +++------------------------------- 1 file changed, 3 insertions(+), 31 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 12bb2ae026f5..2150edce955a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5384,11 +5384,6 @@ static inline bool task_fits_max(struct task_struct *p, int cpu) return __task_fits(p, cpu, 0); } -static inline bool task_fits_spare(struct task_struct *p, int cpu) -{ - return __task_fits(p, cpu, cpu_util(cpu)); -} - static bool cpu_overutilized(int cpu) { return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); @@ -5509,9 +5504,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *fit_group = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; - unsigned long fit_capacity = ULONG_MAX; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -5542,15 +5535,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load = target_load(i, load_idx); avg_load += load; - - /* - * Look for most energy-efficient group that can fit - * that can fit the task. 
- */ - if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) { - fit_capacity = capacity_of(i); - fit_group = group; - } } /* Adjust by relative CPU capacity of the group */ @@ -5564,9 +5548,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } } while (group = group->next, group != sd->groups); - if (fit_group) - return fit_group; - if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; @@ -5587,7 +5568,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - if (task_fits_spare(p, i)) { + if (idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { @@ -5599,8 +5580,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) min_exit_latency = idle->exit_latency; latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if (idle_cpu(i) && - (!idle || idle->exit_latency == min_exit_latency) && + } else if ((!idle || idle->exit_latency == min_exit_latency) && rq->idle_stamp > latest_idle_timestamp) { /* * If equal or no active idle state, then @@ -5609,13 +5589,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if (shallowest_idle_cpu == -1) { - /* - * If we haven't found an idle CPU yet - * pick a non-idle one that can fit the task as - * fallback. - */ - shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i); @@ -5943,8 +5916,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = (!wake_wide(p) && task_fits_max(p, cpu) && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || + want_affine = (!wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || energy_aware(); rcu_read_lock(); From 986aa1d498f44463a6f425ce40ddd66059a789e5 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 22 Jun 2016 18:03:12 +0100 Subject: [PATCH 0823/3220] UPSTREAM: sched/core: Fix power to capacity renaming in comment It seems that this one escaped Nico's renaming of cpu_power to cpu_capacity a while back.
Change-Id: Ic2569d714db7b740f1df4ccc381ba8c1772c2793 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: linux-kernel@vger.kernel.org Cc: mgalbraith@suse.de Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1466615004-3503-2-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit bd425d4bfc7a1a6064dbbadfbac9c7eec0e426ec) Signed-off-by: Chris Redpath --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index dc3da585bdad..f8096788cb1b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1003,7 +1003,7 @@ extern void wake_up_q(struct wake_q_head *head); #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */ +#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ From 3bb3d7e7d9588334b586084f32495f448745f343 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 22 Jun 2016 18:03:13 +0100 Subject: [PATCH 0824/3220] BACKPORT: sched/fair: Make the use of prev_cpu consistent in the wakeup path In commit: ac66f5477239 ("sched/numa: Introduce migrate_swap()") select_task_rq() got a 'cpu' argument to enable overriding of prev_cpu in special cases (NUMA task swapping). However, the select_task_rq_fair() helper functions: wake_affine() and select_idle_sibling(), still use task_cpu(p) directly to work out prev_cpu, which leads to inconsistencies. This patch passes prev_cpu (potentially overridden by NUMA code) into the helper functions to ensure prev_cpu is indeed the same CPU everywhere in the wakeup path. Change-Id: I4951c4eead2e6045e4fb34e89f6cda17d881d4d7 cc: Ingo Molnar cc: Rik van Riel Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: linux-kernel@vger.kernel.org Cc: mgalbraith@suse.de Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1466615004-3503-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 772bd008cd9a1d4e8ce566f2edcc61d1c28fcbe5) [merged with Android/EAS wakeup path] Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2150edce955a..9b64a0e74da7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -670,7 +670,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP -static int select_idle_sibling(struct task_struct *p, int cpu); +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); /* @@ -1404,7 +1404,8 @@ balance: * Call select_idle_sibling to maybe find a better one. 
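Concretely, the helpers now receive the caller's (possibly NUMA-overridden) prev_cpu instead of recomputing it with task_cpu(p); the call shapes in the diff below become:

	if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
		new_cpu = cpu;
	/* ... */
	new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);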
*/ if (!cur) - env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, + env->dst_cpu); assign: assigned = true; @@ -5280,18 +5281,18 @@ static int wake_wide(struct task_struct *p) return 1; } -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, + int prev_cpu, int sync) { s64 this_load, load; s64 this_eff_load, prev_eff_load; - int idx, this_cpu, prev_cpu; + int idx, this_cpu; struct task_group *tg; unsigned long weight; int balanced; idx = sd->wake_idx; this_cpu = smp_processor_id(); - prev_cpu = task_cpu(p); load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); @@ -5605,11 +5606,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* * Try and locate an idle CPU in the sched_domain. */ -static int select_idle_sibling(struct task_struct *p, int target) +static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; struct sched_group *sg; - int i = task_cpu(p); int best_idle = -1; int best_idle_cstate = -1; int best_idle_capacity = INT_MAX; @@ -5621,8 +5621,8 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * If the prevous cpu is cache affine and idle, don't be stupid. */ - if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) - return i; + if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) + return prev; } /* @@ -5632,6 +5632,7 @@ static int select_idle_sibling(struct task_struct *p, int target) for_each_lower_domain(sd) { sg = sd->groups; do { + int i; if (!cpumask_intersects(sched_group_cpus(sg), tsk_cpus_allowed(p))) goto next; @@ -5942,7 +5943,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) new_cpu = cpu; } @@ -5950,7 +5951,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync); else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ - new_cpu = select_idle_sibling(p, new_cpu); + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } else while (sd) { struct sched_group *group; From bc7c939b3a3adcf2cd15e33677ed9246d77e0074 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 22 Jun 2016 18:03:14 +0100 Subject: [PATCH 0825/3220] UPSTREAM: sched/fair: Optimize find_idlest_cpu() when there is no choice In the current find_idlest_group()/find_idlest_cpu() search we end up calling find_idlest_cpu() in a sched_group containing only one CPU in the end. Checking idle-states becomes pointless when there is no alternative, so bail out instead. 
Change-Id: Ic62bf09b53a7984143ac2431aaa69c69b204cd56 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: linux-kernel@vger.kernel.org Cc: mgalbraith@suse.de Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1466615004-3503-4-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit eaecf41f5abf80b63c8e025fcb9ee4aa203c3038) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9b64a0e74da7..5294c5f47939 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5567,6 +5567,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) int shallowest_idle_cpu = -1; int i; + /* Check if we have any choice: */ + if (group->group_weight == 1) + return cpumask_first(sched_group_cpus(group)); + /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { if (idle_cpu(i)) { From 3e9cdd5ae95a950c9cd3fc55a7f549ebed47aba6 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:21 +0100 Subject: [PATCH 0826/3220] UPSTREAM: sched/core: Remove unnecessary NULL-pointer check Checking if the sched_domain pointer returned by sd_init() is NULL seems pointless as sd_init() neither checks if it is valid to begin with nor sets it to NULL. Change-Id: I5e16fd0c2ca7234b097be7c95409ddb15c5e9de9 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 0e6d2a67a41321b3ef650b780a279a37855de08e) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1df6da0094f0..8c5778053a20 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7275,8 +7275,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, struct sched_domain *child, int cpu) { struct sched_domain *sd = sd_init(tl, cpu); - if (!sd) - return child; cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { From 68a3b157d92cd802f284ceec44fd26c3b3cec86d Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:22 +0100 Subject: [PATCH 0827/3220] UPSTREAM: sched/core: Introduce SD_ASYM_CPUCAPACITY sched_domain topology flag Add a topology flag to the sched_domain hierarchy indicating the lowest domain level where the full range of CPU capacities is represented by the domain members for asymmetric capacity topologies (e.g. ARM big.LITTLE). The flag is intended to indicate that extra care should be taken when placing tasks on CPUs and this level spans all the different types of CPUs found in the system (no need to look further up the domain hierarchy). This information is currently only available by iterating through the capacities of all the CPUs at parent levels in the sched_domain hierarchy.
SD 2      [  0      1      2      3]  SD_ASYM_CPUCAPACITY

SD 1      [  0      1] [   2      3]  !SD_ASYM_CPUCAPACITY

CPU:         0      1      2      3
capacity:  756    756   1024   1024

If the topology in the example above is duplicated to create an eight CPU example with a third sched_domain level on top (SD 3), this level should not have the flag set (!SD_ASYM_CPUCAPACITY) as its two groups would both have all CPU capacities represented within them. Change-Id: I1526407b90567cac387419719b7d7fdc8b259a85 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-6-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 1f6e6c7cb9bcd58abb5ee11243e0eefe6b36fc8e) [trivial merge conflict] Signed-off-by: Chris Redpath --- include/linux/sched.h | 1 + kernel/sched/core.c | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index f8096788cb1b..436f36f768c6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1003,6 +1003,7 @@ extern void wake_up_q(struct wake_q_head *head); #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ +#define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */ #define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8c5778053a20..1294c3cacab0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6037,6 +6037,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | + SD_ASYM_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | SD_SHARE_CAP_STATES)) { @@ -6068,6 +6069,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | @@ -6749,11 +6751,19 @@ static int sched_domains_curr_level; /* * SD_flags allowed in topology descriptions. * - * SD_SHARE_CPUCAPACITY - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain - * SD_SHARE_CAP_STATES - describes shared capacity states + * These flags are purely descriptive of the topology and do not prescribe + * behaviour.
Behaviour is artificial and mapped in the below sd_init() + * function: + * + * SD_SHARE_CPUCAPACITY - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain + * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies + * SD_SHARE_CAP_STATES - describes shared capacity states + * + * Odd one out, which beside describing the topology has a quirk also + * prescribes the desired behaviour that goes along with it: * * Odd one out: * SD_ASYM_PACKING - describes SMT quirks @@ -6763,6 +6773,7 @@ static int sched_domains_curr_level; SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ + SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN | \ SD_SHARE_CAP_STATES) From e51c05769490eede15d97e38b2a470230eea056b Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:23 +0100 Subject: [PATCH 0828/3220] UPSTREAM: sched/core: Pass child domain into sd_init() If behavioural sched_domain flags depend on topology flags set at higher domain levels we need a way to update the child domain flags. Moving the child pointer assignment inside sd_init() should make that possible. Change-Id: If043921fdf102c310adcc9e0280afa33c48c4783 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-7-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 3676b13e8524c576825fe1e731e347dba0083888) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1294c3cacab0..4dbabf491aab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6778,7 +6778,8 @@ static int sched_domains_curr_level; SD_SHARE_CAP_STATES) static struct sched_domain * -sd_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, + struct sched_domain *child, int cpu) { struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); int sd_weight, sd_flags = 0; @@ -6830,6 +6831,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) .smt_gain = 0, .max_newidle_lb_cost = 0, .next_decay_max_lb_cost = jiffies, + .child = child, #ifdef CONFIG_SCHED_DEBUG .name = tl->name, #endif @@ -7285,14 +7287,13 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu); + struct sched_domain *sd = sd_init(tl, child, cpu); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; - sd->child = child; if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { From 5173a85e22f97d5a4573b6c45aefc3ee93ae908b Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:24 +0100 Subject: [PATCH 0829/3220] UPSTREAM: sched/core: Enable SD_BALANCE_WAKE for asymmetric capacity systems A domain with the SD_ASYM_CPUCAPACITY flag set indicate that sched_groups at this level and below do not 
include CPUs of all capacities available (e.g. group containing little-only or big-only CPUs in big.LITTLE systems). It is therefore necessary to put in more effort in finding an appropriate CPU at task wake-up by enabling balancing at wake-up (SD_BALANCE_WAKE) on all lower (child) levels. Change-Id: I4615917f540d03d7e7ef7de8f0da33b1ad97387c Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-8-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 9ee1cda5ee25c7dd82acf25892e0d229e818f8c7) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4dbabf491aab..fcef2588712a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6841,6 +6841,13 @@ sd_init(struct sched_domain_topology_level *tl, * Convert topological properties into behaviour. */ + if (sd->flags & SD_ASYM_CPUCAPACITY) { + struct sched_domain *t = sd; + + for_each_lower_domain(t) + t->flags |= SD_BALANCE_WAKE; + } + if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; From abfff9decebfd6b759ce09dcc810a5ed66252b04 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:26 +0100 Subject: [PATCH 0830/3220] UPSTREAM: sched/fair: Let asymmetric CPU configurations balance at wake-up Currently, SD_WAKE_AFFINE always takes priority over wakeup balancing if SD_BALANCE_WAKE is set on the sched_domains. For asymmetric configurations SD_WAKE_AFFINE is only desirable if the waking task's compute demand (utilization) is suitable for the waking CPU and the previous CPU, and all CPUs within their respective SD_SHARE_PKG_RESOURCES domains (sd_llc). If not, let wakeup balancing take over (find_idlest_{group, cpu}()). This patch makes affine wake-ups conditional on whether both the waker CPU and the previous CPU have sufficient capacity for the waking task, assuming that the CPU capacities within an SD_SHARE_PKG_RESOURCES domain (sd_llc) are homogeneous.
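A worked example, reusing the 756/1024 big.LITTLE capacities shown earlier in this series and assuming a system-wide max capacity of 1024 (capacity_margin = 1280, ~20%, is introduced in the diff below): min_cap = 756 and max_cap = 1024, so max_cap - min_cap = 268 is not below max_cap >> 3 = 128 and the capacity check is not short-circuited. The affine path is then abandoned once min_cap * 1024 < task_util(p) * capacity_margin, i.e. 756 * 1024 = 774144 < task_util(p) * 1280, which holds for any task utilization above roughly 604; such wake-ups fall back to find_idlest_{group,cpu}().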
Change-Id: I6d5d0426713da9ef6198f574ad9afbe58dacc1f0 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-10-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 3273163c6775c4c21823985304c2364b08ca6ea2) [removed existing definition of capacity_margin] Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 57 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5294c5f47939..19edbc48b1bc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -128,6 +128,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +/* + * The margin used when comparing utilization with CPU capacity: + * util * 1024 < capacity * margin + */ +unsigned int capacity_margin = 1280; /* ~20% */ + static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; @@ -5358,8 +5364,6 @@ static inline unsigned long task_util(struct task_struct *p) return p->se.avg.util_avg; } -unsigned int capacity_margin = 1280; /* ~20% margin */ - static inline unsigned long boosted_task_util(struct task_struct *task); static inline bool __task_fits(struct task_struct *p, int cpu, int util) @@ -5496,6 +5500,13 @@ boosted_task_util(struct task_struct *task) return util + margin; } +static int cpu_util_wake(int cpu, struct task_struct *p); + +static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) +{ + return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -5899,6 +5910,45 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) return target_cpu; } +/* + * cpu_util_wake: Compute cpu utilization with any contributions from + * the waking task p removed. + */ +static int cpu_util_wake(int cpu, struct task_struct *p) +{ + unsigned long util, capacity; + + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !p->se.avg.last_update_time) + return cpu_util(cpu); + + capacity = capacity_orig_of(cpu); + util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); + + return (util >= capacity) ? capacity : util; +} + +/* + * Disable WAKE_AFFINE in the case where task @p doesn't fit in the + * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. + * + * In that case WAKE_AFFINE doesn't make sense and we'll let + * BALANCE_WAKE sort things out. + */ +static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) +{ + long min_cap, max_cap; + + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); + max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val; + + /* Minimum capacity is close to max, no need to abort wake_affine */ + if (max_cap - min_cap < max_cap >> 3) + return 0; + + return min_cap * 1024 < task_util(p) * capacity_margin; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. 
In practice, this is SD_BALANCE_WAKE, @@ -5921,7 +5971,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = (!wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || + want_affine = (!wake_wide(p) && !wake_cap(p, cpu, prev_cpu) + && cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || energy_aware(); rcu_read_lock(); From 68c27298cde48dcdd2714af992879e5b9fc74c5c Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:07 +0100 Subject: [PATCH 0831/3220] UPSTREAM: sched/fair: Compute task/cpu utilization at wake-up correctly At task wake-up load-tracking isn't updated until the task is enqueued. The task's own view of its utilization contribution may therefore not be aligned with its contribution to the cfs_rq load-tracking which may have been updated in the meantime. Basically, the task's own utilization hasn't yet accounted for the sleep decay, while the cfs_rq may have (partially). Estimating the cfs_rq utilization in case the task is migrated at wake-up as task_rq(p)->cfs.avg.util_avg - p->se.avg.util_avg is therefore incorrect as the two load-tracking signals aren't time synchronized (different last update). To solve this problem, this patch synchronizes the task utilization with its previous rq before the task utilization is used in the wake-up path. Currently the update/synchronization is done _after_ the task has been placed by select_task_rq_fair(). The synchronization is done without having to take the rq lock using the existing mechanism used in remove_entity_load_avg(). Change-Id: I5605cca0c94c6ba43d9ce11554765a2456cf85bc Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-2-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 104cb16d9eb684f071d5bf3aa87c0d01af259b7c) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 19edbc48b1bc..615e1dcab36f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2924,6 +2924,19 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) } #endif +/* + * Synchronize entity load avg of dequeued entity without locking + * the previous rq. + */ +void sync_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); +} + /* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). 
@@ -2931,7 +2944,6 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; /* * Newly created task or never used group entity should not be removed @@ -2940,9 +2952,7 @@ void remove_entity_load_avg(struct sched_entity *se) if (se->avg.last_update_time == 0) return; - last_update_time = cfs_rq_last_update_time(cfs_rq); - - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + sync_entity_load_avg(se); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } @@ -5946,6 +5956,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) if (max_cap - min_cap < max_cap >> 3) return 0; + /* Bring task utilization in sync with prev_cpu */ + sync_entity_load_avg(&p->se); + return min_cap * 1024 < task_util(p) * capacity_margin; } From f3f132b8e5503e750c2b89de83c3191e58798170 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:08 +0100 Subject: [PATCH 0832/3220] UPSTREAM: sched/fair: Consider spare capacity in find_idlest_group() In low-utilization scenarios comparing relative loads in find_idlest_group() doesn't always lead to the most optimum choice. Systems with groups containing different numbers of cpus and/or cpus of different compute capacity are significantly better off when considering spare capacity rather than relative load in those scenarios. In addition to existing load based search an alternative spare capacity based candidate sched_group is found and selected instead if sufficient spare capacity exists. If not, existing behaviour is preserved. Change-Id: I6097af76c302a5a12e240ca24c70f707ad118242 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 6a0b19c0f39a7a7b7fb77d3867a733136ff059a3) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 42 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 615e1dcab36f..82eb26bd4361 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5526,7 +5526,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; + struct sched_group *most_spare_sg = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -5534,7 +5536,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load; + unsigned long load, avg_load, spare_cap, max_spare_cap; int local_group; int i; @@ -5546,8 +5548,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); - /* Tally up the load of all CPUs in the group */ + /* + * Tally up the load of all CPUs in the group and find + * the group containing the CPU with most spare 
capacity. + */ avg_load = 0; + max_spare_cap = 0; for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ @@ -5557,6 +5563,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load = target_load(i, load_idx); avg_load += load; + + spare_cap = capacity_spare_wake(i, p); + + if (spare_cap > max_spare_cap) + max_spare_cap = spare_cap; } /* Adjust by relative CPU capacity of the group */ @@ -5564,12 +5575,33 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (local_group) { this_load = avg_load; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; + this_spare = max_spare_cap; + } else { + if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + + if (most_spare < max_spare_cap) { + most_spare = max_spare_cap; + most_spare_sg = group; + } } } while (group = group->next, group != sd->groups); + /* + * The cross-over point between using spare capacity or least load + * is too conservative for high utilization tasks on partially + * utilized systems if we require spare_capacity > task_util(p), + * so we allow for some task stuffing by using + * spare_capacity > task_util(p)/2. + */ + if (this_spare > task_util(p) / 2 && + imbalance*this_spare > 100*most_spare) + return NULL; + else if (most_spare > task_util(p) / 2) + return most_spare_sg; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; From 60cc9f4e1e9fa3e951fb5a04d420da6eeb3067ee Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:09 +0100 Subject: [PATCH 0833/3220] UPSTREAM: sched/fair: Add per-CPU min capacity to sched_group_capacity struct sched_group_capacity currently represents the compute capacity sum of all CPUs in the sched_group. Unless it is divided by the group_weight to get the average capacity per CPU, it hides differences in CPU capacity for mixed capacity systems (e.g. high RT/IRQ utilization or ARM big.LITTLE). But even the average may not be sufficient if the group covers CPUs of different capacities. Instead, by extending struct sched_group_capacity to indicate min per-CPU capacity in the group a suitable group for a given task utilization can more easily be found such that CPUs with reduced capacity can be avoided for tasks with high utilization (not implemented by this patch). 
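To see why the sum or the average can mislead (illustrative numbers only): a group with per-CPU capacities {756, 1024} averages 890, so a task with util_avg 800 looks like it fits, yet it exceeds the 756 CPU. A hypothetical fit check built on the new field, using the capacity_margin convention from earlier in this series, could look like:

	/* hypothetical helper, not part of this patch */
	static bool fits_group_min(unsigned long task_util,
				   struct sched_group_capacity *sgc)
	{
		/* compare against the weakest CPU, with ~20% headroom */
		return task_util * capacity_margin < sgc->min_capacity * 1024;
	}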
Change-Id: If3cae1be62d01a199e752bca5abb45357d5d0fbd Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-4-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit bf475ce0a3dd75b5d1df6c6c14ae25168caa15ac) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 1 + kernel/sched/fair.c | 7 ++++++- kernel/sched/sched.h | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fcef2588712a..948ef2438606 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6459,6 +6459,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) */ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; /* * Make sure the first group of this domain contains the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 82eb26bd4361..d18e2c7b67e5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7268,13 +7268,14 @@ skip_unlock: __attribute__ ((unused)); cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; sdg->sgc->max_capacity = capacity; + sdg->sgc->min_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, max_capacity; + unsigned long capacity, max_capacity, min_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -7288,6 +7289,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity = 0; max_capacity = 0; + min_capacity = ULONG_MAX; if (child->flags & SD_OVERLAP) { /* @@ -7318,6 +7320,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } max_capacity = max(capacity, max_capacity); + min_capacity = min(capacity, min_capacity); } } else { /* @@ -7331,12 +7334,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity += sgc->capacity; max_capacity = max(sgc->max_capacity, max_capacity); + min_capacity = min(sgc->min_capacity, min_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; sdg->sgc->max_capacity = max_capacity; + sdg->sgc->min_capacity = min_capacity; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 237b0bcdd304..6bc2dd623b17 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -899,6 +899,7 @@ struct sched_group_capacity { */ unsigned long capacity; unsigned long max_capacity; /* Max per-cpu capacity in group */ + unsigned long min_capacity; /* Min per-CPU capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* From adc7f08b2fb617a7b1c30e9dd87e36dcfbe9facb Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:10 +0100 Subject: [PATCH 0834/3220] UPSTREAM: sched/fair: Avoid pulling tasks from non-overloaded higher capacity groups For asymmetric CPU capacity systems it is counter-productive for throughput if low capacity CPUs are pulling tasks from non-overloaded CPUs with higher capacity. 
The assumption is that higher CPU capacity is preferred over running alone in a group with lower CPU capacity. This patch rejects higher CPU capacity groups with one or fewer tasks per CPU as a potential busiest group which could otherwise lead to a series of failing load-balancing attempts that end in a forced migration. Change-Id: I428875bb6267c780026ef75e2882300738d016e7 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 9e0994c0a1c1f82c705f1f66388e1bcffcee8bb9) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d18e2c7b67e5..c00a8ae48a46 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7569,14 +7569,20 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->avg_load <= busiest->avg_load) return false; + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) + goto asym_packing; + /* - * Candiate sg has no more than one task per cpu and has higher - * per-cpu capacity. No reason to pull tasks to less capable cpus. + * Candidate sg has no more than one task per CPU and + * has higher per-CPU capacity. Migrating tasks to less + * capable CPUs may harm throughput. Maximize throughput, + * power/energy consequences are not considered. */ if (sgs->sum_nr_running <= sgs->group_weight && group_smaller_cpu_capacity(sds->local, sg)) return false; +asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; From c6cc7ca9151303d56b519ffcf129c5ecbca9af58 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:12 +0100 Subject: [PATCH 0835/3220] UPSTREAM: sched/fair: Fix incorrect comment for capacity_margin The comment for capacity_margin introduced in: 3273163c6775 ("sched/fair: Let asymmetric CPU configurations balance at wake-up") ... got its usage the wrong way round - fix it.
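Worked through with the actual value, margin = 1280 (~20% headroom), the corrected form util * margin < capacity * 1024 behaves as expected: a task with util 800 fits a 1024-capacity CPU since 800 * 1280 = 1024000 < 1024 * 1024 = 1048576, while util 850 does not (850 * 1280 = 1088000 >= 1048576).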
Change-Id: Ie46eac3e5ff43397b5bed61d0999d2817f1a1d96 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-7-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 893c5d2279041afeb593f1fa8edd9d02edf5b7cb) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c00a8ae48a46..87c03eaacec4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -130,7 +130,7 @@ unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; /* * The margin used when comparing utilization with CPU capacity: - * util * 1024 < capacity * margin + * util * margin < capacity * 1024 */ unsigned int capacity_margin = 1280; /* ~20% */ From 65cad23dcdf7334c785a3c7fd74bdec633156364 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 30 Nov 2016 20:36:31 +0000 Subject: [PATCH 0836/3220] arm64: Set SD_ASYM_CPUCAPACITY sched_domain flag on DIE level Enable Capacity-Aware Scheduling. This is a shortcut from the EAS integration implementation. We know there are SoCs in use which have asymmetric cpu capacities on DIE level. Change-Id: I7f1326e86b8fc08b65d1c1341da7d1ef1013dcbc Signed-off-by: Dietmar Eggemann (cherry picked from commit 82f95a8cdff67770f02b7e4f5b1c084f42afc0e2) Signed-off-by: Chris Redpath --- arch/arm64/kernel/topology.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 5b2c67a510d8..7758f7ff131b 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -258,6 +258,11 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return &cpu_topology[cpu].core_sibling; } +static int cpu_cpu_flags(void) +{ + return SD_ASYM_CPUCAPACITY; +} + static inline int cpu_corepower_flags(void) { return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \ @@ -268,7 +273,7 @@ static struct sched_domain_topology_level arm64_topology[] = { #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) }, #endif - { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) }, + { cpu_cpu_mask, cpu_cpu_flags, cpu_cluster_energy, SD_INIT_NAME(DIE) }, { NULL, }, }; From 168228463cacf92ee12563ddf612b7489dc3d3fa Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 1 Dec 2016 18:37:58 +0000 Subject: [PATCH 0837/3220] sched/fair: Do not force want_affine eq. true if EAS is enabled This lets us use Capacity-Aware Scheduling (CAS) if EAS is enabled.
Change-Id: I2e647a201ea0b733d1487c3e153047a49fb22847 Signed-off-by: Dietmar Eggemann (cherry picked from commit 00b7da2ae58bf568529e67614980f77e275b8d29) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 87c03eaacec4..976e7457b4fa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6016,9 +6016,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = (!wake_wide(p) && !wake_cap(p, cpu, prev_cpu) - && cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || - energy_aware(); + want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) + && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { From 3b6ba235bcf3827d4911f3eb27bae5ac0f4dbbc8 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 26 Jan 2017 16:04:34 +0000 Subject: [PATCH 0838/3220] sched/fair: Decommission energy_aware_wake_cpu() The EAS functionality in the wakeup path will be brought back by the following patch ("sched/fair: Energy-aware wake-up task placement") providing the function select_energy_cpu_brute(). Change-Id: I927fb9e8261cfacfe404695f853941c7959aa146 [ Trivial merge conflicts resolved. ] Signed-off-by: Chris Redpath Signed-off-by: Dietmar Eggemann (cherry picked from commit 80aee424fb7765a777267e144037642625a71304) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 120 +------------------------------------------- 1 file changed, 1 insertion(+), 119 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 976e7457b4fa..ef16d4d3cc00 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5836,122 +5836,6 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre return target_cpu; } -static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) -{ - struct sched_domain *sd; - struct sched_group *sg, *sg_target; - int target_max_cap = INT_MAX; - int target_cpu = task_cpu(p); - unsigned long task_util_boosted, new_util; - int i; - - if (sysctl_sched_sync_hint_enable && sync) { - int cpu = smp_processor_id(); - cpumask_t search_cpus; - cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask); - if (cpumask_test_cpu(cpu, &search_cpus)) - return cpu; - } - - sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p))); - - if (!sd) - return target; - - sg = sd->groups; - sg_target = sg; - - if (sysctl_sched_is_big_little) { - - /* - * Find group with sufficient capacity. We only get here if no cpu is - * overutilized. We may end up overutilizing a cpu by adding the task, - * but that should not be any worse than select_idle_sibling(). - * load_balance() should sort it out later as we get above the tipping - * point. - */ - do { - /* Assuming all cpus are the same in group */ - int max_cap_cpu = group_first_cpu(sg); - - /* - * Assume smaller max capacity means more energy-efficient. - * Ideally we should query the energy model for the right - * answer but it easily ends up in an exhaustive search. 
- */ - if (capacity_of(max_cap_cpu) < target_max_cap && - task_fits_max(p, max_cap_cpu)) { - sg_target = sg; - target_max_cap = capacity_of(max_cap_cpu); - } - } while (sg = sg->next, sg != sd->groups); - - task_util_boosted = boosted_task_util(p); - /* Find cpu with sufficient capacity */ - for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { - /* - * p's blocked utilization is still accounted for on prev_cpu - * so prev_cpu will receive a negative bias due to the double - * accounting. However, the blocked utilization may be zero. - */ - new_util = cpu_util(i) + task_util_boosted; - - /* - * Ensure minimum capacity to grant the required boost. - * The target CPU can be already at a capacity level higher - * than the one required to boost the task. - */ - if (new_util > capacity_orig_of(i)) - continue; - - if (new_util < capacity_curr_of(i)) { - target_cpu = i; - if (cpu_rq(i)->nr_running) - break; - } - - /* cpu has capacity at higher OPP, keep it as fallback */ - if (target_cpu == task_cpu(p)) - target_cpu = i; - } - } else { - /* - * Find a cpu with sufficient capacity - */ -#ifdef CONFIG_CGROUP_SCHEDTUNE - bool boosted = schedtune_task_boost(p) > 0; - bool prefer_idle = schedtune_prefer_idle(p) > 0; -#else - bool boosted = 0; - bool prefer_idle = 0; -#endif - int tmp_target = find_best_target(p, boosted, prefer_idle); - if (tmp_target >= 0) { - target_cpu = tmp_target; - if ((boosted || prefer_idle) && idle_cpu(target_cpu)) - return target_cpu; - } - } - - if (target_cpu != task_cpu(p)) { - struct energy_env eenv = { - .util_delta = task_util(p), - .src_cpu = task_cpu(p), - .dst_cpu = target_cpu, - .task = p, - }; - - /* Not enough spare capacity on previous cpu */ - if (cpu_overutilized(task_cpu(p))) - return target_cpu; - - if (energy_diff(&eenv) >= 0) - return task_cpu(p); - } - - return target_cpu; -} - /* * cpu_util_wake: Compute cpu utilization with any contributions from * the waking task p removed. @@ -6047,9 +5931,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } if (!sd) { - if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) - new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync); - else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } else while (sd) { From 02cbde61f4d895c25a422813b066d5537f8812fa Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 30 Mar 2016 14:20:12 +0100 Subject: [PATCH 0839/3220] sched/fair: Add energy_diff dead-zone margin It is not worth the overhead to migrate tasks for tiny insignificant energy savings. To prevent this, an energy margin is introduced in energy_diff() which effectively adds a dead-zone that rounds tiny energy differences to zero. Since no scale is enforced for energy model data the margin can't be absolute. Instead it is defined as +/-1.56% energy saving compared to the current total estimated energy consumption. 
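As a worked example (not part of the patch): the margin is computed as nrg.before >> 6, i.e. 1/64 or roughly 1.56% of the current total estimated energy. A minimal stand-alone sketch of the rounding, with nrg_deadzone() as a hypothetical helper name:

	/* Illustrative only: round energy deltas inside the dead-zone to zero. */
	static int nrg_deadzone(int nrg_before, int nrg_after)
	{
		int margin = nrg_before >> 6;	/* 1/64 =~ 1.56% */
		int diff = nrg_after - nrg_before;

		return (abs(diff) < margin) ? 0 : diff;
	}

E.g. nrg_before = 1024 gives margin = 16, so an estimated saving of up to 15 units is rounded to zero and the migration is skipped as not worthwhile.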
Change-Id: I6be069c752c701fb825430896b3b768a7ab2fee4 Signed-off-by: Morten Rasmussen [rebase: on top of msm-google/android-msm-marlin-3.18, massage original patch which changes code in energy_diff() into __energy_diff() introduced by SchedTune] Signed-off-by: Dietmar Eggemann Signed-off-by: Chris Redpath (cherry picked from commit 780cb5a5fa47adf13d4fc2b77e8e94448cd56098) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ef16d4d3cc00..16f15549b5d0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5149,6 +5149,7 @@ static inline int __energy_diff(struct energy_env *eenv) struct sched_domain *sd; struct sched_group *sg; int sd_cpu = -1, energy_before = 0, energy_after = 0; + int diff, margin; struct energy_env eenv_before = { .util_delta = 0, @@ -5198,6 +5199,16 @@ static inline int __energy_diff(struct energy_env *eenv) eenv->cap.before, eenv->cap.after, eenv->cap.delta, eenv->nrg.delta, eenv->payoff); + /* + * Dead-zone margin preventing too many migrations. + */ + + margin = eenv->nrg.before >> 6; /* ~1.56% */ + + diff = eenv->nrg.after - eenv->nrg.before; + + eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff; + return eenv->nrg.diff; } From 3935105f5775e8f37dd06c73e5cc30e5585d673c Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 30 Mar 2016 14:29:48 +0100 Subject: [PATCH 0840/3220] sched/fair: Energy-aware wake-up task placement When the system is not overutilized, place waking tasks on the most energy efficient cpu. Previous attempts reduced the search space by matching task utilization to cpu capacity before consulting the energy model as this is an expensive operation. The search heuristics didn't work very well and, lacking any better alternatives, this patch takes the brute-force route and tries all potential targets. This approach doesn't scale, but it might be sufficient for many embedded applications while work is continuing on a heuristic that can minimize the necessary computations. The heuristic must be derived from the platform energy model rather than make additional assumptions, such as lower capacity implying better energy efficiency. PeterZ mentioned in the past that we might be able to derive some simpler deciding functions using mathematical (modal?) analysis.
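The shape of the brute-force walk, condensed from the hunk below (illustrative only; the bookkeeping for the maximum-spare-capacity fallback is omitted): every allowed CPU in the sd_ea span is evaluated with energy_diff() and the largest estimated saving wins.

	for_each_cpu_and(i, tsk_cpus_allowed(p), sched_domain_span(sd)) {
		struct energy_env eenv = {
			.util_delta = task_util(p),
			.src_cpu    = prev_cpu,
			.dst_cpu    = i,
		};
		int diff;

		/* Skip candidates without enough spare capacity for the task. */
		if (capacity_spare_wake(i, p) * 1024 < capacity_margin * task_util(p))
			continue;

		diff = energy_diff(&eenv);	/* negative => migrating to i saves energy */
		if (diff < min_diff) {
			min_diff = diff;
			energy_cpu = i;
		}
	}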
Change-Id: I772bacb4c8fd599f8006fa422f842e66377a9c6c Signed-off-by: Morten Rasmussen [rebase: on top of msm-google/android-msm-marlin-3.18] Signed-off-by: Dietmar Eggemann (cherry picked from commit a894422dbdb7b77ea2acfe7ff909ccb5ded23514) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 57 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 16f15549b5d0..c4d9d8bf8d1f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5889,6 +5889,60 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) return min_cap * 1024 < task_util(p) * capacity_margin; } +static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) +{ + int i; + int min_diff = 0, energy_cpu = prev_cpu, spare_cpu = prev_cpu; + unsigned long max_spare = 0; + struct sched_domain *sd; + + rcu_read_lock(); + + sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); + + if (!sd) + goto unlock; + + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_domain_span(sd)) { + int diff; + unsigned long spare; + + struct energy_env eenv = { + .util_delta = task_util(p), + .src_cpu = prev_cpu, + .dst_cpu = i, + }; + + spare = capacity_spare_wake(i, p); + + if (i == prev_cpu) + continue; + + if (spare > max_spare) { + max_spare = spare; + spare_cpu = i; + } + + if (spare * 1024 < capacity_margin * task_util(p)) + continue; + + diff = energy_diff(&eenv); + + if (diff < min_diff) { + min_diff = diff; + energy_cpu = i; + } + } + +unlock: + rcu_read_unlock(); + + if (energy_cpu == prev_cpu && !cpu_overutilized(prev_cpu)) + return prev_cpu; + + return energy_cpu != prev_cpu ? energy_cpu : spare_cpu; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5914,6 +5968,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) + return select_energy_cpu_brute(p, prev_cpu); + rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) From f6f931489311aa208255192bf22d189d095f5b9a Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 5 Dec 2016 14:15:54 +0000 Subject: [PATCH 0841/3220] Fixup!: sched/fair.c: Set SchedTune specific struct energy_env.task This has to be done in the callers of the SchedTune version of energy_diff() to avoid a NULL pointer dereference in energy_diff().
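For illustration (not part of the patch): the SchedTune variant of energy_diff() dereferences eenv->task, so every caller has to populate the field, e.g.:

	struct energy_env eenv = {
		.util_delta = task_util(p),
		.src_cpu    = prev_cpu,
		.dst_cpu    = target_cpu,
		.task       = p,	/* left unset, SchedTune's energy_diff() dereferences NULL */
	};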
Change-Id: I3f0f68dbd11efb15bbb3b1832f8294419ed85241 Signed-off-by: Dietmar Eggemann (cherry picked from commit 14531d4e245d063f713ee5ed835df958e6c7838f) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c4d9d8bf8d1f..ae3cc8df331c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5911,6 +5911,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) .util_delta = task_util(p), .src_cpu = prev_cpu, .dst_cpu = i, + .task = p, }; spare = capacity_spare_wake(i, p); From f6f931489311aa208255192bf22d189d095f5b9a Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 5 Dec 2016 14:15:54 +0000 Subject: [PATCH 0842/3220] EAS: sched/fair: Re-integrate 'honor sync wakeups' into wakeup path This patch re-integrates the sync-wakeup handling which was initially added to energy_aware_wake_cpu() by 3b9d7554aeec ("EAS: sched/fair: tunable to honor sync wakeups"), this time into select_energy_cpu_brute(). Change-Id: I748fde3ecdeb44651179bce0a5bb8dd82d1903f6 Signed-off-by: Dietmar Eggemann (cherry picked from commit b75b7286cb068d5761621ea134c23dd131db953f) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ae3cc8df331c..5e4bf1e76275 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5889,13 +5889,21 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) return min_cap * 1024 < task_util(p) * capacity_margin; } -static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) +static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync) { int i; int min_diff = 0, energy_cpu = prev_cpu, spare_cpu = prev_cpu; unsigned long max_spare = 0; struct sched_domain *sd; + if (sysctl_sched_sync_hint_enable && sync) { + int cpu = smp_processor_id(); + cpumask_t search_cpus; + cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask); + if (cpumask_test_cpu(cpu, &search_cpus)) + return cpu; + } + rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); @@ -5970,7 +5978,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) - return select_energy_cpu_brute(p, prev_cpu); + return select_energy_cpu_brute(p, prev_cpu, sync); rcu_read_lock(); for_each_domain(cpu, tmp) { From 9e92e8a24fa5bfec70cc70cbfa81e2d34e887634 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 22 Mar 2017 18:16:03 +0000 Subject: [PATCH 0843/3220] sched/fair: Code !is_big_little path into select_energy_cpu_brute() This patch replaces the existing EAS upstream implementation of select_energy_cpu_brute() with the find_best_target() based one used in Android previously. It also removes the cpumask 'and' from select_energy_cpu_brute, see the existing use of 'cpu = smp_processor_id()' in select_task_rq_fair().
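The resulting control flow, condensed from the hunk below (illustrative only): a sync-wakeup fast path first, then find_best_target() to pick a candidate, then an energy_diff() check before accepting a migration away from prev_cpu.

	if (sysctl_sched_sync_hint_enable && sync) {
		int cpu = smp_processor_id();

		/* honor the sync hint: stay on the waker's CPU if allowed */
		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
			return cpu;
	}

	tmp_target = find_best_target(p, boosted, prefer_idle);
	/* ... and later veto the move if the model predicts no energy saving: */
	if (energy_diff(&eenv) >= 0)
		target_cpu = prev_cpu;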
Change-Id: If678c002efaa87d1ba3ec9989a4e9f8df98b83ec Signed-off-by: Dietmar Eggemann [ added guarding for non-schedtune builds ] Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 66 ++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 37 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5e4bf1e76275..387950f2649d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5891,65 +5891,57 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync) { - int i; - int min_diff = 0, energy_cpu = prev_cpu, spare_cpu = prev_cpu; - unsigned long max_spare = 0; struct sched_domain *sd; + int target_cpu = prev_cpu, tmp_target; + bool boosted, prefer_idle; if (sysctl_sched_sync_hint_enable && sync) { int cpu = smp_processor_id(); - cpumask_t search_cpus; - cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask); - if (cpumask_test_cpu(cpu, &search_cpus)) + + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) return cpu; } rcu_read_lock(); +#ifdef CONFIG_CGROUP_SCHEDTUNE + boosted = schedtune_task_boost(p) > 0; + prefer_idle = schedtune_prefer_idle(p) > 0; +#else + boosted = get_sysctl_sched_cfs_boost() > 0; + prefer_idle = 0; +#endif sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); + /* Find a cpu with sufficient capacity */ + tmp_target = find_best_target(p, boosted, prefer_idle); if (!sd) goto unlock; + if (tmp_target >= 0) { + target_cpu = tmp_target; + if ((boosted || prefer_idle) && idle_cpu(target_cpu)) + goto unlock; + } - for_each_cpu_and(i, tsk_cpus_allowed(p), sched_domain_span(sd)) { - int diff; - unsigned long spare; - + if (target_cpu != prev_cpu) { struct energy_env eenv = { - .util_delta = task_util(p), - .src_cpu = prev_cpu, - .dst_cpu = i, - .task = p, + .util_delta = task_util(p), + .src_cpu = prev_cpu, + .dst_cpu = target_cpu, + .task = p, }; - spare = capacity_spare_wake(i, p); + /* Not enough spare capacity on previous cpu */ + if (cpu_overutilized(prev_cpu)) + goto unlock; - if (i == prev_cpu) - continue; - - if (spare > max_spare) { - max_spare = spare; - spare_cpu = i; - } - - if (spare * 1024 < capacity_margin * task_util(p)) - continue; - - diff = energy_diff(&eenv); - - if (diff < min_diff) { - min_diff = diff; - energy_cpu = i; - } + if (energy_diff(&eenv) >= 0) + target_cpu = prev_cpu; } unlock: rcu_read_unlock(); - - if (energy_cpu == prev_cpu && !cpu_overutilized(prev_cpu)) - return prev_cpu; - - return energy_cpu != prev_cpu ? energy_cpu : spare_cpu; + return target_cpu; } /* From 242695407af782e37d53631a4d41bc213f35cf27 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 13 Jan 2017 13:51:24 +0000 Subject: [PATCH 0844/3220] sched: Remove sysctl_sched_is_big_little With the new wakeup approach this sysctl is not necessary any more. 
Change-Id: I52114b3c918791f6a4f9f30f50002919ccbc1a9c Signed-off-by: Dietmar Eggemann (cherry picked from commit 885c0d503bcdf0ef4e9b46822496f16b20aa3bbd) Signed-off-by: Chris Redpath --- include/linux/sched/sysctl.h | 1 - kernel/sched/fair.c | 1 - kernel/sysctl.c | 11 ++--------- 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index d68e88c9d4d7..2bf4520d8089 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -39,7 +39,6 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; -extern unsigned int sysctl_sched_is_big_little; extern unsigned int sysctl_sched_sync_hint_enable; extern unsigned int sysctl_sched_initial_task_util; extern unsigned int sysctl_sched_cstate_aware; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 387950f2649d..770d0ab957cc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -53,7 +53,6 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; -unsigned int sysctl_sched_is_big_little = 0; unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_initial_task_util = 0; unsigned int sysctl_sched_cstate_aware = 1; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8b0542861c42..8ea768245b92 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -306,8 +306,8 @@ static struct ctl_table kern_table[] = { .extra2 = &max_sched_granularity_ns, }, { - .procname = "sched_is_big_little", - .data = &sysctl_sched_is_big_little, + .procname = "sched_sync_hint_enable", + .data = &sysctl_sched_sync_hint_enable, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, @@ -342,13 +342,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif - { - .procname = "sched_sync_hint_enable", - .data = &sysctl_sched_sync_hint_enable, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "sched_initial_task_util", .data = &sysctl_sched_initial_task_util, From 3e44a647c057c8ddc7052a221a8968bc55accfd1 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sun, 8 Jan 2017 14:40:38 +0000 Subject: [PATCH 0845/3220] sched/core: Remove remnants of commit fd5c98da1a42 Commit fd5c98da1a42 "WIP: sched: Store system-wide maximum cpu capacity in root domain" was replaced by commit 8148bdfff4f5 "WIP: sched: Update max cpu capacity in case of max frequency constraints" which didn't remove all the now unused bits.
Change-Id: I067f6366431f43337cffa7a2a8e0de32dd33d2f9 Signed-off-by: Dietmar Eggemann (cherry picked from commit 6d284a607cec51bcafca313bc396bc3103b1e876) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 948ef2438606..9efcfb3d0fc1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7332,7 +7332,6 @@ static int build_sched_domains(const struct cpumask *cpu_map, enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; - struct rq *rq = NULL; int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); @@ -7386,7 +7385,6 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { - rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); } From 633b98b6519019e4960a82eb7de07e736d10cca1 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sun, 8 Jan 2017 16:16:59 +0000 Subject: [PATCH 0846/3220] sched/core: Add first cpu w/ max/min orig capacity to root domain This will allow the wakeup path to start iterating from a cpu with max or min original capacity, regardless of which cpu the scheduler is currently running on (smp_processor_id()) or the previous cpu of the task (task_cpu(p)). This iteration has to happen on a sched_domain spanning all cpus in the order of the sched_groups of this sched_domain seen by the starting cpu. In case of an SMP system the first cpu with max orig capacity and the one with min orig capacity are the same. This can temporarily happen on a big.LITTLE system with hotplug as well. E.g. the different order of cpu iteration can be used to map schedtune task parameter 'boosted' into the cpu iteration order in find_best_target(). Use of READ_ONCE()/WRITE_ONCE() to avoid load/store tearing. Change-Id: I812fbd9c7e5f506617e456c0eec3edcd2c016e92 Signed-off-by: Dietmar Eggemann (cherry picked from commit fd6e9543c1fd8971a5e2e68e39b2f6e591d46114) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 15 +++++++++++++++ kernel/sched/sched.h | 3 +++ 2 files changed, 18 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9efcfb3d0fc1..495bc41907d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6155,6 +6155,9 @@ static int init_rootdomain(struct root_domain *rd) goto free_rto_mask; init_max_cpu_capacity(&rd->max_cpu_capacity); + + rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1; + return 0; free_rto_mask: @@ -7385,7 +7388,19 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { + int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu); + int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu); + + if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig > + cpu_rq(max_cpu)->cpu_capacity_orig)) + WRITE_ONCE(d.rd->max_cap_orig_cpu, i); + + if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig < + cpu_rq(min_cpu)->cpu_capacity_orig)) + WRITE_ONCE(d.rd->min_cap_orig_cpu, i); + sd = *per_cpu_ptr(d.sd, i); + cpu_attach_domain(sd, d.rd, i); } rcu_read_unlock(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6bc2dd623b17..2051fecdb9e5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -561,6 +561,9 @@ struct root_domain { /* Maximum cpu capacity in the system.
*/ struct max_cpu_capacity max_cpu_capacity; + + /* First cpu with maximum and minimum original capacity */ + int max_cap_orig_cpu, min_cap_orig_cpu; }; extern struct root_domain def_root_domain; From d3f5e8c3e9aca6fe9f80a0a70a3f8ef6169ff852 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 13 Jan 2017 17:54:34 +0000 Subject: [PATCH 0847/3220] sched/fair: Change cpu iteration order in find_best_target() The schedtune task parameter 'boosted' is mapped into the cpu iteration order. Currently for 'boosted' equal true the iteration starts at the last cpu (NR_CPUS-1) whereas for 'boosted' equal false it starts at the first cpu (0). This only has the desired effect if the cpu topology ordering matches the underlying assumption. This e.g. is the case for the Qc snapdragon 821 with its [L0 L1 b0 b1] cpu topology layout (L=lower max freq, b=higher max freq). This results in cpus with higher maximum capacity being given the highest logical cpu ids. However not all big.LITTLE systems enumerate their cpus in the same way. For example, the ARM Versatile Express Juno board has 6 cpus for which the default configuration has topology [L0 b0 b1 L1 L2 L3]. To make this approach independent of the cpu topology layout it now iterates over the cpus in the order of the sched_groups of the EAS sched_domain (sd_ea). The order of cpu iteration is different for the different cpu types because the cpu itself is used to dereference sd_ea. Considering the Qc snapdragon 821 again, for cpu L0 and L1 the order is 'L0->L1->b0->b1' whereas for b0 and b1 the order is 'b0->b1->L0->L1'. This approach does not allow the exact same iteration order as with the currently used flat iteration over [0 .. NR_CPUS-1] but the cpus are ordered by the original cpu capacity. The cpu iteration is now done in the sd_ea sched_group order required by the 'boosted' value ['L0->L1->b0->b1'/'b0->b1->L0->L1'] rather than forward/backward over the flat cpu space ['L0->L1->b0->b1'/ 'b1->b0->L1->L0']. Change-Id: I8fbe2073dedd2ecb1c750620c6000c11a5ff4358 Signed-off-by: Dietmar Eggemann (cherry picked from commit a0c6a4272c3968c0ff50d3fed65f5865b72d777b) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 173 +++++++++++++++++++++++++------------------- 1 file changed, 99 insertions(+), 74 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 770d0ab957cc..d9c732de3295 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5743,100 +5743,125 @@ done: return target; } +static int start_cpu(bool boosted) +{ + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + + RCU_LOCKDEP_WARN(rcu_read_lock_sched_held(), + "sched RCU must be held"); + + return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu; +} + static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) { - int iter_cpu; int target_cpu = -1; int target_util = 0; int backup_capacity = 0; int best_idle_cpu = -1; int best_idle_cstate = INT_MAX; int backup_cpu = -1; - unsigned long task_util_boosted, new_util; + unsigned long min_util; + unsigned long new_util; + struct sched_domain *sd; + struct sched_group *sg; + int cpu = start_cpu(boosted); - task_util_boosted = boosted_task_util(p); - for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) { - int cur_capacity; - struct rq *rq; - int idle_idx; + if (cpu < 0) + return target_cpu; - /* - * Iterate from higher cpus for boosted tasks. - */ - int i = boosted ?
NR_CPUS-iter_cpu-1 : iter_cpu; + sd = rcu_dereference(per_cpu(sd_ea, cpu)); - if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) - continue; + if (!sd) + return target_cpu; - /* - * p's blocked utilization is still accounted for on prev_cpu - * so prev_cpu will receive a negative bias due to the double - * accounting. However, the blocked utilization may be zero. - */ - new_util = cpu_util(i) + task_util_boosted; + sg = sd->groups; - /* - * Ensure minimum capacity to grant the required boost. - * The target CPU can be already at a capacity level higher - * than the one required to boost the task. - */ - if (new_util > capacity_orig_of(i)) - continue; + min_util = boosted_task_util(p); + + do { + int i; + + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + int cur_capacity; + struct rq *rq; + int idle_idx; + + if (!cpu_online(i)) + continue; + + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + new_util = cpu_util(i) + task_util(p); + + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ + new_util = max(min_util, new_util); + if (new_util > capacity_orig_of(i)) + continue; #ifdef CONFIG_SCHED_WALT - if (walt_cpu_high_irqload(i)) - continue; + if (walt_cpu_high_irqload(i)) + continue; #endif - /* - * Unconditionally favoring tasks that prefer idle cpus to - * improve latency. - */ - if (idle_cpu(i) && prefer_idle) { - if (best_idle_cpu < 0) - best_idle_cpu = i; - continue; - } - cur_capacity = capacity_curr_of(i); - rq = cpu_rq(i); - idle_idx = idle_get_state_idx(rq); - - if (new_util < cur_capacity) { - if (cpu_rq(i)->nr_running) { - if (prefer_idle) { - /* Find a target cpu with highest - * utilization. - */ - if (target_util == 0 || - target_util < new_util) { - target_cpu = i; - target_util = new_util; - } - } else { - /* Find a target cpu with lowest - * utilization. - */ - if (target_util == 0 || - target_util > new_util) { - target_cpu = i; - target_util = new_util; - } - } - } else if (!prefer_idle) { - if (best_idle_cpu < 0 || - (sysctl_sched_cstate_aware && - best_idle_cstate > idle_idx)) { - best_idle_cstate = idle_idx; + /* + * Unconditionally favoring tasks that prefer idle cpus to + * improve latency. + */ + if (idle_cpu(i) && prefer_idle) { + if (best_idle_cpu < 0) best_idle_cpu = i; - } + continue; + } + + cur_capacity = capacity_curr_of(i); + rq = cpu_rq(i); + idle_idx = idle_get_state_idx(rq); + + if (new_util < cur_capacity) { + if (cpu_rq(i)->nr_running) { + if (!prefer_idle) { + /* Find a target cpu with highest + * utilization. + */ + if (target_util == 0 || + target_util < new_util) { + target_cpu = i; + target_util = new_util; + } + } else { + /* Find a target cpu with lowest + * utilization. + */ + if (target_util == 0 || + target_util > new_util) { + target_cpu = i; + target_util = new_util; + } + } + } else if (!prefer_idle) { + if (best_idle_cpu < 0 || + (sysctl_sched_cstate_aware && + best_idle_cstate > idle_idx)) { + best_idle_cstate = idle_idx; + best_idle_cpu = i; + } + } + } else if (backup_capacity == 0 || + backup_capacity > cur_capacity) { + // Find a backup cpu with least capacity. + backup_capacity = cur_capacity; + backup_cpu = i; } - } else if (backup_capacity == 0 || - backup_capacity > cur_capacity) { - // Find a backup cpu with least capacity. 
- backup_capacity = cur_capacity; - backup_cpu = i; } - } + } while (sg = sg->next, sg != sd->groups); if (prefer_idle && best_idle_cpu >= 0) target_cpu = best_idle_cpu; From b31ae71ef75ee5a33cb96e829a492f2f1c2ce810 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sat, 7 Jan 2017 14:33:51 +0000 Subject: [PATCH 0848/3220] sched/fair: refactor find_best_target() for simplicity Simplify backup_capacity handling and use 'unsigned long' data type for cpu capacity, simplify target_util handling, simplify idle_idx handling & refactor min_util, new_util. Also return first idle cpu for prefer_idle task immediately. Change-Id: Ic89e140f7b369f3965703fdc8463013d16e9b94a Signed-off-by: Dietmar Eggemann Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 60 +++++++++++++++------------------------------ 1 file changed, 20 insertions(+), 40 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d9c732de3295..270091323ae8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5756,13 +5756,12 @@ static int start_cpu(bool boosted) static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) { int target_cpu = -1; - int target_util = 0; - int backup_capacity = 0; + unsigned long target_util = prefer_idle ? ULONG_MAX : 0; + unsigned long backup_capacity = ULONG_MAX; int best_idle_cpu = -1; int best_idle_cstate = INT_MAX; int backup_cpu = -1; - unsigned long min_util; - unsigned long new_util; + unsigned long min_util = boosted_task_util(p); struct sched_domain *sd; struct sched_group *sg; int cpu = start_cpu(boosted); @@ -5777,15 +5776,11 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre sg = sd->groups; - min_util = boosted_task_util(p); - do { int i; for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { - int cur_capacity; - struct rq *rq; - int idle_idx; + unsigned long cur_capacity, new_util; if (!cpu_online(i)) continue; @@ -5803,6 +5798,7 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre * than the one required to boost the task. */ new_util = max(min_util, new_util); + if (new_util > capacity_orig_of(i)) continue; @@ -5815,38 +5811,25 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre * Unconditionally favoring tasks that prefer idle cpus to * improve latency. */ - if (idle_cpu(i) && prefer_idle) { - if (best_idle_cpu < 0) - best_idle_cpu = i; - continue; - } + if (idle_cpu(i) && prefer_idle) + return i; cur_capacity = capacity_curr_of(i); - rq = cpu_rq(i); - idle_idx = idle_get_state_idx(rq); if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { - if (!prefer_idle) { - /* Find a target cpu with highest - * utilization. - */ - if (target_util == 0 || - target_util < new_util) { - target_cpu = i; - target_util = new_util; - } - } else { - /* Find a target cpu with lowest - * utilization. - */ - if (target_util == 0 || - target_util > new_util) { - target_cpu = i; - target_util = new_util; - } + /* + * Find a target cpu with the lowest/highest + * utilization if prefer_idle/!prefer_idle. 
+ */ + if ((prefer_idle && target_util > new_util) || + (!prefer_idle && target_util < new_util)) { + target_util = new_util; + target_cpu = i; } } else if (!prefer_idle) { + int idle_idx = idle_get_state_idx(cpu_rq(i)); + if (best_idle_cpu < 0 || (sysctl_sched_cstate_aware && best_idle_cstate > idle_idx)) { @@ -5854,18 +5837,15 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre best_idle_cpu = i; } } - } else if (backup_capacity == 0 || - backup_capacity > cur_capacity) { - // Find a backup cpu with least capacity. + } else if (backup_capacity > cur_capacity) { + /* Find a backup cpu with least capacity. */ backup_capacity = cur_capacity; backup_cpu = i; } } } while (sg = sg->next, sg != sd->groups); - if (prefer_idle && best_idle_cpu >= 0) - target_cpu = best_idle_cpu; - else if (target_cpu < 0) + if (target_cpu < 0) target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu; return target_cpu; From 4e18c8a10de0c4d435dce95e526ecbe97c77d5c5 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 16 Jan 2017 12:42:59 +0000 Subject: [PATCH 0849/3220] sched/fair: Simplify idle_idx handling in select_idle_sibling() Rename best_idle to best_idle_cpu so the same name is used like in find_best_target(). Fix if (best_idle > 0) since best_idle_cpu = 0 is a valid target. Use 'unsigned long' data type for best_idle_capacity. Since we're looking for the shallowest best_idle_cstate initialize best_idle_cstate = INT_MAX. For cpus which are not idle (idle_idx = -1) the condition 'if (idle_idx < best_idle_cstate && ...)' is never executed. Change-Id: Ic5b63d58478696b3d1ec6253cf739a69a574cf99 Signed-off-by: Dietmar Eggemann (cherry picked from commit 8bff5e9c0968108d465e1f2a4624fc5ec2f00849) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 270091323ae8..e77917b23c79 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5677,9 +5677,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; struct sched_group *sg; - int best_idle = -1; - int best_idle_cstate = -1; - int best_idle_capacity = INT_MAX; + int best_idle_cpu = -1; + int best_idle_cstate = INT_MAX; + unsigned long best_idle_capacity = ULONG_MAX; if (!sysctl_sched_cstate_aware) { if (idle_cpu(target)) @@ -5706,18 +5706,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (sysctl_sched_cstate_aware) { for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { - struct rq *rq = cpu_rq(i); - int idle_idx = idle_get_state_idx(rq); + int idle_idx = idle_get_state_idx(cpu_rq(i)); unsigned long new_usage = boosted_task_util(p); unsigned long capacity_orig = capacity_orig_of(i); + if (new_usage > capacity_orig || !idle_cpu(i)) goto next; if (i == target && new_usage <= capacity_curr_of(target)) return target; - if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) { - best_idle = i; + if (idle_idx < best_idle_cstate && + capacity_orig <= best_idle_capacity) { + best_idle_cpu = i; best_idle_cstate = idle_idx; best_idle_capacity = capacity_orig; } @@ -5736,8 +5737,9 @@ next: sg = sg->next; } while (sg != sd->groups); } - if (best_idle > 0) - target = best_idle; + + if (best_idle_cpu >= 0) + target = best_idle_cpu; done: return target; From 9de438d27c43863dabf9db696fecbb90bc5c91eb Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Wed, 30 Mar 2016 04:30:56 
+0800 Subject: [PATCH 0850/3220] BACKPORT: sched/fair: Initiate a new task's util avg to a bounded value A new task's util_avg is set to full utilization of a CPU (100% time running). This accelerates a new task's utilization ramp-up, useful to boost its execution in early time. However, it may result in (insanely) high utilization for a transient time period when a flood of tasks are spawned. Importantly, it violates the "fundamentally bounded" CPU utilization, and its side effect is negative if we don't take any measure to bound it. This patch proposes an algorithm to address this issue. It has two methods to approach a sensible initial util_avg: (1) An expected (or average) util_avg based on its cfs_rq's util_avg: util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight (2) A trajectory of how successive new tasks' util develops, which gives 1/2 of the left utilization budget to a new task such that the additional util is noticeably large (when overall util is low) or unnoticeably small (when overall util is high enough). In the meantime, the aggregate utilization is well bounded: util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n where n denotes the nth task. If util_avg is larger than util_avg_cap, then the effective util is clamped to the util_avg_cap. Change-Id: Idafe989b24d9e70911666f09800bf1d5a011e1f4 Reported-by: Andrey Ryabinin Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: steve.muckle@linaro.org Link: http://lkml.kernel.org/r/1459283456-21682-1-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar (cherry picked from commit 2b8c41daba327c633228169e8bd8ec067ab443f8) [integrate with schedfreq - schedfreq has a tuneable for init task util but this commit removes the use of the tuneable since we have a new algorithm for calculating an initial utilisation. I've left the tuneable in place, but it is no longer used even when schedfreq is the CPUFreq governor] Signed-off-by: Chris Redpath --- kernel/sched/core.c | 2 ++ kernel/sched/fair.c | 73 +++++++++++++++++++++++++++++++++++++++++--- kernel/sched/sched.h | 1 + 3 files changed, 72 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 495bc41907d6..0daa14cd0d32 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2450,6 +2450,8 @@ void wake_up_new_task(struct task_struct *p) */ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif + /* Post initialize new task's util average when its cfs_rq is set */ + post_init_entity_util_avg(&p->se); rq = __task_rq_lock(p); walt_mark_task_starting(p); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e77917b23c79..bb43c742a520 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -701,17 +701,81 @@ void init_entity_runnable_average(struct sched_entity *se) sa->period_contrib = 1023; sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = sched_freq() ? - sysctl_sched_initial_task_util : - scale_load_down(SCHED_LOAD_SCALE); - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + /* + * In previous Android versions, we used to have: + * sa->util_avg = sched_freq() ? + * sysctl_sched_initial_task_util : + * scale_load_down(SCHED_LOAD_SCALE); + * sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + * However, that functionality has been moved to enqueue. 
+ * It is unclear if we should restore this in enqueue. + */ + /* + * At this point, util_avg won't be used in select_task_rq_fair anyway + */ + sa->util_avg = 0; + sa->util_sum = 0; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +/* + * With new tasks being created, their initial util_avgs are extrapolated + * based on the cfs_rq's current util_avg: + * + * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * + * However, in many cases, the above util_avg does not give a desired + * value. Moreover, the sum of the util_avgs may be divergent, such + * as when the series is a harmonic series. + * + * To solve this problem, we also cap the util_avg of successive tasks to + * only 1/2 of the left utilization budget: + * + * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n + * + * where n denotes the nth task. + * + * For example, a simplest series from the beginning would be like: + * + * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... + * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... + * + * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) + * if util_avg > util_avg_cap. + */ +void post_init_entity_util_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct sched_avg *sa = &se->avg; + long cap = (long)(scale_load_down(SCHED_LOAD_SCALE) - cfs_rq->avg.util_avg) / 2; + + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + /* + * If we wish to restore tuning via setting initial util, + * this is where we should do it. + */ + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + } +} + +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { } +void post_init_entity_util_avg(struct sched_entity *se) +{ +} #endif /* @@ -9409,6 +9473,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + post_init_entity_util_avg(se); } return 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2051fecdb9e5..50b9229d47ab 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1391,6 +1391,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +extern void post_init_entity_util_avg(struct sched_entity *se); static inline void __add_nr_running(struct rq *rq, unsigned count) { From 3fd734a8f9b6703e8f0c6e35280e7907665ef6f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Jun 2016 15:07:50 +0200 Subject: [PATCH 0851/3220] UPSTREAM: sched/fair: Fix post_init_entity_util_avg() serialization Chris Wilson reported a divide by 0 at: post_init_entity_util_avg(): > 725 if (cfs_rq->avg.util_avg != 0) { > 726 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; > -> 727 sa->util_avg /= (cfs_rq->avg.load_avg + 1); > 728 > 729 if (sa->util_avg > cap) > 730 sa->util_avg = cap; > 731 } else { Which given the lack of serialization, and the code generated from update_cfs_rq_load_avg() is entirely possible: if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = 
atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sa->load_avg = max_t(long, sa->load_avg - r, 0); sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); removed_load = 1; } turns into: ffffffff81087064: 49 8b 85 98 00 00 00 mov 0x98(%r13),%rax ffffffff8108706b: 48 85 c0 test %rax,%rax ffffffff8108706e: 74 40 je ffffffff810870b0 ffffffff81087070: 4c 89 f8 mov %r15,%rax ffffffff81087073: 49 87 85 98 00 00 00 xchg %rax,0x98(%r13) ffffffff8108707a: 49 29 45 70 sub %rax,0x70(%r13) ffffffff8108707e: 4c 89 f9 mov %r15,%rcx ffffffff81087081: bb 01 00 00 00 mov $0x1,%ebx ffffffff81087086: 49 83 7d 70 00 cmpq $0x0,0x70(%r13) ffffffff8108708b: 49 0f 49 4d 70 cmovns 0x70(%r13),%rcx Which you'll note ends up with 'sa->load_avg - r' in memory at ffffffff8108707a. By calling post_init_entity_util_avg() under rq->lock we're sure to be fully serialized against PELT updates and cannot observe intermediate state like this. Change-Id: I56c11886102b7859df82e26c88b1b7c200a39f6e Reported-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrey Ryabinin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yuyang Du Cc: bsegall@google.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: steve.muckle@linaro.org Fixes: 2b8c41daba32 ("sched/fair: Initiate a new task's util avg to a bounded value") Link: http://lkml.kernel.org/r/20160609130750.GQ30909@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar (cherry picked from commit b7fa30c9cc48c4f55663420472505d3b4f6e1705) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 3 +-- kernel/sched/fair.c | 8 +++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0daa14cd0d32..00e969d7b2ad 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2450,10 +2450,9 @@ void wake_up_new_task(struct task_struct *p) */ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif - /* Post initialize new task's util average when its cfs_rq is set */ + rq = __task_rq_lock(p); post_init_entity_util_avg(&p->se); - rq = __task_rq_lock(p); walt_mark_task_starting(p); activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bb43c742a520..04dea070fc6e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9444,8 +9444,9 @@ void free_fair_sched_group(struct task_group *tg) int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { - struct cfs_rq *cfs_rq; struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -9460,6 +9461,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) @@ -9473,7 +9476,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + + raw_spin_lock_irq(&rq->lock); post_init_entity_util_avg(se); + raw_spin_unlock_irq(&rq->lock); } return 1; From dc1386b6f78755f294f19075ddad6501570a9dee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jun 2016 14:27:50 +0200 Subject: [PATCH 0852/3220] UPSTREAM: sched/fair: Apply more PELT fixes One additional 'rule' for using update_cfs_rq_load_avg() is that one should call 
update_tg_load_avg() if it returns true. Add a bunch of comments to hopefully clarify some of the rules: o You need to update cfs_rq _before_ any entity attach/detach, this is important, because while for mathematical consistency this isn't strictly needed, it is required for the physical interpretation of the model, you attach/detach _now_. o When you modify the cfs_rq avg, you have to then call update_tg_load_avg() in order to propagate changes upwards. o (Fair) entities are always attached, switched_{to,from}_fair() deal with !fair. This directly follows from the definition of the cfs_rq averages, namely that they are a direct sum of all (runnable or blocked) entities on that rq. It is the second rule that this patch enforces, but it adds comments pertaining to all of them. Change-Id: Icdc906e98c67b84cb9582c893bc761a9886be57a Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit 3d30544f02120b884bba2a9466c87dba980e3be5) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 85 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 04dea070fc6e..8459caa2e8df 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -718,6 +718,11 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); + /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: @@ -747,7 +752,9 @@ void post_init_entity_util_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; - long cap = (long)(scale_load_down(SCHED_LOAD_SCALE) - cfs_rq->avg.util_avg) / 2; + long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -765,6 +772,29 @@ void post_init_entity_util_avg(struct sched_entity *se) */ sa->util_sum = sa->util_avg * LOAD_AVG_MAX; } + + if (entity_is_task(se)) { + struct task_struct *p = task_of(se); + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. + */ + se->avg.last_update_time = now; + return; + } + } + + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); } static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); @@ -776,7 +806,10 @@ void init_entity_runnable_average(struct sched_entity *se) { } void post_init_entity_util_avg(struct sched_entity *se) { } -#endif +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +{ +} +#endif /* CONFIG_SMP */ /* * Update the current task's runtime statistics.
@@ -2823,9 +2856,25 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); WRITE_ONCE(*ptr, res); \ } while (0) -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, - bool update_freq) +/** + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages + * @now: current time, as per cfs_rq_clock_task() + * @cfs_rq: cfs_rq to update + * @update_freq: should we call cfs_rq_util_change() or will the call do so + * + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) + * avg. The immediate corollary is that all (fair) tasks must be attached, see + * post_init_entity_util_avg(). + * + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. + * + * Returns true if the load decayed or we removed utilization. It is expected + * that one calls update_tg_load_avg() on this condition, but after you've + * modified the cfs_rq avg (attach/detach), such that we propagate the new + * avg up. + */ +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { struct sched_avg *sa = &cfs_rq->avg; int decayed, removed = 0, removed_util = 0; @@ -2884,6 +2933,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) trace_sched_load_avg_task(task_of(se), &se->avg); } +/** + * attach_entity_load_avg - attach this entity to its cfs_rq load avg + * @cfs_rq: cfs_rq to attach to + * @se: sched_entity to attach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (!sched_feat(ATTACH_AGE_LOAD)) @@ -2913,6 +2970,14 @@ skip_aging: cfs_rq_util_change(cfs_rq); } +/** + * detach_entity_load_avg - detach this entity from its cfs_rq load avg + * @cfs_rq: cfs_rq to detach from + * @se: sched_entity to detach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. 
+ */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), @@ -9322,6 +9387,8 @@ static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; if (!vruntime_normalized(p)) { /* @@ -9333,13 +9400,18 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); } static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -9350,7 +9422,10 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; From f9bef52c8505c29b24bff420dba15cea5ee581fe Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Jul 2016 10:56:25 +0200 Subject: [PATCH 0853/3220] UPSTREAM: sched/fair: Improve PELT stuff some more Vincent noted that the update_tg_load_avg() usage in commit: 3d30544f0212 ("sched/fair: Apply more PELT fixes") isn't entirely sufficient. We need to call this function every time cfs_rq->avg.load changes, this includes when update_cfs_rq_load_avg() returns true, but {attach,detach}_entity_load_avg() themselves also change it. This means we need to unconditionally call update_tg_load_avg(). Also, add more comments. Change-Id: I7e55fceb587601f73c760c8b0d47a7ef2b777b9e Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit 7c3edd2c300b7ef2005a69dc727692ee07434aa5) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8459caa2e8df..3f2606842d02 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -754,7 +754,6 @@ void post_init_entity_util_avg(struct sched_entity *se) struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -791,10 +790,9 @@ void post_init_entity_util_avg(struct sched_entity *se) } } - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); } static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); @@ -2796,9 +2794,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, } #ifdef CONFIG_FAIR_GROUP_SCHED -/* - * Updating tg's load_avg is necessary before update_cfs_share (which is done) - * and effective_load (which is not done because it is too costly). 
+/** + * update_tg_load_avg - update the tg's load avg + * @cfs_rq: the cfs_rq whose avg changed + * @force: update regardless of how small the difference + * + * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. + * However, because tg->load_avg is a global value there are performance + * considerations. + * + * In order to avoid having to look at the other cfs_rq's, we use a + * differential update where we store the last value we propagated. This in + * turn allows skipping updates if the differential is 'small'. + * + * Updating tg's load_avg is necessary before update_cfs_share() (which is + * done) and effective_load() (which is not done because it is too costly). */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { @@ -2868,10 +2878,10 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * - * Returns true if the load decayed or we removed utilization. It is expected - * that one calls update_tg_load_avg() on this condition, but after you've - * modified the cfs_rq avg (attach/detach), such that we propagate the new - * avg up. + * Returns true if the load decayed or we removed load. + * + * Since both these conditions indicate a changed cfs_rq->avg.load we should + * call update_tg_load_avg() when this function returns true. */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) @@ -9388,7 +9398,6 @@ static void detach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; if (!vruntime_normalized(p)) { /* @@ -9400,10 +9409,9 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); } static void attach_task_cfs_rq(struct task_struct *p) @@ -9411,7 +9419,6 @@ static void attach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -9422,10 +9429,9 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; From 18d09a45eceb888e6cdc62b9e6beaa7bf5e15045 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:42 +0100 Subject: [PATCH 0854/3220] UPSTREAM: sched/fair: Factorize attach/detach entity Factorize post_init_entity_util_avg() and part of attach_task_cfs_rq() in one function attach_entity_cfs_rq(). Create symmetric detach_entity_cfs_rq() function. 
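For orientation, the two factored helpers pair the same three-step load-tracking sequence around the attach and detach operations. A sketch of the post-patch shape, matching the diff below (the CONFIG_FAIR_GROUP_SCHED depth fixup in the attach path is omitted here):

    static void detach_entity_cfs_rq(struct sched_entity *se)
    {
            struct cfs_rq *cfs_rq = cfs_rq_of(se);
            u64 now = cfs_rq_clock_task(cfs_rq);

            /* Catch up with the cfs_rq and remove our load when we leave */
            update_cfs_rq_load_avg(now, cfs_rq, false);
            detach_entity_load_avg(cfs_rq, se);
            update_tg_load_avg(cfs_rq, false);
    }

    static void attach_entity_cfs_rq(struct sched_entity *se)
    {
            struct cfs_rq *cfs_rq = cfs_rq_of(se);
            u64 now = cfs_rq_clock_task(cfs_rq);

            /* Symmetric: sync the cfs_rq, add our load, update the tg */
            update_cfs_rq_load_avg(now, cfs_rq, false);
            attach_entity_load_avg(cfs_rq, se);
            update_tg_load_avg(cfs_rq, false);
    }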
Change-Id: I44fc6bb5e71460be65f6b8928d4620c6c27a6a67 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit df217913e72ec7e603d8b68cc4c70646cf7000db) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 53 ++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3f2606842d02..58b84c8d7071 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -719,9 +719,7 @@ void init_entity_runnable_average(struct sched_entity *se) } static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); +static void attach_entity_cfs_rq(struct sched_entity *se); /* * With new tasks being created, their initial util_avgs are extrapolated @@ -753,7 +751,6 @@ void post_init_entity_util_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; - u64 now = cfs_rq_clock_task(cfs_rq); if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -785,14 +782,12 @@ void post_init_entity_util_avg(struct sched_entity *se) * such that the next switched_to_fair() has the * expected state. */ - se->avg.last_update_time = now; + se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); return; } } - update_cfs_rq_load_avg(now, cfs_rq, false); - attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + attach_entity_cfs_rq(se); } static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); @@ -9393,30 +9388,19 @@ static inline bool vruntime_normalized(struct task_struct *p) return false; } -static void detach_task_cfs_rq(struct task_struct *p) +static void detach_entity_cfs_rq(struct sched_entity *se) { - struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - if (!vruntime_normalized(p)) { - /* - * Fix up our vruntime so that the current sleep doesn't - * cause 'unlimited' sleep bonus. - */ - place_entity(cfs_rq, se, 0); - se->vruntime -= cfs_rq->min_vruntime; - } - /* Catch up with the cfs_rq and remove our load when we leave */ update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); } -static void attach_task_cfs_rq(struct task_struct *p) +static void attach_entity_cfs_rq(struct sched_entity *se) { - struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); @@ -9428,10 +9412,35 @@ static void attach_task_cfs_rq(struct task_struct *p) se->depth = se->parent ? 
se->parent->depth + 1 : 0; #endif - /* Synchronize task with its cfs_rq */ + /* Synchronize entity with its cfs_rq */ update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); +} + +static void detach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!vruntime_normalized(p)) { + /* + * Fix up our vruntime so that the current sleep doesn't + * cause 'unlimited' sleep bonus. + */ + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } + + detach_entity_cfs_rq(se); +} + +static void attach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + attach_entity_cfs_rq(se); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; From 723dab78719fea22340cc06992441d05a50b55e5 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:44 +0100 Subject: [PATCH 0855/3220] BACKPORT: sched/fair: Factorize PELT update Every time we modify load/utilization of sched_entity, we start to sync it with its cfs_rq. This update is done in different ways: - when attaching/detaching a sched_entity, we update cfs_rq and then we sync the entity with the cfs_rq. - when enqueueing/dequeuing the sched_entity, we update both sched_entity and cfs_rq metrics to now. Use update_load_avg() every time we have to update and sync cfs_rq and sched_entity before changing the state of a sched_entity. Change-Id: Ibde9a7e07ac80e9d5753bb4a0c30dfb3643cc666 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar [backported FROMLIST] Signed-off-by: Andres Oportus (cherry picked from commit d31b1a66cbe0931733583ad9d9e8c6cfd710907d) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 78 ++++++++++++++++----------------------------- 1 file changed, 28 insertions(+), 50 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 58b84c8d7071..411f7749e73d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2916,8 +2916,14 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) return decayed || removed; } +/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 + /* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) +static inline void update_load_avg(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); @@ -2927,11 +2933,13 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - __update_load_avg(now, cpu, &se->avg, + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { + __update_load_avg(now, cpu, &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); + } - if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) + if (update_cfs_rq_load_avg(now, cfs_rq, true) && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); if
(entity_is_task(se)) @@ -2948,24 +2956,6 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (!sched_feat(ATTACH_AGE_LOAD)) - goto skip_aging; - - /* - * If we got migrated (either between CPUs or between cgroups) we'll - * have aged the average right before clearing @last_update_time. - */ - if (se->avg.last_update_time) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, 0, 0, NULL); - - /* - * XXX: we could have just aged the entire load away if we've been - * absent from the fair class for too long. - */ - } - -skip_aging: se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; @@ -2985,9 +2975,6 @@ skip_aging: */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); @@ -3002,34 +2989,20 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_avg *sa = &se->avg; - u64 now = cfs_rq_clock_task(cfs_rq); - int migrated, decayed; - - migrated = !sa->last_update_time; - if (!migrated) { - __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } - - decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; - if (migrated) + if (!sa->last_update_time) { attach_entity_load_avg(cfs_rq, se); - - if (decayed || migrated) update_tg_load_avg(cfs_rq, 0); + } } /* Remove the runnable load generated by se from cfs_rq's runnable load average */ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_load_avg(se, 1); - cfs_rq->runnable_load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); cfs_rq->runnable_load_sum = @@ -3122,11 +3095,16 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { - cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); + return 0; } +#define UPDATE_TG 0x0 +#define SKIP_AGE_LOAD 0x0 + +static inline void update_load_avg(struct sched_entity *se, int not_used1){} static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void @@ -3269,6 +3247,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3344,6 +3323,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. 
*/ update_curr(cfs_rq); + update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); @@ -3434,7 +3414,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); } update_stats_curr_start(cfs_rq, se); @@ -3550,7 +3530,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. */ - update_load_avg(curr, 1); + update_load_avg(curr, UPDATE_TG); update_cfs_shares(cfs_rq); #ifdef CONFIG_SCHED_HRTICK @@ -4479,7 +4459,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); } @@ -4581,7 +4561,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); } @@ -9391,10 +9371,9 @@ static inline bool vruntime_normalized(struct task_struct *p) static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); /* Catch up with the cfs_rq and remove our load when we leave */ - update_cfs_rq_load_avg(now, cfs_rq, false); + update_load_avg(se, 0); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); } @@ -9402,7 +9381,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se) static void attach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -9413,7 +9391,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) #endif /* Synchronize entity with its cfs_rq */ - update_cfs_rq_load_avg(now, cfs_rq, false); + update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); } From 8370e07d82c92981e8cc9df180df6e943db179af Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:43 +0100 Subject: [PATCH 0856/3220] UPSTREAM: sched/fair: Fix hierarchical order in rq->leaf_cfs_rq_list Fix the insertion of cfs_rq in rq->leaf_cfs_rq_list to ensure that a child will always be called before its parent. The hierarchical order in shares update list has been introduced by commit: 67e86250f8ea ("sched: Introduce hierarchal order on shares update list") With the current implementation a child can still be put after its parent. Let's take the example of: root \ b /\ c d* | e* with root -> b -> c already enqueued but not d -> e so the leaf_cfs_rq_list looks like: head -> c -> b -> root -> tail The branch d -> e will be added the first time that they are enqueued, starting with e then d. When e is added, its parent is not already on the list, so e is put at the tail: head -> c -> b -> root -> e -> tail Then, d is added at the head because its parent is already on the list: head -> d -> c -> b -> root -> e -> tail e is not placed at the right position and will be called last, whereas it should be called at the beginning. Because it follows the bottom-up enqueue sequence, we are sure that we will finish by adding either a cfs_rq without a parent or a cfs_rq whose parent is already on the list. We can use this event to detect when we have finished adding a new branch.
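With the tmp_alone_branch cursor that this patch introduces (its mechanism is described in the next paragraph and in the diff), the example above then unfolds correctly. A worked trace, as a sketch:

    start:     head -> c -> b -> root -> tail          (tmp_alone_branch = head)
    enqueue e: parent d is not on the list yet, so e is inserted at
               tmp_alone_branch:
               head -> e -> c -> b -> root -> tail     (tmp_alone_branch = &e)
    enqueue d: parent b is on the list, so d is inserted just before b and
               the branch is complete:
               head -> e -> c -> d -> b -> root -> tail (tmp_alone_branch = head)

The traversal order e, c, d, b, root now visits every child before its parent.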
For the others, whose parents have not been added yet, we have to ensure that they will be added after their children, which were inserted in the steps just before, and after any potential parents that are already in the list. The easiest way is to put the cfs_rq just after the last inserted one and to keep track of it until the branch is fully added. Change-Id: I4fe0b8502ea628c13d14e8e5c5279bce67fb8845 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit 9c2791f936ef5fd04a118b5c284f2c9a95f4a647) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 1 + kernel/sched/fair.c | 54 ++++++++++++++++++++++++++++++++++++++------ kernel/sched/sched.h | 1 + 3 files changed, 49 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 00e969d7b2ad..defc88cda733 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7814,6 +7814,7 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* * How much cpu bandwidth does root_task_group get? * diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 411f7749e73d..15ccfbff1bde 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -305,19 +305,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Ensure we either appear before our parent (if already * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases. + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases and a special case for the root + * cfs_rq. Furthermore, it also means that we will always reset + * tmp_alone_branch either when the branch is connected + * to a tree or when we reach the beginning of the tree */ if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); - } else { + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { + /* + * If the parent is already on the list, we add the child + * just before. Thanks to the circular linked property of + * the list, this means putting the child at the tail + * of the list that starts at the parent. + */ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + /* + * The branch is now connected to its tree so we can + * reset tmp_alone_branch to the beginning of the + * list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else if (!cfs_rq->tg->parent) { + /* + * A cfs_rq without a parent should be put + * at the tail of the list. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq->leaf_cfs_rq_list); + /* + * We have reached the beginning of a tree so we can reset + * tmp_alone_branch to the beginning of the list.
+ */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else { + /* + * The parent has not been added yet so we want to + * make sure that it will be put after us. + * tmp_alone_branch points to the beginning of the branch + * where we will add the parent. + */ + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + rq->tmp_alone_branch); + /* + * Update tmp_alone_branch to point to the new beginning + * of the branch + */ + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; } cfs_rq->on_list = 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 50b9229d47ab..5660e8495ae2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -621,6 +621,7 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* From e875665411983d39589f0ba5416e0b99c6e0d2cb Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:45 +0100 Subject: [PATCH 0857/3220] UPSTREAM: sched/fair: Propagate load during synchronous attach/detach When a task moves from/to a cfs_rq, we set a flag which is then used to propagate the change at parent level (sched_entity and cfs_rq) during the next update. If the cfs_rq is throttled, the flag will stay pending until the cfs_rq is unthrottled. For propagating the utilization, we copy the utilization of group cfs_rq to the sched_entity. For propagating the load, we have to take into account the load of the whole task group in order to evaluate the load of the sched_entity. Similarly to what was done before the rewrite of PELT, we add a correction factor in case the task group's load is greater than its share so it will contribute the same load as a task of equal weight. Change-Id: Id34a9888484716961c9027299c0b4d82881a39d1 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-5-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit 09a43ace1f986b003c118fdf6ddf1fd685692d49) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 240 ++++++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 1 + 2 files changed, 240 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 15ccfbff1bde..a8f86f214b45 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2828,6 +2828,26 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return decayed; } +/* + * Signed add and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define add_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(_val) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + \ + res = var + val; \ + \ + if (val < 0 && res > var) \ + res = 0; \ + \ + WRITE_ONCE(*ptr, res); \ +} while (0) + #ifdef CONFIG_FAIR_GROUP_SCHED /** * update_tg_load_avg - update the tg's load avg @@ -2849,14 +2869,196 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + /* + * No need to update load_avg for root_task_group as it is not used.
+ */ + if (cfs_rq->tg == &root_task_group) + return; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } } +/* + * Called within set_task_rq() right before setting a task's cpu. The + * caller only guarantees p->pi_lock is held; no other assumptions, + * including the state of rq->lock, should be made. + */ +void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) +{ + if (!sched_feat(ATTACH_AGE_LOAD)) + return; + + /* + * We are supposed to update the task to "current" time, then it's up to + * date and ready to go to new CPU/cfs_rq. But we have difficulty in + * getting what current time is, so simply throw away the out-of-date + * time. This will result in the wakee task being less decayed, but giving + * the wakee more load is not a bad thing. + */ + if (se->avg.last_update_time && prev) { + u64 p_last_update_time; + u64 n_last_update_time; + +#ifndef CONFIG_64BIT + u64 p_last_update_time_copy; + u64 n_last_update_time_copy; + + do { + p_last_update_time_copy = prev->load_last_update_time_copy; + n_last_update_time_copy = next->load_last_update_time_copy; + + smp_rmb(); + + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; + + } while (p_last_update_time != p_last_update_time_copy || + n_last_update_time != n_last_update_time_copy); +#else + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; +#endif + __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), + &se->avg, 0, 0, NULL); + se->avg.last_update_time = n_last_update_time; + } +} + +/* Take into account change of utilization of a child task group */ +static inline void +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's utilization */ + se->avg.util_avg = gcfs_rq->avg.util_avg; + se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq utilization */ + add_positive(&cfs_rq->avg.util_avg, delta); + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; +} + +/* Take into account change of load of a child task group */ +static inline void +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta, load = gcfs_rq->avg.load_avg; + + /* + * If the load of group cfs_rq is null, the load of the + * sched_entity will also be null so we can skip the formula + */ + if (load) { + long tg_load; + + /* Get tg's load and ensure tg_load > 0 */ + tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; + + /* Ensure tg_load >= load and updated with current load */ + tg_load -= gcfs_rq->tg_load_avg_contrib; + tg_load += load; + + /* + * We need to compute a correction term in the case that the + * task group is consuming more CPU than a task of equal + * weight. A task with a weight equal to tg->shares will have + * a load less than or equal to scale_load_down(tg->shares). + * Similarly, the sched_entities that represent the task group + * at parent level can't have a load higher than + * scale_load_down(tg->shares). And the Sum of sched_entities' + * load must be <= scale_load_down(tg->shares).
+ */ + if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { + /* scale gcfs_rq's load into tg's shares*/ + load *= scale_load_down(gcfs_rq->tg->shares); + load /= tg_load; + } + } + + delta = load - se->avg.load_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's load */ + se->avg.load_avg = load; + se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq load */ + add_positive(&cfs_rq->avg.load_avg, delta); + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; + + /* + * If the sched_entity is already enqueued, we also have to update the + * runnable load avg. + */ + if (se->on_rq) { + /* Update parent cfs_rq runnable_load_avg */ + add_positive(&cfs_rq->runnable_load_avg, delta); + cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; + } +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) +{ + cfs_rq->propagate_avg = 1; +} + +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + + if (!cfs_rq->propagate_avg) + return 0; + + cfs_rq->propagate_avg = 0; + return 1; +} + +/* Update task and its cfs_rq load average */ +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + if (entity_is_task(se)) + return 0; + + if (!test_and_clear_tg_cfs_propagate(se)) + return 0; + + cfs_rq = cfs_rq_of(se); + + set_tg_cfs_propagate(cfs_rq); + + update_tg_cfs_util(cfs_rq, se); + update_tg_cfs_load(cfs_rq, se); + + return 1; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ + static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} + +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + return 0; +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} + #endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) @@ -2968,6 +3170,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); int cpu = cpu_of(rq_of(cfs_rq)); + int decayed; /* * Track task load average for carrying it to new CPU after migrated, and @@ -2979,7 +3182,10 @@ static inline void update_load_avg(struct sched_entity *se, int flags) cfs_rq->curr == se, NULL); } - if (update_cfs_rq_load_avg(now, cfs_rq, true) && (flags & UPDATE_TG)) + decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed |= propagate_entity_load_avg(se); + + if (decayed && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); if (entity_is_task(se)) @@ -3001,6 +3207,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); } @@ -3020,6 +3227,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); } @@ -9408,6 +9616,31 @@ static inline bool vruntime_normalized(struct task_struct *p) return false; } +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Propagate the changes of the sched_entity across the tg tree to make it + * visible to the root + */ +static void propagate_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; 
+ + /* Start to propagate at parent */ + se = se->parent; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + break; + + update_load_avg(se, UPDATE_TG); + } +} +#else +static void propagate_entity_cfs_rq(struct sched_entity *se) { } +#endif + static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -9416,6 +9649,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se) update_load_avg(se, 0); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); } static void attach_entity_cfs_rq(struct sched_entity *se) @@ -9434,6 +9668,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); } static void detach_task_cfs_rq(struct task_struct *p) @@ -9512,6 +9747,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->propagate_avg = 0; +#endif atomic_long_set(&cfs_rq->removed_load_avg, 0); atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5660e8495ae2..d0971e5c9269 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -376,6 +376,7 @@ struct cfs_rq { unsigned long runnable_load_avg; #ifdef CONFIG_FAIR_GROUP_SCHED unsigned long tg_load_avg_contrib; + unsigned long propagate_avg; #endif atomic_long_t removed_load_avg, removed_util_avg; #ifndef CONFIG_64BIT From 89e4d18a6712b6452c577776abb8536913b6c1fb Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:46 +0100 Subject: [PATCH 0858/3220] UPSTREAM: sched/fair: Propagate asynchronous detach A task can be asynchronously detached from cfs_rq when migrating between CPUs. The load of the migrated task is then removed from the source cfs_rq during its next update. We use this event to set the propagation flag. During the load balance, we take advantage of the update of blocked load to propagate any pending changes. The propagation relies on patch: "sched: Fix hierarchical order in rq->leaf_cfs_rq_list" ... which orders children and parents, to ensure that it's done in one pass.
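The resulting one-pass propagation in update_blocked_averages() then looks as follows, as a sketch matching the diff below (the leaf list walk visits each child before its parent):

    for_each_leaf_cfs_rq(rq, cfs_rq) {
            /* Picks up asynchronously removed load and sets the flag */
            if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
                    update_tg_load_avg(cfs_rq, 0);

            /* Propagate pending load changes to the parent, visited later */
            if (cfs_rq->tg->se[cpu])
                    update_load_avg(cfs_rq->tg->se[cpu], 0);
    }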
Change-Id: I33782e35fc4711f5901e8c23d6aa7ec5f2ff7ee5 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-6-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit 4e5160766fcc9f41bbd38bac11f92dce993644aa) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a8f86f214b45..e49408d0f590 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3131,6 +3131,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed = 1; + set_tg_cfs_propagate(cfs_rq); } if (atomic_long_read(&cfs_rq->removed_util_avg)) { @@ -3138,6 +3139,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); removed_util = 1; + set_tg_cfs_propagate(cfs_rq); } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -7347,6 +7349,10 @@ static void update_blocked_averages(int cpu) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); + + /* Propagate pending load changes to the parent */ + if (cfs_rq->tg->se[cpu]) + update_load_avg(cfs_rq->tg->se[cpu], 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); } From 640c909c3470b8b8882676754485d5ee0ccccf7d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Jun 2016 15:53:54 +0200 Subject: [PATCH 0859/3220] UPSTREAM: sched/fair: Fix effective_load() to consistently use smoothed load Starting with the following commit: fde7d22e01aa ("sched/fair: Fix overly small weight for interactive group entities") calc_tg_weight() doesn't compute the right value as expected by effective_load(). The difference is in the 'correction' term. In order to ensure \Sum rw_j >= rw_i we cannot use tg->load_avg directly, since that might be lagging a correction on the current cfs_rq->avg.load_avg value. Therefore we use tg->load_avg - cfs_rq->tg_load_avg_contrib + cfs_rq->avg.load_avg. Now, per the referenced commit, calc_tg_weight() doesn't use cfs_rq->avg.load_avg, as is later used in @w, but uses cfs_rq->load.weight instead. So stop using calc_tg_weight() and do it explicitly. The effects of this bug are wake_affine() making randomly poor choices in cgroup-intense workloads. 
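Spelled out, the corrected term reads as follows; this is a sketch of the upstream computation described above, since the trimmed backport diff below only removes two now-unused forward declarations:

    /*
     * tg->load_avg may lag a correction for this cfs_rq, so subtract the
     * stale contribution and add back the current smoothed load, which
     * guarantees \Sum rw_j >= rw_i:
     */
    tg_weight  = atomic_long_read(&tg->load_avg);
    tg_weight -= cfs_rq->tg_load_avg_contrib;
    tg_weight += cfs_rq->avg.load_avg;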
Change-Id: I1c0058ff674650cf295c8dc3b88a5a3de4bddab0 Signed-off-by: Peter Zijlstra (Intel) Cc: # v4.3+ Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: fde7d22e01aa ("sched/fair: Fix overly small weight for interactive group entities") Signed-off-by: Ingo Molnar (cherry picked from commit 7dd4912594daf769a46744848b05bd5bc6d62469) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e49408d0f590..df6fc28f80c7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -830,8 +830,6 @@ void post_init_entity_util_avg(struct sched_entity *se) attach_entity_cfs_rq(se); } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); -static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { From 20bbd92679ce1722c0b037b6eb49809a1c9cfd71 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 19 Oct 2016 14:45:23 +0200 Subject: [PATCH 0860/3220] UPSTREAM: sched/fair: Fix incorrect task group ->load_avg A scheduler performance regression has been reported by Joseph Salisbury, which he bisected back to: 3d30544f0212 ("sched/fair: Apply more PELT fixes") The regression triggers when several levels of task groups are involved (read: SystemD) and cpu_possible_mask != cpu_present_mask. The root cause is that group entity's load (tg_child->se[i]->avg.load_avg) is initialized to scale_load_down(se->load.weight). During the creation of a child task group, its group entities on possible CPUs are attached to parent's cfs_rq (tg_parent) and their loads are added to the parent's load (tg_parent->load_avg) with update_tg_load_avg(). But only the load on online CPUs will then be updated to reflect real load, whereas load on other CPUs will stay at the initial value. The result is a tg_parent->load_avg that is higher than the real load, the weight of group entities (tg_parent->se[i]->load.weight) on online CPUs is smaller than it should be, and the task group gets less running time than it could expect. ( This situation can be detected with /proc/sched_debug. The ".tg_load_avg" of the task group will be much higher than the sum of ".tg_load_avg_contrib" of online cfs_rqs of the task group. ) The load of group entities doesn't have to be initialized to anything other than 0 because their load will increase when an entity is attached. Change-Id: Ie55021ff98ba49016adfddb2444e9c9709939226 Reported-by: Joseph Salisbury Tested-by: Dietmar Eggemann Signed-off-by: Vincent Guittot Acked-by: Peter Zijlstra Cc: # 4.8.x Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: joonwoop@codeaurora.org Fixes: 3d30544f0212 ("sched/fair: Apply more PELT fixes") Link: http://lkml.kernel.org/r/1476881123-10159-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit b5a9b340789b2b24c6896bcf7a065c31a4db671c) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df6fc28f80c7..f3fc9f4f353e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -739,7 +739,14 @@ void init_entity_runnable_average(struct sched_entity *se) * will definitely be update (after enqueue).
*/ sa->period_contrib = 1023; - sa->load_avg = scale_load_down(se->load.weight); + /* + * Tasks are initialized with full load to be seen as heavy tasks until + * they get a chance to stabilize to their real load level. + * Group entities are initialized with zero load to reflect the fact that + * nothing has been attached to the task group yet. + */ + if (entity_is_task(se)) + sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; /* * In previous Android versions, we used to have: From baaa21b59be28c62746fd4f1465730bd42d1d5fc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Jun 2016 16:11:02 +0200 Subject: [PATCH 0861/3220] UPSTREAM: sched/fair: Fix calc_cfs_shares() fixed point arithmetics width confusion Commit: fde7d22e01aa ("sched/fair: Fix overly small weight for interactive group entities") did something non-obvious, and also did it in a way that was buggy yet latent. The problem was exposed for real by a later commit in the v4.7 merge window: 2159197d6677 ("sched/core: Enable increased load resolution on 64-bit kernels") ... after which tg->load_avg and cfs_rq->load.weight had different units (10 bit fixed point and 20 bit fixed point resp.). Add a comment to explain the use of cfs_rq->load.weight over the 'natural' cfs_rq->avg.load_avg and add scale_load_down() to correct for the difference in unit. Since this is (now, as per a previous commit) the only user of calc_tg_weight(), collapse it. The effects of this bug should be randomly inconsistent SMP-balancing of cgroups workloads. Change-Id: If1e565662ea163485edd94a12aef644d0e0dfe7a Reported-by: Jirka Hladky Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 2159197d6677 ("sched/core: Enable increased load resolution on 64-bit kernels") Fixes: fde7d22e01aa ("sched/fair: Fix overly small weight for interactive group entities") Signed-off-by: Ingo Molnar (cherry picked from commit ea1dc6fc6242f991656e35e2ed3d90ec1cd13418) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f3fc9f4f353e..911cb335e873 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2530,28 +2530,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_FAIR_GROUP_SCHED # ifdef CONFIG_SMP -static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) -{ - long tg_weight; - - /* - * Use this CPU's real-time load instead of the last load contribution - * as the updating of the contribution is delayed, and we will use the - * the real-time load to calc the share. See update_tg_load_avg(). - */ - tg_weight = atomic_long_read(&tg->load_avg); - tg_weight -= cfs_rq->tg_load_avg_contrib; - tg_weight += cfs_rq->load.weight; - - return tg_weight; -} - static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { long tg_weight, load, shares; - tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq->load.weight; + /* + * This really should be: cfs_rq->avg.load_avg, but instead we use + * cfs_rq->load.weight, which is its upper bound. This helps ramp up + * the shares for small weight interactive tasks.
+ */ + load = scale_load_down(cfs_rq->load.weight); + + tg_weight = atomic_long_read(&tg->load_avg); + + /* Ensure tg_weight >= load */ + tg_weight -= cfs_rq->tg_load_avg_contrib; + tg_weight += load; shares = (tg->shares * load); if (tg_weight) @@ -2570,6 +2564,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) return tg->shares; } # endif /* CONFIG_SMP */ + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { From e62a1ca36b90450f37d570fc98041a164a5778ed Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 21 Dec 2016 16:50:26 +0100 Subject: [PATCH 0862/3220] UPSTREAM: sched/core: Fix group_entity's share update The update of the share of a cfs_rq is done when its load_avg is updated but before the group_entity's load_avg has been updated for the past time slot. This generates wrong load_avg accounting which can be significant when small tasks are involved in the scheduling. Let's take the example of a task a that is dequeued from its task group A: root (cfs_rq) \ (se) A (cfs_rq) \ (se) a Task "a" was the only task in task group A, which becomes idle when a is dequeued. We have the sequence: - dequeue_entity a->se - update_load_avg(a->se) - dequeue_entity_load_avg(A->cfs_rq, a->se) - update_cfs_shares(A->cfs_rq) A->cfs_rq->load.weight == 0 A->se->load.weight is updated with the new share (0 in this case) - dequeue_entity A->se - update_load_avg(A->se) but its weight is now null so the last time slot (up to a tick) will be accounted with a weight of 0 instead of its real weight during the time slot. The last time slot will be accounted as an idle one whereas it was a running one. If the running time of task a is short enough that no tick happens when it runs, all running time of group entity A->se will be accounted as idle time. Instead, we should update the share of a cfs_rq (in fact the weight of its group entity) only after having updated the load_avg of the group_entity. update_cfs_shares() now takes the sched_entity as a parameter instead of the cfs_rq, and the weight of the group_entity is updated only once its load_avg has been synced with current time.
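The corrected dequeue ordering then becomes, as a sketch matching the diff below:

    update_curr(cfs_rq);
    update_load_avg(se, UPDATE_TG);      /* sync to now at the old weight */
    dequeue_entity_load_avg(cfs_rq, se);
    ...
    update_min_vruntime(cfs_rq);
    update_cfs_shares(se);               /* only now recompute the weight */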
Change-Id: Id6ce3be1767b44b444ce2a77ed1ba063e57c0664 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: pjt@google.com Link: http://lkml.kernel.org/r/1482335426-7664-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit 89ee048f3cc796db6f26906c6bef4edf0bee70fd) [minor cherry pick stuff] Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 911cb335e873..d2479dc60e99 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2583,16 +2583,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct cfs_rq *cfs_rq) +static void update_cfs_shares(struct sched_entity *se) { + struct cfs_rq *cfs_rq = group_cfs_rq(se); struct task_group *tg; - struct sched_entity *se; long shares; - tg = cfs_rq->tg; - se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se || throttled_hierarchy(cfs_rq)) + if (!cfs_rq) return; + + if (throttled_hierarchy(cfs_rq)) + return; + + tg = cfs_rq->tg; + #ifndef CONFIG_SMP if (likely(se->load.weight == tg->shares)) return; @@ -2601,8 +2605,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } + #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +static inline void update_cfs_shares(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -3499,8 +3504,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); + update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -3573,6 +3578,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Subtract its load from the cfs_rq->runnable_avg. + * - Subtract its previous weight from cfs_rq->load.weight. + * - For a group entity, update its weight to reflect the new share + * of its group cfs_rq. + */ update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); @@ -3609,7 +3623,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) return_cfs_rq_runtime(cfs_rq); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } /* @@ -3781,7 +3795,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * Ensure that runnable average is periodically updated.
*/ update_load_avg(curr, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -4710,7 +4724,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(se, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } if (!se) @@ -4812,7 +4826,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(se, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } if (!se) @@ -9920,8 +9934,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) /* Possible calls to update_curr() need rq clock */ update_rq_clock(rq); - for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se)); + for_each_sched_entity(se) { + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } From 55af3848151fb76013d427ea8dfff9f829f98ef6 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 30 Jul 2015 16:53:30 +0100 Subject: [PATCH 0863/3220] sched: EAS & 'single cpu per cluster'/cpu hotplug interoperability For Energy-Aware Scheduling (EAS) to work properly, even in the case that there is only one cpu per cluster or that cpus are hot-plugged out, the Energy Model (EM) data on all energy-aware sched domains (sd) has to be present for all online cpus. Mainline sd hierarchy setup code will remove sd's which are not useful for task scheduling, e.g. in the following situations: 1. Only 1 cpu is/remains in one cluster of a multi cluster system. This remaining cpu only has DIE and no MC sd. 2. A complete cluster in a two cluster system is hot-plugged out. The cpus of the remaining cluster only have MC and no DIE sd. To make sure that all online cpus keep all their energy-aware sd's, the sd degenerate functionality has been changed to not free a sd if its first sched group (sg) contains EM data, in the cases where it would otherwise be freed: 1. There is only 1 cpu left in the sd. 2. Certain sd flags are set which require at least 2 sg's, but only 1 is present. Instead of freeing such a sd it now clears only its SD_LOAD_BALANCE flag. This will make sure that the EAS functionality will always see all energy-aware sd's for all online cpus. It will introduce a tiny performance degradation for operations on affected cpus since the hot-path macro for_each_domain() has to deal with sd's not contributing to task scheduling at all now. In most cases the existing code makes sure that task scheduling is not invoked on a sd with !SD_LOAD_BALANCE. However, a small change is necessary in update_sd_lb_stats() to make sure that sd->parent is only initialized to !NULL in case the parent sd contains more than 1 sg. The handling of newidle decay values before the SD_LOAD_BALANCE check in rebalance_domains() stays unchanged.
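The core of the change in sd_degenerate() looks as follows, as a sketch matching the diff below (the remaining flag checks are elided):

    static int sd_degenerate(struct sched_domain *sd)
    {
            if (cpumask_weight(sched_domain_span(sd)) == 1) {
                    if (sd->groups->sge)            /* first sg carries EM data */
                            sd->flags &= ~SD_LOAD_BALANCE;
                    else
                            return 1;
            }
            ...
    }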
Test (w/ CONFIG_SCHED_DEBUG): JUNO r0 default system: $ cat /proc/cpuinfo | grep "^CPU part" CPU part : 0xd03 CPU part : 0xd07 CPU part : 0xd07 CPU part : 0xd03 CPU part : 0xd03 CPU part : 0xd03 SD names and flags: $ cat /proc/sys/kernel/sched_domain/cpu*/domain*/name MC DIE MC DIE MC DIE MC DIE MC DIE MC DIE $ printf "%x\n" `cat /proc/sys/kernel/sched_domain/cpu*/domain*/flags` 832f 102f 832f 102f 832f 102f 832f 102f 832f 102f 832f 102f Test 1: Hotplug-out one A57 (CPU part 0xd07) cpu: $ echo 0 > /sys/devices/system/cpu/cpu1/online $ cat /proc/cpuinfo | grep "^CPU part" CPU part : 0xd03 CPU part : 0xd07 CPU part : 0xd03 CPU part : 0xd03 CPU part : 0xd03 SD names and flags for remaining A57 (cpu2) cpu: $ cat /proc/sys/kernel/sched_domain/cpu2/domain*/name MC DIE $ printf "%x\n" `cat /proc/sys/kernel/sched_domain/cpu2/domain*/flags` 832e <-- MC SD with !SD_LOAD_BALANCE 102f Test 2: Hotplug-out the entire A57 cluster: $ echo 0 > /sys/devices/system/cpu/cpu1/online $ echo 0 > /sys/devices/system/cpu/cpu2/online $ cat /proc/cpuinfo | grep "^CPU part" CPU part : 0xd03 CPU part : 0xd03 CPU part : 0xd03 CPU part : 0xd03 SD names and flags for the remaining A53 (CPU part 0xd03) cluster: $ cat /proc/sys/kernel/sched_domain/cpu*/domain*/name MC DIE MC DIE MC DIE MC DIE $ printf "%x\n" `cat /proc/sys/kernel/sched_domain/cpu*/domain*/flags` 832f 102e <-- DIE SD with !SD_LOAD_BALANCE 832f 102e 832f 102e 832f 102e Change-Id: If24aa2b2628f334abbf0207d39e2a86168d9d673 Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 17 ++++++++++------- kernel/sched/fair.c | 7 +++++-- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index defc88cda733..da44d13f2439 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5934,9 +5934,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); return -1; } @@ -6029,8 +6026,12 @@ static inline bool sched_debug(void) static int sd_degenerate(struct sched_domain *sd) { - if (cpumask_weight(sched_domain_span(sd)) == 1) - return 1; + if (cpumask_weight(sched_domain_span(sd)) == 1) { + if (sd->groups->sge) + sd->flags &= ~SD_LOAD_BALANCE; + else + return 1; + } /* Following flags need at least 2 groups */ if (sd->flags & (SD_LOAD_BALANCE | @@ -6076,6 +6077,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_PREFER_SIBLING | SD_SHARE_POWERDOMAIN | SD_SHARE_CAP_STATES); + if (parent->groups->sge) { + parent->flags &= ~SD_LOAD_BALANCE; + return 0; + } if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -7353,8 +7358,6 @@ static int build_sched_domains(const struct cpumask *cpu_map, *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) sd->flags |= SD_OVERLAP; - if (cpumask_equal(cpu_map, sched_domain_span(sd))) - break; } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d2479dc60e99..c84e15e3cd56 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7971,6 +7971,9 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) } #endif /* CONFIG_NUMA_BALANCING */ +#define lb_sd_parent(sd) \ + (sd->parent && sd->parent->groups != sd->parent->groups->next) + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. 
@@ -8053,7 +8056,7 @@ next_group: env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; - if (!env->sd->parent) { + if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; @@ -8561,7 +8564,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *continue_balancing) { int ld_moved, cur_ld_moved, active_balance = 0; - struct sched_domain *sd_parent = sd->parent; + struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL; struct sched_group *group; struct rq *busiest; unsigned long flags; From aa8882923a3f89aa880b6eff6fccc70e77b4a7c2 Mon Sep 17 00:00:00 2001 From: Andres Oportus Date: Thu, 23 Feb 2017 11:58:22 -0800 Subject: [PATCH 0864/3220] sched/core: Fix PELT jump to max OPP upon util increase Change-Id: Ic80b588ec466ef707f658dcea039fd0d6b384b63 Signed-off-by: Andres Oportus --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da44d13f2439..e353de860bfd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2969,9 +2969,10 @@ unsigned long sum_capacity_reqs(unsigned long cfs_cap, return total += scr->dl; } +unsigned long boosted_cpu_util(int cpu); static void sched_freq_tick_pelt(int cpu) { - unsigned long cpu_utilization = capacity_max; + unsigned long cpu_utilization = boosted_cpu_util(cpu); unsigned long capacity_curr = capacity_curr_of(cpu); struct sched_capacity_reqs *scr; From 4b85765a3dd9e1241e2cfbb8bd600f88411cfa0a Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 22 Mar 2017 18:23:13 +0000 Subject: [PATCH 0865/3220] sched/fair: Add eas (& cas) specific rq, sd and task stats The statistics counters are placed in the eas (& cas) wakeup path. Each of them has one representation for the runqueue (rq), the sched_domain (sd) and the task. A task counter is always incremented. A rq counter is always incremented for the rq the scheduler is currently running on. A sd counter is only incremented if a relation to a sd exists. The counters are exposed: (1) In /proc/schedstat for rq's and sd's: $ cat /proc/schedstat ... cpu0 71422 0 2321254 ... eas 44144 0 0 19446 0 24698 568435 51621 156932 133 222011 17459 120279 516814 83 0 156962 359235 176439 139981 <- runqueue for cpu0 ... domain0 3 42430 42331 ... eas 0 0 0 14200 0 0 0 0 0 0 0 0 0 0 0 0 0 0 66355 0 <- MC sched domain for cpu0 ... The per-cpu eas vector has the following elements: sis_attempts sis_idle sis_cache_affine sis_suff_cap sis_idle_cpu sis_count || secb_attempts secb_sync secb_idle_bt secb_insuff_cap secb_no_nrg_sav secb_nrg_sav secb_count || fbt_attempts fbt_no_cpu fbt_no_sd fbt_pref_idle fbt_count || cas_attempts cas_count The following relations exist between these counters (from cpu0 eas vector above): sis_attempts = sis_idle + sis_cache_affine + sis_suff_cap + sis_idle_cpu + sis_count 44144 = 0 + 0 + 19446 + 0 + 24698 secb_attempts = secb_sync + secb_idle_bt + secb_insuff_cap + secb_no_nrg_sav + secb_nrg_sav + secb_count 568435 = 51621 + 156932 + 133 + 222011 + 17459 + 120279 fbt_attempts = fbt_no_cpu + fbt_no_sd + fbt_pref_idle + fbt_count + (return -1) 516814 = 83 + 0 + 156962 + 359235 + (534) cas_attempts = cas_count + (return -1 or smp_processor_id()) 176439 = 139981 + (36458) (2) In /proc/$PROCESS_PID/task/$TASK_PID/sched for a task. example: main thread of system_server $ cat /proc/1083/task/1083/sched ...
se.statistics.nr_wakeups_sis_attempts : 945 se.statistics.nr_wakeups_sis_idle : 0 se.statistics.nr_wakeups_sis_cache_affine : 0 se.statistics.nr_wakeups_sis_suff_cap : 219 se.statistics.nr_wakeups_sis_idle_cpu : 0 se.statistics.nr_wakeups_sis_count : 726 se.statistics.nr_wakeups_secb_attempts : 10376 se.statistics.nr_wakeups_secb_sync : 1462 se.statistics.nr_wakeups_secb_idle_bt : 6984 se.statistics.nr_wakeups_secb_insuff_cap : 3 se.statistics.nr_wakeups_secb_no_nrg_sav : 927 se.statistics.nr_wakeups_secb_nrg_sav : 206 se.statistics.nr_wakeups_secb_count : 794 se.statistics.nr_wakeups_fbt_attempts : 8914 se.statistics.nr_wakeups_fbt_no_cpu : 0 se.statistics.nr_wakeups_fbt_no_sd : 0 se.statistics.nr_wakeups_fbt_pref_idle : 6987 se.statistics.nr_wakeups_fbt_count : 1554 se.statistics.nr_wakeups_cas_attempts : 3107 se.statistics.nr_wakeups_cas_count : 1195 ... The same relation between the counters as in the per-cpu case apply. Change-Id: Ie7d01267c78a3f41f60a3ef52917d5a5d463f195 Signed-off-by: Dietmar Eggemann Signed-off-by: Chris Redpath --- include/linux/sched.h | 62 +++++++++++++++++ kernel/sched/debug.c | 26 ++++++++ kernel/sched/fair.c | 152 ++++++++++++++++++++++++++++++++---------- kernel/sched/sched.h | 2 + kernel/sched/stats.c | 23 +++++++ 5 files changed, 228 insertions(+), 37 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 436f36f768c6..ad2c304b29b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1065,6 +1065,37 @@ unsigned long capacity_curr_of(int cpu); struct sched_group; +struct eas_stats { + /* select_idle_sibling() stats */ + u64 sis_attempts; + u64 sis_idle; + u64 sis_cache_affine; + u64 sis_suff_cap; + u64 sis_idle_cpu; + u64 sis_count; + + /* select_energy_cpu_brute() stats */ + u64 secb_attempts; + u64 secb_sync; + u64 secb_idle_bt; + u64 secb_insuff_cap; + u64 secb_no_nrg_sav; + u64 secb_nrg_sav; + u64 secb_count; + + /* find_best_target() stats */ + u64 fbt_attempts; + u64 fbt_no_cpu; + u64 fbt_no_sd; + u64 fbt_pref_idle; + u64 fbt_count; + + /* cas */ + /* select_task_rq_fair() stats */ + u64 cas_attempts; + u64 cas_count; +}; + struct sched_domain { /* These fields must be setup */ struct sched_domain *parent; /* top domain must be null terminated */ @@ -1125,6 +1156,8 @@ struct sched_domain { unsigned int ttwu_wake_remote; unsigned int ttwu_move_affine; unsigned int ttwu_move_balance; + + struct eas_stats eas_stats; #endif #ifdef CONFIG_SCHED_DEBUG char *name; @@ -1283,6 +1316,35 @@ struct sched_statistics { u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; + + /* select_idle_sibling() */ + u64 nr_wakeups_sis_attempts; + u64 nr_wakeups_sis_idle; + u64 nr_wakeups_sis_cache_affine; + u64 nr_wakeups_sis_suff_cap; + u64 nr_wakeups_sis_idle_cpu; + u64 nr_wakeups_sis_count; + + /* energy_aware_wake_cpu() */ + u64 nr_wakeups_secb_attempts; + u64 nr_wakeups_secb_sync; + u64 nr_wakeups_secb_idle_bt; + u64 nr_wakeups_secb_insuff_cap; + u64 nr_wakeups_secb_no_nrg_sav; + u64 nr_wakeups_secb_nrg_sav; + u64 nr_wakeups_secb_count; + + /* find_best_target() */ + u64 nr_wakeups_fbt_attempts; + u64 nr_wakeups_fbt_no_cpu; + u64 nr_wakeups_fbt_no_sd; + u64 nr_wakeups_fbt_pref_idle; + u64 nr_wakeups_fbt_count; + + /* cas */ + /* select_task_rq_fair() */ + u64 nr_wakeups_cas_attempts; + u64 nr_wakeups_cas_count; }; #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 641511771ae6..7f7116622631 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -597,6 +597,32 @@ void 
proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.statistics.nr_wakeups_affine_attempts); P(se.statistics.nr_wakeups_passive); P(se.statistics.nr_wakeups_idle); + /* eas */ + /* select_idle_sibling() */ + P(se.statistics.nr_wakeups_sis_attempts); + P(se.statistics.nr_wakeups_sis_idle); + P(se.statistics.nr_wakeups_sis_cache_affine); + P(se.statistics.nr_wakeups_sis_suff_cap); + P(se.statistics.nr_wakeups_sis_idle_cpu); + P(se.statistics.nr_wakeups_sis_count); + /* select_energy_cpu_brute() */ + P(se.statistics.nr_wakeups_secb_attempts); + P(se.statistics.nr_wakeups_secb_sync); + P(se.statistics.nr_wakeups_secb_idle_bt); + P(se.statistics.nr_wakeups_secb_insuff_cap); + P(se.statistics.nr_wakeups_secb_no_nrg_sav); + P(se.statistics.nr_wakeups_secb_nrg_sav); + P(se.statistics.nr_wakeups_secb_count); + /* find_best_target() */ + P(se.statistics.nr_wakeups_fbt_attempts); + P(se.statistics.nr_wakeups_fbt_no_cpu); + P(se.statistics.nr_wakeups_fbt_no_sd); + P(se.statistics.nr_wakeups_fbt_pref_idle); + P(se.statistics.nr_wakeups_fbt_count); + /* cas */ + /* select_task_rq_fair() */ + P(se.statistics.nr_wakeups_cas_attempts); + P(se.statistics.nr_wakeups_cas_count); { u64 avg_atom, avg_per_cpu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c84e15e3cd56..dfd150b02e63 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6059,15 +6059,24 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) int best_idle_cstate = INT_MAX; unsigned long best_idle_capacity = ULONG_MAX; + schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts); + schedstat_inc(this_rq(), eas_stats.sis_attempts); + if (!sysctl_sched_cstate_aware) { - if (idle_cpu(target)) + if (idle_cpu(target)) { + schedstat_inc(p, se.statistics.nr_wakeups_sis_idle); + schedstat_inc(this_rq(), eas_stats.sis_idle); return target; + } /* * If the prevous cpu is cache affine and idle, don't be stupid. 
*/ - if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) + if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) { + schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine); + schedstat_inc(this_rq(), eas_stats.sis_cache_affine); return prev; + } } /* @@ -6091,8 +6100,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (new_usage > capacity_orig || !idle_cpu(i)) goto next; - if (i == target && new_usage <= capacity_curr_of(target)) + if (i == target && new_usage <= capacity_curr_of(target)) { + schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap); + schedstat_inc(this_rq(), eas_stats.sis_suff_cap); + schedstat_inc(sd, eas_stats.sis_suff_cap); return target; + } if (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity) { @@ -6109,6 +6122,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) target = cpumask_first_and(sched_group_cpus(sg), tsk_cpus_allowed(p)); + schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu); + schedstat_inc(this_rq(), eas_stats.sis_idle_cpu); + schedstat_inc(sd, eas_stats.sis_idle_cpu); goto done; } next: @@ -6120,6 +6136,9 @@ next: target = best_idle_cpu; done: + schedstat_inc(p, se.statistics.nr_wakeups_sis_count); + schedstat_inc(this_rq(), eas_stats.sis_count); + return target; } @@ -6146,13 +6165,22 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre struct sched_group *sg; int cpu = start_cpu(boosted); - if (cpu < 0) + schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts); + schedstat_inc(this_rq(), eas_stats.fbt_attempts); + + if (cpu < 0) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu); + schedstat_inc(this_rq(), eas_stats.fbt_no_cpu); return target_cpu; + } sd = rcu_dereference(per_cpu(sd_ea, cpu)); - if (!sd) + if (!sd) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd); + schedstat_inc(this_rq(), eas_stats.fbt_no_sd); return target_cpu; + } sg = sd->groups; @@ -6191,8 +6219,11 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre * Unconditionally favoring tasks that prefer idle cpus to * improve latency. */ - if (idle_cpu(i) && prefer_idle) + if (idle_cpu(i) && prefer_idle) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle); + schedstat_inc(this_rq(), eas_stats.fbt_pref_idle); return i; + } cur_capacity = capacity_curr_of(i); @@ -6228,6 +6259,11 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre if (target_cpu < 0) target_cpu = best_idle_cpu >= 0 ? 
best_idle_cpu : backup_cpu; + if (target_cpu >= 0) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_count); + schedstat_inc(this_rq(), eas_stats.fbt_count); + } + return target_cpu; } @@ -6279,11 +6315,17 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync int target_cpu = prev_cpu, tmp_target; bool boosted, prefer_idle; + schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts); + schedstat_inc(this_rq(), eas_stats.secb_attempts); + if (sysctl_sched_sync_hint_enable && sync) { int cpu = smp_processor_id(); - if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_sync); + schedstat_inc(this_rq(), eas_stats.secb_sync); return cpu; + } } rcu_read_lock(); @@ -6303,8 +6345,11 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync goto unlock; if (tmp_target >= 0) { target_cpu = tmp_target; - if ((boosted || prefer_idle) && idle_cpu(target_cpu)) + if ((boosted || prefer_idle) && idle_cpu(target_cpu)) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt); + schedstat_inc(this_rq(), eas_stats.secb_idle_bt); goto unlock; + } } if (target_cpu != prev_cpu) { @@ -6316,15 +6361,30 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync }; /* Not enough spare capacity on previous cpu */ - if (cpu_overutilized(prev_cpu)) + if (cpu_overutilized(prev_cpu)) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap); + schedstat_inc(this_rq(), eas_stats.secb_insuff_cap); goto unlock; + } - if (energy_diff(&eenv) >= 0) + if (energy_diff(&eenv) >= 0) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); target_cpu = prev_cpu; + goto unlock; + } + + schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_nrg_sav); + goto unlock; } + schedstat_inc(p, se.statistics.nr_wakeups_secb_count); + schedstat_inc(this_rq(), eas_stats.secb_count); + unlock: rcu_read_unlock(); + return target_cpu; } @@ -6387,39 +6447,57 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? 
*/ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - } else while (sd) { - struct sched_group *group; - int weight; + } else { + int wu = sd_flag & SD_BALANCE_WAKE; + int cas_cpu = -1; - if (!(sd->flags & sd_flag)) { - sd = sd->child; - continue; + if (wu) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts); + schedstat_inc(this_rq(), eas_stats.cas_attempts); } - group = find_idlest_group(sd, p, cpu, sd_flag); - if (!group) { - sd = sd->child; - continue; + while (sd) { + struct sched_group *group; + int weight; + + if (wu) + schedstat_inc(sd, eas_stats.cas_attempts); + + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, p, cpu, sd_flag); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_cpu(group, p, cpu); + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = cas_cpu = new_cpu; + weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ } - new_cpu = find_idlest_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; + if (wu && (cas_cpu >= 0)) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_count); + schedstat_inc(this_rq(), eas_stats.cas_count); } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = sd->span_weight; - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= tmp->span_weight) - break; - if (tmp->flags & sd_flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ } rcu_read_unlock(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d0971e5c9269..6880fbc39760 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -734,6 +734,8 @@ struct rq { /* try_to_wake_up() stats */ unsigned int ttwu_count; unsigned int ttwu_local; + + struct eas_stats eas_stats; #endif #ifdef CONFIG_SMP diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 87e2c9f0c33e..b63879918cd6 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -12,6 +12,26 @@ */ #define SCHEDSTAT_VERSION 15 +static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats) +{ + /* eas-specific runqueue stats */ + seq_printf(seq, "eas %llu %llu %llu %llu %llu %llu ", + stats->sis_attempts, stats->sis_idle, stats->sis_cache_affine, + stats->sis_suff_cap, stats->sis_idle_cpu, stats->sis_count); + + seq_printf(seq, "%llu %llu %llu %llu %llu %llu %llu ", + stats->secb_attempts, stats->secb_sync, stats->secb_idle_bt, + stats->secb_insuff_cap, stats->secb_no_nrg_sav, + stats->secb_nrg_sav, stats->secb_count); + + seq_printf(seq, "%llu %llu %llu %llu %llu ", + stats->fbt_attempts, stats->fbt_no_cpu, stats->fbt_no_sd, + stats->fbt_pref_idle, stats->fbt_count); + + seq_printf(seq, "%llu %llu\n", + stats->cas_attempts, stats->cas_count); +} + static int show_schedstat(struct seq_file *seq, void *v) { int cpu; @@ -39,6 +59,7 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "\n"); + show_easstat(seq, &rq->eas_stats); #ifdef CONFIG_SMP /* domain-specific stats */ rcu_read_lock(); @@ -66,6 +87,8 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, sd->ttwu_wake_remote, 
sd->ttwu_move_affine, sd->ttwu_move_balance); + + show_easstat(seq, &sd->eas_stats); } rcu_read_unlock(); #endif From 8ac52cbaf41b01baa48fbbe65879734168037f15 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Wed, 8 Feb 2017 14:25:35 +0000 Subject: [PATCH 0866/3220] trace:sched: Make util_avg in load_avg trace reflect PELT/WALT as used With the ability to choose between WALT and PELT for utilisation tracking we can have the situation where we're using WALT to make all the decisions and reporting PELT figures in the sched_load_avg_(cpu|task) trace points. This is not too much of an issue, but when analysing trace it is nice to see numbers representing what the scheduler is using rather than needing to add in additional sched_walt_* traces to figure it out. Add reporting for both types, and make the util_avg member reflect what will be seen from cpu or task_util functions in the scheduler. Change-Id: I2abbd2c5fa70822096d0f3372b4c12b1c6af1590 Signed-off-by: Chris Redpath --- include/trace/events/sched.h | 44 +++++++++++++++++++++++++++++++----- kernel/sched/fair.c | 9 ++++++-- 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 1d758d18a1b7..433d3919ba00 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -638,14 +638,21 @@ TRACE_EVENT(sched_contrib_scale_f, #ifdef CONFIG_SMP +#ifdef CONFIG_SCHED_WALT +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int sysctl_sched_use_walt_task_util; +extern unsigned int walt_ravg_window; +extern unsigned int walt_disabled; +#endif + /* * Tracepoint for accounting sched averages for tasks. */ TRACE_EVENT(sched_load_avg_task, - TP_PROTO(struct task_struct *tsk, struct sched_avg *avg), + TP_PROTO(struct task_struct *tsk, struct sched_avg *avg, void *_ravg), - TP_ARGS(tsk, avg), + TP_ARGS(tsk, avg, _ravg), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) @@ -653,6 +660,8 @@ TRACE_EVENT(sched_load_avg_task, __field( int, cpu ) __field( unsigned long, load_avg ) __field( unsigned long, util_avg ) + __field( unsigned long, util_avg_pelt ) + __field( unsigned long, util_avg_walt ) __field( u64, load_sum ) __field( u32, util_sum ) __field( u32, period_contrib ) @@ -667,15 +676,25 @@ TRACE_EVENT(sched_load_avg_task, __entry->load_sum = avg->load_sum; __entry->util_sum = avg->util_sum; __entry->period_contrib = avg->period_contrib; + __entry->util_avg_pelt = avg->util_avg; + __entry->util_avg_walt = 0; +#ifdef CONFIG_SCHED_WALT + __entry->util_avg_walt = (((unsigned long)((struct ravg*)_ravg)->demand) << SCHED_LOAD_SHIFT); + do_div(__entry->util_avg_walt, walt_ravg_window); + if (!walt_disabled && sysctl_sched_use_walt_task_util) + __entry->util_avg = __entry->util_avg_walt; +#endif ), - - TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu util_avg=%lu load_sum=%llu" + TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu util_avg=%lu " + "util_avg_pelt=%lu util_avg_walt=%lu load_sum=%llu" " util_sum=%u period_contrib=%u", __entry->comm, __entry->pid, __entry->cpu, __entry->load_avg, __entry->util_avg, + __entry->util_avg_pelt, + __entry->util_avg_walt, (u64)__entry->load_sum, (u32)__entry->util_sum, (u32)__entry->period_contrib) @@ -694,16 +713,29 @@ TRACE_EVENT(sched_load_avg_cpu, __field( int, cpu ) __field( unsigned long, load_avg ) __field( unsigned long, util_avg ) + __field( unsigned long, util_avg_pelt ) + __field( unsigned long, util_avg_walt ) ), TP_fast_assign( __entry->cpu = cpu; __entry->load_avg = cfs_rq->avg.load_avg; 
__entry->util_avg = cfs_rq->avg.util_avg; + __entry->util_avg_pelt = cfs_rq->avg.util_avg; + __entry->util_avg_walt = 0; +#ifdef CONFIG_SCHED_WALT + __entry->util_avg_walt = + cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT; + do_div(__entry->util_avg_walt, walt_ravg_window); + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + __entry->util_avg = __entry->util_avg_walt; +#endif ), - TP_printk("cpu=%d load_avg=%lu util_avg=%lu", - __entry->cpu, __entry->load_avg, __entry->util_avg) + TP_printk("cpu=%d load_avg=%lu util_avg=%lu " + "util_avg_pelt=%lu util_avg_walt=%lu", + __entry->cpu, __entry->load_avg, __entry->util_avg, + __entry->util_avg_pelt, __entry->util_avg_walt) ); /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dfd150b02e63..f972df2115d7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3178,6 +3178,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags) u64 now = cfs_rq_clock_task(cfs_rq); int cpu = cpu_of(rq_of(cfs_rq)); int decayed; + void *ptr = NULL; /* * Track task load average for carrying it to new CPU after migrated, and @@ -3195,8 +3196,12 @@ static inline void update_load_avg(struct sched_entity *se, int flags) if (decayed && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); - if (entity_is_task(se)) - trace_sched_load_avg_task(task_of(se), &se->avg); + if (entity_is_task(se)) { +#ifdef CONFIG_SCHED_WALT + ptr = (void *)&(task_of(se)->ravg); +#endif + trace_sched_load_avg_task(task_of(se), &se->avg, ptr); + } } /** From 8865f07600fad56a7fc6b96dc339bbe9a16eb7eb Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 28 Feb 2017 17:27:28 +0000 Subject: [PATCH 0867/3220] sched/fair: remove task util from own cpu when placing waking task When we place a waking task with find_best_target, we calculate the existing and new utilisation of each candidate cpu. However, we do not remove any blocked load resulting from the waking task on the previous cpu which might cause unnecessary migrations. Switch to using cpu_util_wake which does this for us, which requires moving cpu_util_wake a few functions earlier. Also, we have multiple potential cpu utilization signals here, so update the necessary bits to allow WALT to work properly (including not subtracting task util for WALT). When WALT is in use, cpu utilization is the utilization in the previous completed window, whilst the task utilization ignores fully idle windows. There seems to be no way to have a decently accurate estimate of how much (if any) utilization from this task remains on the prev cpu. Instead, just return cpu_util when we're using WALT. Change-Id: I448203ab98ffb5c020dfb6b218581eef1f5601f7 Reported-by: Patrick Bellasi Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 48 +++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f972df2115d7..27a6f5d6c86c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6147,6 +6147,34 @@ done: return target; } +/* + * cpu_util_wake: Compute cpu utilization with any contributions from + * the waking task p removed. + */ +static int cpu_util_wake(int cpu, struct task_struct *p) +{ + unsigned long util, capacity; + +#ifdef CONFIG_SCHED_WALT + /* + * WALT does not decay idle tasks in the same manner + * as PELT, so it makes little sense to subtract task + * utilization from cpu utilization. Instead just use + * cpu_util for this case. 
+ */ + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + return cpu_util(cpu); +#endif + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !p->se.avg.last_update_time) + return cpu_util(cpu); + + capacity = capacity_orig_of(cpu); + util = max_t(long, cpu_util(cpu) - task_util(p), 0); + + return (util >= capacity) ? capacity : util; +} + static int start_cpu(bool boosted) { struct root_domain *rd = cpu_rq(smp_processor_id())->rd; @@ -6203,7 +6231,7 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. */ - new_util = cpu_util(i) + task_util(p); + new_util = cpu_util_wake(i, p) + task_util(p); /* * Ensure minimum capacity to grant the required boost. @@ -6272,24 +6300,6 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre return target_cpu; } -/* - * cpu_util_wake: Compute cpu utilization with any contributions from - * the waking task p removed. - */ -static int cpu_util_wake(int cpu, struct task_struct *p) -{ - unsigned long util, capacity; - - /* Task has no contribution or is new */ - if (cpu != task_cpu(p) || !p->se.avg.last_update_time) - return cpu_util(cpu); - - capacity = capacity_orig_of(cpu); - util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); - - return (util >= capacity) ? capacity : util; -} - /* * Disable WAKE_AFFINE in the case where task @p doesn't fit in the * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. From 83f462daa328f2f42c3c1f7f5277f71e3fa0f750 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Wed, 8 Mar 2017 13:37:32 +0000 Subject: [PATCH 0868/3220] sched/fair: ensure utilization signals are synchronized before use wake_cap performs task and cpu utilization synchronization which is what allows us to subtract current task util from prev_cpu util and have a sensible number to work with. It looks as though if wake_wide returns 0, we could potentially not execute wake_cap, which would result in unsynced signals we then use for energy calculations. This is not necessarily an issue we've seen in traces, but it looks as though it should be changed. 
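Concretely, the && chain short-circuits: when wake_wide() returns true, wake_cap() is never evaluated, and the task/cpu utilization sync it performs never happens. Hoisting the call, as the hunk below does, makes that side effect unconditional on the SD_BALANCE_WAKE path. A schematic before/after (same signatures as this tree):

	/* before: wake_cap() is skipped whenever wake_wide(p) is true */
	want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
		      cpumask_test_cpu(cpu, tsk_cpus_allowed(p));

	/* after: the sync inside wake_cap() always runs on this path */
	int _wake_cap = wake_cap(p, cpu, prev_cpu);
	want_affine = !wake_wide(p) && !_wake_cap &&
		      cpumask_test_cpu(cpu, tsk_cpus_allowed(p));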
Change-Id: Ic54a3cba2a10d946ea20113a04371dea04115e82 Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 27a6f5d6c86c..23e2b5f33ff6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6424,9 +6424,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) + if (sd_flag & SD_BALANCE_WAKE) { + /* + * do wake_cap unconditionally as it causes task and cpu + * utilization to be synced, and we need that for energy + * aware wakeups + */ + int _wake_cap = wake_cap(p, cpu, prev_cpu); + want_affine = !wake_wide(p) && !_wake_cap && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + } if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) return select_energy_cpu_brute(p, prev_cpu, sync); From fef0112a63989830cfd60f6ede53014600e751e0 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 3 Mar 2017 11:43:03 +0000 Subject: [PATCH 0869/3220] sched/fair: discount task contribution to find CPU with lowest utilization In some cases, the new_util of a task can be the same on several CPUs. This causes an issue because the target_util is only updated if the current new_util is strictly smaller than target_util. To fix that, the cpu_util_wake() return value is used alongside the new_util value. If two CPUs compute the same new_util value, we'll now also look at their cpu_util_wake() return value. In this case, the CPU that last ran the task will be chosen in priority. Change-Id: Ia1ea2c4b3ec39621372c2f748862317d5b497723 Signed-off-by: Valentin Schneider --- kernel/sched/fair.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 23e2b5f33ff6..fc4e2529fbd2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6221,7 +6221,8 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre int i; for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { - unsigned long cur_capacity, new_util; + unsigned long cur_capacity, new_util, wake_util; + unsigned long min_wake_util = ULONG_MAX; if (!cpu_online(i)) continue; @@ -6231,7 +6232,8 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. */ - new_util = cpu_util_wake(i, p) + task_util(p); + wake_util = cpu_util_wake(i, p); + new_util = wake_util + task_util(p); /* * Ensure minimum capacity to grant the required boost. @@ -6266,8 +6268,15 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre * Find a target cpu with the lowest/highest * utilization if prefer_idle/!prefer_idle. 
*/ - if ((prefer_idle && target_util > new_util) || - (!prefer_idle && target_util < new_util)) { + if (prefer_idle) { + /* Favor the CPU that last ran the task */ + if (new_util > target_util || + wake_util > min_wake_util) + continue; + min_wake_util = wake_util; + target_util = new_util; + target_cpu = i; + } else if (target_util < new_util) { target_util = new_util; target_cpu = i; } From fc969e3bfa7aaaee77d91c55807ee5a8c6a26b73 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 6 Feb 2017 16:28:53 +0000 Subject: [PATCH 0870/3220] sched/fair: Fix sched_group_energy() to support per-cpu capacity states sched_group_energy() was supposed to support per-cpu capacity states (DVFS), however, while fixing a hotplug issue this was broken as we bail out if there is no SD_SHARE_CAP_STATES flag set. This patch implements the hotplug race check differently and should therefore reinstate support for per-cpu capacity states. Change-Id: I5b865666c9ce833dcfa6514c574580d75aa0a195 Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fc4e2529fbd2..cbd6d12e91fe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5442,15 +5442,7 @@ static int sched_group_energy(struct energy_env *eenv) */ sd = rcu_dereference(per_cpu(sd_scs, cpu)); - if (!sd) - /* - * We most probably raced with hotplug; returning a - * wrong energy estimation is better than entering an - * infinite loop. - */ - return -EINVAL; - - if (sd->parent) + if (sd && sd->parent) sg_shared_cap = sd->parent->groups; for_each_domain(cpu, sd) { @@ -5505,6 +5497,14 @@ static int sched_group_energy(struct energy_env *eenv) } while (sg = sg->next, sg != sd->groups); } + + /* + * If we raced with hotplug and got an sd NULL-pointer; + * returning a wrong energy estimation is better than + * entering an infinite loop. + */ + if (cpumask_test_cpu(cpu, &visit_cpus)) + return -EINVAL; next_cpu: cpumask_clear_cpu(cpu, &visit_cpus); continue; From 4c031f0e6fbc0fcea864836356c77b3aa0b29947 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Fri, 24 Mar 2017 17:37:28 +0000 Subject: [PATCH 0871/3220] cpufreq/schedutil: use boosted_cpu_util for PELT to match WALT When using WALT we always used boosted cpu util for OPP selection. This is the primary purpose for boosted cpu util, but we hadn't changed the PELT utilization check to do the same thing. Fix that here. Change-Id: Id5ffb26eac23b25fe754255221f6d21b8cededfd Signed-off-by: Patrick Bellasi Signed-off-by: Chris Redpath --- kernel/sched/cpufreq_schedutil.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 191ba36a9eeb..75bfbb336722 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -19,9 +19,7 @@ #include "sched.h" #include "tune.h" -#ifdef CONFIG_SCHED_WALT unsigned long boosted_cpu_util(int cpu); -#endif /* Stub out fast switch routines present on mainline to reduce the backport * overhead. 
*/ @@ -188,6 +186,15 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, return cpufreq_driver_resolve_freq(policy, freq); } +static inline bool use_pelt(void) +{ +#ifdef CONFIG_SCHED_WALT + return (!sysctl_sched_use_walt_cpu_util || walt_disabled); +#else + return true; +#endif +} + static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time) { int cpu = smp_processor_id(); @@ -204,11 +211,10 @@ static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time) rt = div64_u64(rq->rt_avg, sched_avg_period() + delta); rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT; - *util = min(rq->cfs.avg.util_avg + rt, max_cap); -#ifdef CONFIG_SCHED_WALT - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) - *util = boosted_cpu_util(cpu); -#endif + *util = boosted_cpu_util(cpu); + if (likely(use_pelt())) + *util = min((*util + rt), max_cap); + *max = max_cap; } From 2e829cf17fd296cac853c2e7d2453da4af446948 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Fri, 24 Mar 2017 17:40:51 +0000 Subject: [PATCH 0872/3220] sched/tune: increase group count to 5 We use 5 groups everywhere else, this should default to the same. Change-Id: I05a20bdcf8046ea90a2e36979940cef11246e735 Signed-off-by: Chris Redpath Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 079b18802f17..d512b4fb218f 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -205,7 +205,7 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta, * implementation especially for the computation of the per-CPU boost * value */ -#define BOOSTGROUPS_COUNT 4 +#define BOOSTGROUPS_COUNT 5 /* Array of configured boostgroups */ static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { From f9b83b3e6e723f5568fd8329197523080747d3a8 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Fri, 24 Mar 2017 18:56:16 +0000 Subject: [PATCH 0873/3220] sched/tune: fix sched_energy_diff tracepoint sched_energy_diff tracepoint is in a place where it can never trace payoff or nrg.delta. If CONFIG_SCHED_TUNE is enabled, put it in a place where those values exist. If it is not enabled, trace from the current location Change-Id: Id5442f2b34ec76625491d27c0f4285433ca12699 Reported-by: Valentin Schneider Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cbd6d12e91fe..752d7ddd0f3a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5574,13 +5574,13 @@ static inline int __energy_diff(struct energy_env *eenv) eenv->nrg.after = energy_after; eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; eenv->payoff = 0; - +#ifndef CONFIG_SCHED_TUNE trace_sched_energy_diff(eenv->task, eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, eenv->cap.before, eenv->cap.after, eenv->cap.delta, eenv->nrg.delta, eenv->payoff); - +#endif /* * Dead-zone margin preventing too many migrations. */ @@ -5651,6 +5651,12 @@ energy_diff(struct energy_env *eenv) eenv->cap.delta, eenv->task); + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + eenv->nrg.delta, eenv->payoff); + /* * When SchedTune is enabled, the energy_diff() function will return * the computed energy payoff value. 
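Summarized, the resulting trace placement is (schematic; assumes energy_diff() is an alias for __energy_diff() when CONFIG_SCHED_TUNE is not set, as in this tree):

	/*
	 *  CONFIG_SCHED_TUNE   tracepoint fires in      payoff / nrg.delta
	 *  not set             __energy_diff()          0, as before
	 *  set                 energy_diff() wrapper    real computed values
	 */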
Since the energy_diff() return From 3757f957419c1ccb6fd837ade571375a2cfdc9dc Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 13 Oct 2016 17:34:47 +0100 Subject: [PATCH 0874/3220] sched/tune: report when SchedTune has not been initialized Change-Id: Iba4e5e3d220451f04272d555e6b8e0af83a7f09d Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan --- kernel/sched/tune.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index d512b4fb218f..9770cef183d0 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -939,6 +939,7 @@ schedtune_init(void) return 0; nodata: + pr_warning("schedtune: disabled!\n"); rcu_read_unlock(); return -EINVAL; } From 41d9288e3eace3988a4fc8ce9bffef425a265ed7 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Tue, 8 Nov 2016 14:53:44 -0800 Subject: [PATCH 0875/3220] sched/tune: Initialize raw_spin_lock in boosted_groups bug: 32668852 Change-Id: Ice96230d88939d5973b1b6310085d1b3df9c47d9 Signed-off-by: Srinath Sridharan --- kernel/sched/tune.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 9770cef183d0..5e81aaad5566 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -640,6 +640,7 @@ schedtune_boostgroup_init(struct schedtune *st) bg = &per_cpu(cpu_boost_groups, cpu); bg->group[st->idx].boost = 0; bg->group[st->idx].tasks = 0; + raw_spin_lock_init(&bg->lock); } return 0; From 7b8577d94ccfdad13cc5c8a48e9643666e8fa167 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 13 Oct 2016 17:31:24 +0100 Subject: [PATCH 0876/3220] sched/{fair,tune}: use reciprocal_value to compute boost margin Change-Id: I493b07360c46eee0b72c2a046dab9ec6cb3427ef Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan --- kernel/sched/fair.c | 23 ++++++----------------- kernel/sched/tune.c | 3 +++ 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 752d7ddd0f3a..cc00d92a3a2e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5805,6 +5805,8 @@ static bool cpu_overutilized(int cpu) #ifdef CONFIG_SCHED_TUNE +struct reciprocal_value schedtune_spc_rdiv; + static long schedtune_margin(unsigned long signal, long boost) { @@ -5815,29 +5817,16 @@ schedtune_margin(unsigned long signal, long boost) * * The Boost (B) value is used to compute a Margin (M) which is * proportional to the complement of the original Signal (S): - * M = B * (SCHED_LOAD_SCALE - S), if B is positive - * M = B * S, if B is negative + * M = B * (SCHED_CAPACITY_SCALE - S) * The obtained M could be used by the caller to "boost" S. 
*/ if (boost >= 0) { - margin = SCHED_LOAD_SCALE - signal; + margin = SCHED_CAPACITY_SCALE - signal; margin *= boost; } else margin = -signal * boost; - /* - * Fast integer division by constant: - * Constant : (C) = 100 - * Precision : 0.1% (P) = 0.1 - * Reference : C * 100 / P (R) = 100000 - * - * Thus: - * Shift bits : ceil(log(R,2)) (S) = 17 - * Mult const : round(2^S/C) (M) = 1311 - * - * - */ - margin *= 1311; - margin >>= 17; + + margin = reciprocal_divide(margin, schedtune_spc_rdiv); if (boost < 0) margin *= -1; diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 5e81aaad5566..493564816679 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -17,6 +17,7 @@ static bool schedtune_initialized = false; unsigned int sysctl_sched_cfs_boost __read_mostly; +extern struct reciprocal_value schedtune_spc_rdiv; extern struct target_nrg schedtune_target_nrg; /* Performance Boost region (B) threshold params */ @@ -937,6 +938,8 @@ schedtune_init(void) pr_info("schedtune: configured to support global boosting only\n"); #endif + schedtune_spc_rdiv = reciprocal_value(100); + return 0; nodata: From 9e3c04bef72b0586e87a2409e0c6d9b96d27da9c Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 13 Oct 2016 18:13:20 +0100 Subject: [PATCH 0877/3220] sched/fair: use SCHED_CAPACITY_SCALE for energy normalization Change-Id: I686d26975f4a7dd830ff8441ff986e35461a7d55 Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cc00d92a3a2e..ef4dc3281e22 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5600,7 +5600,7 @@ struct target_nrg schedtune_target_nrg; /* * System energy normalization - * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], + * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE], * corresponding to the specified energy variation. */ static inline int @@ -5620,7 +5620,7 @@ normalize_energy(int energy_diff) normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; /* Scale by energy magnitude */ - normalized_nrg <<= SCHED_LOAD_SHIFT; + normalized_nrg <<= SCHED_CAPACITY_SHIFT; /* Normalize on max energy for target platform */ normalized_nrg = reciprocal_divide( From c47d00b57b26a5dc9bca89375dd8a80ee1a7cc5b Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 27 Mar 2017 18:20:20 +0100 Subject: [PATCH 0878/3220] sched/tune: don't use schedtune before it is ready When EAS is enabled during boot, we have to be careful not to use schedtune from fair.c before it is ready or it will warn us and we'll get a traceback in the console. Change-Id: I1a5cf29b18af626545c636c51219f9ed497c19fa Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 9 ++++++++- kernel/sched/tune.c | 8 +++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ef4dc3281e22..226d1990eea9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5597,7 +5597,7 @@ static inline int __energy_diff(struct energy_env *eenv) #ifdef CONFIG_SCHED_TUNE struct target_nrg schedtune_target_nrg; - +extern bool schedtune_initialized; /* * System energy normalization * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE], @@ -5607,13 +5607,20 @@ static inline int normalize_energy(int energy_diff) { u32 normalized_nrg; + + /* during early setup, we don't know the extents */ + if (unlikely(!schedtune_initialized)) + return energy_diff < 0 ? 
-1 : 1 ; + #ifdef CONFIG_SCHED_DEBUG + { int max_delta; /* Check for boundaries */ max_delta = schedtune_target_nrg.max_power; max_delta -= schedtune_target_nrg.min_power; WARN_ON(abs(energy_diff) >= max_delta); + } #endif /* Do scaling using positive numbers to increase the range */ diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 493564816679..86d04caf1684 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -12,7 +12,7 @@ #include "tune.h" #ifdef CONFIG_CGROUP_SCHEDTUNE -static bool schedtune_initialized = false; +bool schedtune_initialized = false; #endif unsigned int sysctl_sched_cfs_boost __read_mostly; @@ -527,6 +527,9 @@ int schedtune_task_boost(struct task_struct *p) struct schedtune *st; int task_boost; + if (!unlikely(schedtune_initialized)) + return 0; + /* Get task boost value */ rcu_read_lock(); st = task_schedtune(p); @@ -541,6 +544,9 @@ int schedtune_prefer_idle(struct task_struct *p) struct schedtune *st; int prefer_idle; + if (!unlikely(schedtune_initialized)) + return 0; + /* Get prefer_idle value */ rcu_read_lock(); st = task_schedtune(p); From 8cbdbb9a6aa5ce9d53d2c7cf4aff2831e5b724f9 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Thu, 11 May 2017 16:15:15 -0700 Subject: [PATCH 0879/3220] FROMLIST: f2fs: sanity check checkpoint segno and blkoff Make sure segno and blkoff read from raw image are valid. (url https://sourceforge.net/p/linux-f2fs/mailman/message/35835945) Signed-off-by: Jin Qian Bug: 36588520 Change-Id: Iba66ab97d3d0870ea48b5ef192d9075f225a934a --- fs/f2fs/super.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 86e1cb899957..2ac3417d9412 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1078,6 +1078,8 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int main_segs, blocks_per_seg; + int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1089,6 +1091,22 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; + main_segs = le32_to_cpu(sbi->raw_super->segment_count_main); + blocks_per_seg = sbi->blocks_per_seg; + + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) { + return 1; + } + } + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) { + return 1; + } + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; From 57e54f412025c46954ae82d67e99f899d9882043 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Fri, 2 Jun 2017 20:22:07 +0000 Subject: [PATCH 0880/3220] BACKPORT: f2fs: sanity check size of nat and sit cache Make sure number of entries doesn't exceed max journal size.
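The two counts are read straight from the on-disk summary blocks and later serve as loop bounds over fixed-size journal arrays, so an oversized value walks past the block. Schematically (a sketch of the consumer pattern; see nats_in_cursum()/nat_in_journal() in f2fs.h of this tree):

	/* journal scans are bounded by the on-disk count, e.g.: */
	for (i = 0; i < nats_in_cursum(sum); i++) {
		/* entries[] holds at most NAT_JOURNAL_ENTRIES slots,
		 * so an unchecked n_nats reads past the summary block */
		raw_ne = nat_in_journal(sum, i);
	}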
Cc: stable@vger.kernel.org Signed-off-by: Jin Qian Signed-off-by: Jaegeuk Kim (url https://sourceforge.net/p/linux-f2fs/mailman/message/35872032) Bug: 36819470 Change-Id: I0db498cdc996ec204f56617bba5c4ab5ac6ade14 --- fs/f2fs/segment.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f77b3258454a..7965957dd0e6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1623,6 +1623,10 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { + struct f2fs_summary_block *s_sits = + CURSEG_I(sbi, CURSEG_COLD_DATA)->sum_blk; + struct f2fs_summary_block *s_nats = + CURSEG_I(sbi, CURSEG_HOT_DATA)->sum_blk; int type = CURSEG_HOT_DATA; int err; @@ -1649,6 +1653,11 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) return err; } + /* sanity check for summary blocks */ + if (nats_in_cursum(s_nats) > NAT_JOURNAL_ENTRIES || + sits_in_cursum(s_sits) > SIT_JOURNAL_ENTRIES) + return -EINVAL; + return 0; } From fce0ecf04a7e0b6f912151607758eb2e98888569 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Sat, 3 Jun 2017 15:03:03 +0100 Subject: [PATCH 0881/3220] schedstats/eas: guard properly to avoid breaking non-smp schedstats users Add appropriate #ifdef guards to ensure the smp-only easstats structs are not used when smp is not enabled. Arnd got a report from buildbot, analysed it, and pointed out exactly what the issue was. Reported-by: "Arnd Bergmann" Suggested-by: "Arnd Bergmann" Fixes: 4b85765a3dd9 ("sched/fair: Add eas (& cas) specific rq, sd and task stats") Signed-off-by: Chris Redpath Change-Id: I60554dea20137f6774db3f59b4afd40a06554cfc --- kernel/sched/sched.h | 3 ++- kernel/sched/stats.c | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6880fbc39760..ce364ddbb72c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -734,9 +734,10 @@ struct rq { /* try_to_wake_up() stats */ unsigned int ttwu_count; unsigned int ttwu_local; - +#ifdef CONFIG_SMP struct eas_stats eas_stats; #endif +#endif #ifdef CONFIG_SMP struct llist_head wake_list; diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index b63879918cd6..6d74a7c77c8c 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -12,6 +12,7 @@ */ #define SCHEDSTAT_VERSION 15 +#ifdef CONFIG_SMP static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats) { /* eas-specific runqueue stats */ @@ -31,6 +32,7 @@ static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats) seq_printf(seq, "%llu %llu\n", stats->cas_attempts, stats->cas_count); } +#endif static int show_schedstat(struct seq_file *seq, void *v) { @@ -59,8 +61,9 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "\n"); - show_easstat(seq, &rq->eas_stats); #ifdef CONFIG_SMP + show_easstat(seq, &rq->eas_stats); + /* domain-specific stats */ rcu_read_lock(); for_each_domain(cpu, sd) { From 5c31a5e9a365d1111568ac0484c0dda0e4bd09fe Mon Sep 17 00:00:00 2001 From: Roberto Pereira Date: Mon, 5 Jun 2017 15:26:49 -0700 Subject: [PATCH 0882/3220] android: base-cfg: disable CONFIG_NFS_FS and CONFIG_NFSD Signed-off-by: Roberto Pereira Bug:37753761 Change-Id: I1b96d7baa329dad0400c6e5c3fb12e81f1251a62 --- android/configs/android-base.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index b7b997fd58c9..b974d01b84af 100644 --- 
a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -3,6 +3,8 @@ # CONFIG_DEVMEM is not set # CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set +# CONFIG_NFSD is not set +# CONFIG_NFS_FS is not set # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set # CONFIG_USELIB is not set From 4fd2931f74468752711044528129e1155bde82fd Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 7 Jun 2017 12:44:50 -0700 Subject: [PATCH 0883/3220] ANDROID: sdcardfs: d_splice_alias can return error values We must check that d_splice_alias was successful before using its output. Signed-off-by: Daniel Rosenberg Bug: 62390017 Change-Id: Ifda0a052fb3f67e35c635a4e5e907876c5400978 --- fs/sdcardfs/lookup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 00ae21151f52..676e394e07be 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -199,7 +199,8 @@ static struct dentry *__sdcardfs_interpose(struct dentry *dentry, ret_dentry = d_splice_alias(inode, dentry); dentry = ret_dentry ?: dentry; - update_derived_permission_lock(dentry); + if (!IS_ERR(dentry)) + update_derived_permission_lock(dentry); out: return ret_dentry; } From 23874bf0d9eca04e7a537275f67d4d639318aa77 Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Thu, 25 May 2017 15:20:29 +0800 Subject: [PATCH 0884/3220] ANDROID: uid_sys_stats: check previous uid_entry before call find_or_register_uid Theads in a process are stored in list struct task_struct->thread_group, so it will be visited continiously in below loop: do_each_thread(temp, task) { ... } while_each_thread(temp, task); I add some log in the loop, we can see below information: [ 65.033561] uid 1000, uid_entry ffffffc0f2761600 [ 65.033567] uid 1000, uid_entry ffffffc0f2761600 [ 65.033574] uid 1000, uid_entry ffffffc0f2761600 [ 65.033581] uid 1000, uid_entry ffffffc0f2761600 [ 65.033588] uid 1000, uid_entry ffffffc0f2761600 [ 65.033595] uid 1000, uid_entry ffffffc0f2761600 [ 65.033602] uid 1000, uid_entry ffffffc0f2761600 [ 65.033609] uid 1000, uid_entry ffffffc0f2761600 [ 65.033615] uid 1000, uid_entry ffffffc0f2761600 [ 65.033622] uid 1000, uid_entry ffffffc0f2761600 [ 65.033629] uid 1000, uid_entry ffffffc0f2761600 [ 65.033637] uid 1000, uid_entry ffffffc0f2761600 [ 65.033644] uid 1000, uid_entry ffffffc0f2761600 [ 65.033651] uid 1000, uid_entry ffffffc0f2761600 [ 65.033658] uid 1000, uid_entry ffffffc0f2761600 [ 65.033665] uid 1000, uid_entry ffffffc0f2761600 [ 65.033672] uid 1000, uid_entry ffffffc0f2761600 [ 65.033680] uid 1000, uid_entry ffffffc0f2761600 [ 65.033687] uid 1000, uid_entry ffffffc0f2761600 [ 65.033694] uid 1000, uid_entry ffffffc0f2761600 [ 65.033701] uid 1000, uid_entry ffffffc0f2761600 [ 65.033708] uid 1000, uid_entry ffffffc0f2761600 [ 65.033715] uid 1000, uid_entry ffffffc0f2761600 [ 65.033722] uid 1000, uid_entry ffffffc0f2761600 [ 65.033729] uid 1000, uid_entry ffffffc0f2761600 [ 65.033736] uid 1000, uid_entry ffffffc0f2761600 [ 65.033743] uid 1000, uid_entry ffffffc0f2761600 [ 65.033750] uid 1000, uid_entry ffffffc0f2761600 [ 65.033757] uid 1000, uid_entry ffffffc0f2761600 [ 65.033763] uid 1000, uid_entry ffffffc0f2761600 [ 65.033770] uid 1000, uid_entry ffffffc0f2761600 [ 65.033777] uid 1000, uid_entry ffffffc0f2761600 [ 65.033784] uid 1000, uid_entry ffffffc0f2761600 [ 65.033791] uid 1000, uid_entry ffffffc0f2761600 [ 65.033798] uid 1000, uid_entry ffffffc0f2761600 So we can check the previous uid_entry before calling find_or_register_uid to save 
time. Change-Id: I05ec1a1405a80c0a620cb4b4b2f6483dbfde7829 Signed-off-by: Ganesh Mahendran --- drivers/misc/uid_sys_stats.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 091370f4ea40..3c9d311106cd 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -94,7 +94,7 @@ static struct uid_entry *find_or_register_uid(uid_t uid) static int uid_cputime_show(struct seq_file *m, void *v) { - struct uid_entry *uid_entry; + struct uid_entry *uid_entry = NULL; struct task_struct *task, *temp; struct user_namespace *user_ns = current_user_ns(); cputime_t utime; @@ -112,7 +112,8 @@ static int uid_cputime_show(struct seq_file *m, void *v) read_lock(&tasklist_lock); do_each_thread(temp, task) { uid = from_kuid_munged(user_ns, task_uid(task)); - uid_entry = find_or_register_uid(uid); + if (!uid_entry || uid_entry->uid != uid) + uid_entry = find_or_register_uid(uid); if (!uid_entry) { read_unlock(&tasklist_lock); rt_mutex_unlock(&uid_lock); @@ -251,7 +252,7 @@ static void compute_uid_io_bucket_stats(struct io_stats *io_bucket, static void update_io_stats_all_locked(void) { - struct uid_entry *uid_entry; + struct uid_entry *uid_entry = NULL; struct task_struct *task, *temp; struct user_namespace *user_ns = current_user_ns(); unsigned long bkt; @@ -264,7 +265,8 @@ static void update_io_stats_all_locked(void) rcu_read_lock(); do_each_thread(temp, task) { uid = from_kuid_munged(user_ns, task_uid(task)); - uid_entry = find_or_register_uid(uid); + if (!uid_entry || uid_entry->uid != uid) + uid_entry = find_or_register_uid(uid); if (!uid_entry) continue; add_uid_io_stats(uid_entry, task, UID_STATE_TOTAL_CURR); From d3135a2e661474a6c04c2957226e168004dc1824 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 8 Dec 2016 19:55:22 +0800 Subject: [PATCH 0885/3220] usb: gadget: f_fs: Fix possibe deadlock When system try to close /dev/usb-ffs/adb/ep0 on one core, at the same time another core try to attach new UDC, which will cause deadlock as below scenario. Thus we should release ffs lock before issuing unregister_gadget_item(). [ 52.642225] c1 ====================================================== [ 52.642228] c1 [ INFO: possible circular locking dependency detected ] [ 52.642236] c1 4.4.6+ #1 Tainted: G W O [ 52.642241] c1 ------------------------------------------------------- [ 52.642245] c1 usb ffs open/2808 is trying to acquire lock: [ 52.642270] c0 (udc_lock){+.+.+.}, at: [] usb_gadget_unregister_driver+0x3c/0xc8 [ 52.642272] c1 but task is already holding lock: [ 52.642283] c0 (ffs_lock){+.+.+.}, at: [] ffs_data_clear+0x30/0x140 [ 52.642285] c1 which lock already depends on the new lock. 
[ 52.642287] c1 the existing dependency chain (in reverse order) is: [ 52.642295] c0 -> #1 (ffs_lock){+.+.+.}: [ 52.642307] c0 [] __lock_acquire+0x20f0/0x2238 [ 52.642314] c0 [] lock_acquire+0xe4/0x298 [ 52.642322] c0 [] mutex_lock_nested+0x7c/0x3cc [ 52.642328] c0 [] ffs_func_bind+0x504/0x6e8 [ 52.642334] c0 [] usb_add_function+0x84/0x184 [ 52.642340] c0 [] configfs_composite_bind+0x264/0x39c [ 52.642346] c0 [] udc_bind_to_driver+0x58/0x11c [ 52.642352] c0 [] usb_udc_attach_driver+0x90/0xc8 [ 52.642358] c0 [] gadget_dev_desc_UDC_store+0xd4/0x128 [ 52.642369] c0 [] configfs_write_file+0xd0/0x13c [ 52.642376] c0 [] vfs_write+0xb8/0x214 [ 52.642381] c0 [] SyS_write+0x54/0xb0 [ 52.642388] c0 [] el0_svc_naked+0x24/0x28 [ 52.642395] c0 -> #0 (udc_lock){+.+.+.}: [ 52.642401] c0 [] print_circular_bug+0x84/0x2e4 [ 52.642407] c0 [] __lock_acquire+0x2138/0x2238 [ 52.642412] c0 [] lock_acquire+0xe4/0x298 [ 52.642420] c0 [] mutex_lock_nested+0x7c/0x3cc [ 52.642427] c0 [] usb_gadget_unregister_driver+0x3c/0xc8 [ 52.642432] c0 [] unregister_gadget_item+0x28/0x44 [ 52.642439] c0 [] ffs_data_clear+0x138/0x140 [ 52.642444] c0 [] ffs_data_reset+0x20/0x6c [ 52.642450] c0 [] ffs_data_closed+0xac/0x12c [ 52.642454] c0 [] ffs_ep0_release+0x20/0x2c [ 52.642460] c0 [] __fput+0xb0/0x1f4 [ 52.642466] c0 [] ____fput+0x20/0x2c [ 52.642473] c0 [] task_work_run+0xb4/0xe8 [ 52.642482] c0 [] do_exit+0x360/0xb9c [ 52.642487] c0 [] do_group_exit+0x4c/0xb0 [ 52.642494] c0 [] get_signal+0x380/0x89c [ 52.642501] c0 [] do_signal+0x154/0x518 [ 52.642507] c0 [] do_notify_resume+0x70/0x78 [ 52.642512] c0 [] work_pending+0x1c/0x20 [ 52.642514] c1 other info that might help us debug this: [ 52.642517] c1 Possible unsafe locking scenario: [ 52.642518] c1 CPU0 CPU1 [ 52.642520] c1 ---- ---- [ 52.642525] c0 lock(ffs_lock); [ 52.642529] c0 lock(udc_lock); [ 52.642533] c0 lock(ffs_lock); [ 52.642537] c0 lock(udc_lock); [ 52.642539] c1 *** DEADLOCK *** [ 52.642543] c1 1 lock held by usb ffs open/2808: [ 52.642555] c0 #0: (ffs_lock){+.+.+.}, at: [] ffs_data_clear+0x30/0x140 [ 52.642557] c1 stack backtrace: [ 52.642563] c1 CPU: 1 PID: 2808 Comm: usb ffs open Tainted: G [ 52.642565] c1 Hardware name: Spreadtrum SP9860g Board (DT) [ 52.642568] c1 Call trace: [ 52.642573] c1 [] dump_backtrace+0x0/0x170 [ 52.642577] c1 [] show_stack+0x20/0x28 [ 52.642583] c1 [] dump_stack+0xa8/0xe0 [ 52.642587] c1 [] print_circular_bug+0x1fc/0x2e4 [ 52.642591] c1 [] __lock_acquire+0x2138/0x2238 [ 52.642595] c1 [] lock_acquire+0xe4/0x298 [ 52.642599] c1 [] mutex_lock_nested+0x7c/0x3cc [ 52.642604] c1 [] usb_gadget_unregister_driver+0x3c/0xc8 [ 52.642608] c1 [] unregister_gadget_item+0x28/0x44 [ 52.642613] c1 [] ffs_data_clear+0x138/0x140 [ 52.642618] c1 [] ffs_data_reset+0x20/0x6c [ 52.642621] c1 [] ffs_data_closed+0xac/0x12c [ 52.642625] c1 [] ffs_ep0_release+0x20/0x2c [ 52.642629] c1 [] __fput+0xb0/0x1f4 [ 52.642633] c1 [] ____fput+0x20/0x2c [ 52.642636] c1 [] task_work_run+0xb4/0xe8 [ 52.642640] c1 [] do_exit+0x360/0xb9c [ 52.642644] c1 [] do_group_exit+0x4c/0xb0 [ 52.642647] c1 [] get_signal+0x380/0x89c [ 52.642651] c1 [] do_signal+0x154/0x518 [ 52.642656] c1 [] do_notify_resume+0x70/0x78 [ 52.642659] c1 [] work_pending+0x1c/0x20 Bug: 62572621 Change-Id: I2f6bea38c449ed56234a6e873d3cb33c2e83ff29 Acked-by: Michal Nazarewicz Signed-off-by: Baolin Wang Signed-off-by: Felipe Balbi (cherry picked from commit b3ce3ce02d146841af012d08506b4071db8ffde3) --- drivers/usb/gadget/function/f_fs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git 
a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 9ad5145d3103..194a54991497 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -3463,6 +3463,7 @@ static void ffs_closed(struct ffs_data *ffs) { struct ffs_dev *ffs_obj; struct f_fs_opts *opts; + struct config_item *ci; ENTER(); ffs_dev_lock(); @@ -3486,8 +3487,11 @@ static void ffs_closed(struct ffs_data *ffs) || !atomic_read(&opts->func_inst.group.cg_item.ci_kref.refcount)) goto done; - unregister_gadget_item(ffs_obj->opts-> - func_inst.group.cg_item.ci_parent->ci_parent); + ci = opts->func_inst.group.cg_item.ci_parent->ci_parent; + ffs_dev_unlock(); + + unregister_gadget_item(ci); + return; done: ffs_dev_unlock(); } From 44b35c3df9748f9caeab7915b7eacce713311fd0 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Tue, 13 Jun 2017 09:23:25 -0700 Subject: [PATCH 0886/3220] ANDROID: android-base.cfg: split out arm64-specific configs These config options are specific to arm64 so should not be universally required. Bug: 62523096 Change-Id: I52bcad68f32d5314032c6aa3f37402b2ffba79be Signed-off-by: Steve Muckle --- android/configs/android-base-arm64.cfg | 5 +++++ android/configs/android-base.cfg | 4 ---- 2 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 android/configs/android-base-arm64.cfg diff --git a/android/configs/android-base-arm64.cfg b/android/configs/android-base-arm64.cfg new file mode 100644 index 000000000000..43f23d6b5391 --- /dev/null +++ b/android/configs/android-base-arm64.cfg @@ -0,0 +1,5 @@ +# KEEP ALPHABETICALLY SORTED +CONFIG_ARMV8_DEPRECATED=y +CONFIG_CP15_BARRIER_EMULATION=y +CONFIG_SETEND_EMULATION=y +CONFIG_SWP_EMULATION=y diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index b974d01b84af..d675e712fe8c 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -12,7 +12,6 @@ CONFIG_ANDROID=y CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_LOW_MEMORY_KILLER=y -CONFIG_ARMV8_DEPRECATED=y CONFIG_ASHMEM=y CONFIG_AUDIT=y CONFIG_BLK_DEV_INITRD=y @@ -21,7 +20,6 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y -CONFIG_CP15_BARRIER_EMULATION=y CONFIG_DEFAULT_SECURITY_SELINUX=y CONFIG_EMBEDDED=y CONFIG_FB=y @@ -155,9 +153,7 @@ CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y CONFIG_SECURITY_SELINUX=y -CONFIG_SETEND_EMULATION=y CONFIG_STAGING=y -CONFIG_SWP_EMULATION=y CONFIG_SYNC=y CONFIG_TUN=y CONFIG_UID_SYS_STATS=y From d769a181e9f03ad35cd3e61943e684b1f50a0e11 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Tue, 16 May 2017 16:48:49 -0700 Subject: [PATCH 0887/3220] ANDROID: sdcardfs: remove dead function open_flags_to_access_mode() smatch warns about the suspicious formatting in the last line of open_flags_to_access_mode(). It turns out the only caller was deleted over a year ago by "ANDROID: sdcardfs: Bring up to date with Android M permissions:", so we can "fix" the function's formatting by deleting it. 
Change-Id: Id85946f3eb01722eef35b1815f405a6fda3aa4ff Signed-off-by: Greg Hackmann --- fs/sdcardfs/packagelist.c | 13 ------------- fs/sdcardfs/sdcardfs.h | 1 - 2 files changed, 14 deletions(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 8495474258d5..66c6f8e72a02 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -174,19 +174,6 @@ int check_caller_access_to_name(struct inode *parent_node, const struct qstr *na return 1; } -/* This function is used when file opening. The open flags must be - * checked before calling check_caller_access_to_name() - */ -int open_flags_to_access_mode(int open_flags) -{ - if ((open_flags & O_ACCMODE) == O_RDONLY) - return 0; /* R_OK */ - if ((open_flags & O_ACCMODE) == O_WRONLY) - return 1; /* W_OK */ - /* Probably O_RDRW, but treat as default to be safe */ - return 1; /* R_OK | W_OK */ -} - static struct hashtable_entry *alloc_hashtable_entry(const struct qstr *key, appid_t value) { diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 31d37d7da4f9..b8624fd8b817 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -499,7 +499,6 @@ extern appid_t get_appid(const char *app_name); extern appid_t get_ext_gid(const char *app_name); extern appid_t is_excluded(const char *app_name, userid_t userid); extern int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name); -extern int open_flags_to_access_mode(int open_flags); extern int packagelist_init(void); extern void packagelist_exit(void); From 1200efcca9b5174bc8de5ac8440f49fab3bcd0f8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 24 Apr 2016 00:56:03 -0400 Subject: [PATCH 0888/3220] BACKPORT: ext4: fix data exposure after a crash Huang has reported that in his powerfail testing he is seeing stale block contents in some recently allocated blocks although he mounts ext4 in data=ordered mode. After some investigation I have found out that indeed when delayed allocation is used, we don't add the inode to the transaction's list of inodes needing flushing before commit. Originally we were doing that, but commit f3b59291a69d removed the logic with a flawed argument that it is not needed. The problem is that although for delayed allocated blocks we write their contents immediately after allocating them, there is no guarantee that the IO scheduler or device doesn't reorder things, and thus the transaction allocating blocks and attaching them to the inode can reach stable storage before the actual block contents. Actually, whenever we attach freshly allocated blocks to the inode using a written extent, we should add the inode to the transaction's ordered inode list to make sure we properly wait for the block contents to be written before committing the transaction. So that is what we do in this patch. This also handles other cases where stale data exposure was possible - like filling a hole via mmap in data=ordered,nodelalloc mode. The only exception to the above rule is extending direct IO writes, where blkdev_direct_IO() waits for the IO to complete before increasing i_size, and thus stale data exposure is not possible. For now we don't complicate the code with optimizing this special case since the overhead is pretty low. In case this is observed to be a performance problem we can always handle it using a special flag to ext4_map_blocks().
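The heart of the fix is a single rule in ext4_map_blocks(): freshly allocated blocks mapped as written extents must put the inode on the transaction's ordered list. A condensed sketch of that check, matching the hunk in the diff below:

	/* after a successful block allocation in ext4_map_blocks() */
	if (map->m_flags & EXT4_MAP_NEW &&
	    !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
	    !IS_NOQUOTA(inode) &&
	    ext4_should_order_data(inode)) {
		/* make jbd2 wait for the data before committing */
		ret = ext4_jbd2_file_inode(handle, inode);
		if (ret)
			return ret;
	}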
CC: stable@vger.kernel.org Fixes: f3b59291a69d0b734be1fc8be489fef2dd846d3d Reported-by: "HUANG Weller (CM/ESW12-CN)" Tested-by: "HUANG Weller (CM/ESW12-CN)" Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o (cherry picked from commit 06bd3c36a733ac27962fea7d6f47168841376824) Signed-off-by: Connor O'Brien Bug: 62198330 Change-Id: Idc78b64e4f23e6085301c60057af6029b49a8193 --- fs/ext4/inode.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e3d425eeab4a..2ac719290869 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -659,6 +659,20 @@ has_zeroout: ret = check_block_validity(inode, map); if (ret != 0) return ret; + + /* + * Inodes with freshly allocated blocks where contents will be + * visible after transaction commit must be on transaction's + * ordered data list. + */ + if (map->m_flags & EXT4_MAP_NEW && + !(map->m_flags & EXT4_MAP_UNWRITTEN) && + !IS_NOQUOTA(inode) && + ext4_should_order_data(inode)) { + ret = ext4_jbd2_file_inode(handle, inode); + if (ret) + return ret; + } } return retval; } @@ -1164,15 +1178,6 @@ static int ext4_write_end(struct file *file, trace_android_fs_datawrite_end(inode, pos, len); trace_ext4_write_end(inode, pos, len, copied); - if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { - ret = ext4_jbd2_file_inode(handle, inode); - if (ret) { - unlock_page(page); - page_cache_release(page); - goto errout; - } - } - if (ext4_has_inline_data(inode)) { ret = ext4_write_inline_data_end(inode, pos, len, copied, page); From 6bb6b3e686cc90cd4b75541a6f1c95a98e0377bc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 8 May 2017 00:04:09 +0200 Subject: [PATCH 0889/3220] UPSTREAM: bpf: don't let ldimm64 leak map addresses on unprivileged [ Upstream commit 0d0e57697f162da4aa218b5feafe614fb666db07 ] The patch fixes two things at once: 1) It checks the env->allow_ptr_leaks and only prints the map address to the log if we have the privileges to do so, otherwise it just dumps 0 as we would when kptr_restrict is enabled on %pK. Given the latter is off by default and not every distro sets it, I don't want to rely on this, hence the 0 by default for unprivileged. 2) Printing of ldimm64 in the verifier log is currently broken in that we don't print the full immediate, but only the 32 bit part of the first insn part for ldimm64. Thus, fix this up as well; it's okay to access, since we verified all ldimm64 earlier already (including just constants) through replace_map_fd_with_map_ptr(). Fixes: 1be7f75d1668 ("bpf: enable non-root eBPF programs") Fixes: cbd357008604 ("bpf: verifier (add ability to receive verification log)") Reported-by: Jann Horn Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman Bug: 62199770 Change-Id: I62ee47d06ddc669ba2863e8cf24f8f3e7683a461 --- kernel/bpf/verifier.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2cbfba78d3db..85de5094b936 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -313,7 +313,8 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; -static void print_bpf_insn(struct bpf_insn *insn) +static void print_bpf_insn(const struct verifier_env *env, + const struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); @@ -377,9 +378,19 @@ static void print_bpf_insn(struct bpf_insn *insn) insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IMM) { - verbose("(%02x) r%d = 0x%x\n", - insn->code, insn->dst_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM && + BPF_SIZE(insn->code) == BPF_DW) { + /* At this point, we already made sure that the second + * part of the ldimm64 insn is accessible. + */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + + if (map_ptr && !env->allow_ptr_leaks) + imm = 0; + + verbose("(%02x) r%d = 0x%llx\n", insn->code, + insn->dst_reg, (unsigned long long)imm); } else { verbose("BUG_ld_%02x\n", insn->code); return; @@ -1758,7 +1769,7 @@ static int do_check(struct verifier_env *env) if (log_level) { verbose("%d: ", insn_idx); - print_bpf_insn(insn); + print_bpf_insn(env, insn); } if (class == BPF_ALU || class == BPF_ALU64) { From fa60377966b77928967294ee8d0cbce93d6648ad Mon Sep 17 00:00:00 2001 From: William Wu Date: Tue, 25 Apr 2017 17:45:48 +0800 Subject: [PATCH 0890/3220] UPSTREAM: usb: gadget: f_fs: avoid out of bounds access on comp_desc The Companion descriptor is only used for SuperSpeed endpoints; if the endpoints are HighSpeed or FullSpeed, the Companion descriptor will not be allocated, so we can only access it if the gadget is SuperSpeed. I can reproduce this issue on a Rockchip rk3368 SoC, which supports USB 2.0, using functionfs for ADB. A kernel built with CONFIG_KASAN=y and CONFIG_SLUB_DEBUG=y reports the following BUG: ================================================================== BUG: KASAN: slab-out-of-bounds in ffs_func_set_alt+0x224/0x3a0 at addr ffffffc0601f6509 Read of size 1 by task swapper/0/0 ============================================================================ BUG kmalloc-256 (Not tainted): kasan: bad access detected ---------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in ffs_func_bind+0x52c/0x99c age=1275 cpu=0 pid=1 alloc_debug_processing+0x128/0x17c ___slab_alloc.constprop.58+0x50c/0x610 __slab_alloc.isra.55.constprop.57+0x24/0x34 __kmalloc+0xe0/0x250 ffs_func_bind+0x52c/0x99c usb_add_function+0xd8/0x1d4 configfs_composite_bind+0x48c/0x570 udc_bind_to_driver+0x6c/0x170 usb_udc_attach_driver+0xa4/0xd0 gadget_dev_desc_UDC_store+0xcc/0x118 configfs_write_file+0x1a0/0x1f8 __vfs_write+0x64/0x174 vfs_write+0xe4/0x200 SyS_write+0x68/0xc8 el0_svc_naked+0x24/0x28 INFO: Freed in inode_doinit_with_dentry+0x3f0/0x7c4 age=1275 cpu=7 pid=247 ...
Call trace: [] dump_backtrace+0x0/0x230 [] show_stack+0x14/0x1c [] dump_stack+0xa0/0xc8 [] print_trailer+0x188/0x198 [] object_err+0x3c/0x4c [] kasan_report+0x324/0x4dc [] __asan_load1+0x24/0x50 [] ffs_func_set_alt+0x224/0x3a0 [] composite_setup+0xdcc/0x1ac8 [] android_setup+0x124/0x1a0 [] _setup+0x54/0x74 [] handle_ep0+0x3288/0x4390 [] dwc_otg_pcd_handle_out_ep_intr+0x14dc/0x2ae4 [] dwc_otg_pcd_handle_intr+0x1ec/0x298 [] dwc_otg_pcd_irq+0x10/0x20 [] handle_irq_event_percpu+0x124/0x3ac [] handle_irq_event+0x60/0xa0 [] handle_fasteoi_irq+0x10c/0x1d4 [] generic_handle_irq+0x30/0x40 [] __handle_domain_irq+0xac/0xdc [] gic_handle_irq+0x64/0xa4 ... Memory state around the buggy address: ffffffc0601f6400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffffffc0601f6480: 00 00 00 00 00 00 00 00 00 00 06 fc fc fc fc fc >ffffffc0601f6500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ^ ffffffc0601f6580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffffffc0601f6600: fc fc fc fc fc fc fc fc 00 00 00 00 00 00 00 00 ================================================================== Signed-off-by: William Wu Signed-off-by: Felipe Balbi (cherry picked from commit b7f73850bb4fac1e2209a4dd5e636d39be92f42c) Signed-off-by: Jerry Zhang Signed-off-by: Jerry Zhang --- drivers/usb/gadget/function/f_fs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 194a54991497..732e6ed5d7b4 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -1668,12 +1668,12 @@ static int ffs_func_eps_enable(struct ffs_function *func) ep->ep->driver_data = ep; ep->ep->desc = ds; - comp_desc = (struct usb_ss_ep_comp_descriptor *)(ds + - USB_DT_ENDPOINT_SIZE); - ep->ep->maxburst = comp_desc->bMaxBurst + 1; - - if (needs_comp_desc) + if (needs_comp_desc) { + comp_desc = (struct usb_ss_ep_comp_descriptor *)(ds + + USB_DT_ENDPOINT_SIZE); + ep->ep->maxburst = comp_desc->bMaxBurst + 1; ep->ep->comp_desc = comp_desc; + } ret = usb_ep_enable(ep->ep); if (likely(!ret)) { From 575a44c7cc5e526ff0311eb66d5c6cb2d931406c Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 20 Jun 2017 17:05:33 -0700 Subject: [PATCH 0891/3220] ANDROID: squashfs: Fix signed division issue The value here can change depending on the type that PAGE_SIZE has on a given architecture. 
To avoid the ensuing signed and unsigned division conversions, we shift using PAGE_SHIFT instead. Signed-off-by: Daniel Rosenberg Bug: 35257858 Change-Id: I132cae93abea39390c3f0f91a4b2e026e97ed4c7 --- fs/squashfs/block.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 2eb66decc5ab..4e3e0863f5ea 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -211,8 +211,8 @@ static int bh_is_optional(struct squashfs_read_request *req, int idx) int start_idx, end_idx; struct squashfs_sb_info *msblk = req->sb->s_fs_info; - start_idx = (idx * msblk->devblksize - req->offset) / PAGE_CACHE_SIZE; - end_idx = ((idx + 1) * msblk->devblksize - req->offset + 1) / PAGE_CACHE_SIZE; + start_idx = (idx * msblk->devblksize - req->offset) >> PAGE_SHIFT; + end_idx = ((idx + 1) * msblk->devblksize - req->offset + 1) >> PAGE_SHIFT; if (start_idx >= req->output->pages) return 1; if (start_idx < 0) From 16759392d588c6042e4496943ba718ef403e9735 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 26 Jun 2017 18:21:56 -0700 Subject: [PATCH 0892/3220] ANDROID: squashfs: Fix endianness issue Code in squashfs_process_blocks was not correctly assigning the length: casting to u16* introduced endianness issues on some architectures. Signed-off-by: Daniel Rosenberg Bug: 35257858 Change-Id: I9efaef4bc531b7469de79cf94738ade2dd6e6a8c --- fs/squashfs/block.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 4e3e0863f5ea..b3b95e2ae2ff 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -121,11 +121,12 @@ static void squashfs_process_blocks(struct squashfs_read_request *req) if (req->data_processing == SQUASHFS_METADATA) { /* Extract the length of the metadata block */ - if (req->offset != msblk->devblksize - 1) - length = *((u16 *)(bh[0]->b_data + req->offset)); - else { - length = bh[0]->b_data[req->offset]; - length |= bh[1]->b_data[0] << 8; + if (req->offset != msblk->devblksize - 1) { + length = le16_to_cpup((__le16 *) + (bh[0]->b_data + req->offset)); + } else { + length = (unsigned char)bh[0]->b_data[req->offset]; + length |= (unsigned char)bh[1]->b_data[0] << 8; } req->compressed = SQUASHFS_COMPRESSED(length); req->data_processing = req->compressed ? SQUASHFS_DECOMPRESS From fbea122e3ff597c35adbdad04415765b26be8d85 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Tue, 23 Feb 2016 18:22:39 +0000 Subject: [PATCH 0893/3220] UPSTREAM: drivers/perf: arm_pmu: implement CPU_PM notifier (cherry picked from commit da4e4f18afe0f3729d68f3785c5802f786d36e34) When a CPU is suspended (either through suspend-to-RAM or CPUidle), the contents of its PMU registers can be lost, which means that counter register values initialized on power-down entry have to be reprogrammed on power-up to make sure the counter set-up is preserved (i.e. on power-up the registers take the Cold or Warm reset values, which can be architecturally UNKNOWN). To guarantee seamless profiling across a core power-down, this patch adds a CPU PM notifier to ARM PMUs that, on CPU PM entry to and exit from low-power states, saves/restores the PMU register set-up (using the ARM perf API), so that the power-down/up cycle does not affect perf behaviour (apart from an unavoidable black-out period between the power-up/down CPU PM notifications).
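The mechanics of the change are a standard CPU PM notifier: register a callback that reacts to low-power entry/exit events. A skeletal sketch of the pattern (the full implementation, including the per-event counter save/restore, follows in the diff):

	#include <linux/cpu_pm.h>

	static int cpu_pm_pmu_notify(struct notifier_block *b,
				     unsigned long cmd, void *v)
	{
		switch (cmd) {
		case CPU_PM_ENTER:
			/* stop the PMU, save per-event counter state */
			break;
		case CPU_PM_EXIT:
		case CPU_PM_ENTER_FAILED:
			/* reset the PMU, restore counters, restart */
			break;
		default:
			return NOTIFY_DONE;
		}
		return NOTIFY_OK;
	}

	/* registered once per PMU via cpu_pm_register_notifier() */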
Change-Id: Ifbd73b82ca9dc172c58e2488cda1af9af975b14f Cc: Will Deacon Cc: Sudeep Holla Cc: Daniel Lezcano Cc: Mathieu Poirier Cc: Mark Rutland Acked-by: Ashwin Chaugule Acked-by: Kevin Hilman Signed-off-by: Lorenzo Pieralisi Signed-off-by: Will Deacon --- drivers/perf/arm_pmu.c | 95 ++++++++++++++++++++++++++++++++++++ include/linux/perf/arm_pmu.h | 1 + 2 files changed, 96 insertions(+) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 8af1f900ea65..85b445c50fee 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -718,6 +719,93 @@ static int cpu_pmu_notify(struct notifier_block *b, unsigned long action, return NOTIFY_OK; } +#ifdef CONFIG_CPU_PM +static void cpu_pm_pmu_setup(struct arm_pmu *armpmu, unsigned long cmd) +{ + struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); + struct perf_event *event; + int idx; + + for (idx = 0; idx < armpmu->num_events; idx++) { + /* + * If the counter is not used skip it, there is no + * need of stopping/restarting it. + */ + if (!test_bit(idx, hw_events->used_mask)) + continue; + + event = hw_events->events[idx]; + + switch (cmd) { + case CPU_PM_ENTER: + /* + * Stop and update the counter + */ + armpmu_stop(event, PERF_EF_UPDATE); + break; + case CPU_PM_EXIT: + case CPU_PM_ENTER_FAILED: + /* Restore and enable the counter */ + armpmu_start(event, PERF_EF_RELOAD); + break; + default: + break; + } + } +} + +static int cpu_pm_pmu_notify(struct notifier_block *b, unsigned long cmd, + void *v) +{ + struct arm_pmu *armpmu = container_of(b, struct arm_pmu, cpu_pm_nb); + struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); + int enabled = bitmap_weight(hw_events->used_mask, armpmu->num_events); + + if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus)) + return NOTIFY_DONE; + + /* + * Always reset the PMU registers on power-up even if + * there are no events running. 
+ */ + if (cmd == CPU_PM_EXIT && armpmu->reset) + armpmu->reset(armpmu); + + if (!enabled) + return NOTIFY_OK; + + switch (cmd) { + case CPU_PM_ENTER: + armpmu->stop(armpmu); + cpu_pm_pmu_setup(armpmu, cmd); + break; + case CPU_PM_EXIT: + cpu_pm_pmu_setup(armpmu, cmd); + case CPU_PM_ENTER_FAILED: + armpmu->start(armpmu); + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +static int cpu_pm_pmu_register(struct arm_pmu *cpu_pmu) +{ + cpu_pmu->cpu_pm_nb.notifier_call = cpu_pm_pmu_notify; + return cpu_pm_register_notifier(&cpu_pmu->cpu_pm_nb); +} + +static void cpu_pm_pmu_unregister(struct arm_pmu *cpu_pmu) +{ + cpu_pm_unregister_notifier(&cpu_pmu->cpu_pm_nb); +} +#else +static inline int cpu_pm_pmu_register(struct arm_pmu *cpu_pmu) { return 0; } +static inline void cpu_pm_pmu_unregister(struct arm_pmu *cpu_pmu) { } +#endif + static int cpu_pmu_init(struct arm_pmu *cpu_pmu) { int err; @@ -733,6 +821,10 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu) if (err) goto out_hw_events; + err = cpu_pm_pmu_register(cpu_pmu); + if (err) + goto out_unregister; + for_each_possible_cpu(cpu) { struct pmu_hw_events *events = per_cpu_ptr(cpu_hw_events, cpu); raw_spin_lock_init(&events->pmu_lock); @@ -754,6 +846,8 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu) return 0; +out_unregister: + unregister_cpu_notifier(&cpu_pmu->hotplug_nb); out_hw_events: free_percpu(cpu_hw_events); return err; @@ -761,6 +855,7 @@ out_hw_events: static void cpu_pmu_destroy(struct arm_pmu *cpu_pmu) { + cpu_pm_pmu_unregister(cpu_pmu); unregister_cpu_notifier(&cpu_pmu->hotplug_nb); free_percpu(cpu_pmu->hw_events); } diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index bfa673bb822d..0e55e4016f49 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -107,6 +107,7 @@ struct arm_pmu { struct platform_device *plat_device; struct pmu_hw_events __percpu *hw_events; struct notifier_block hotplug_nb; + struct notifier_block cpu_pm_nb; }; #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu)) From 25152dbcc1e5c0278670436e1f1e1f51320529c5 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Thu, 21 Apr 2016 10:24:34 +0100 Subject: [PATCH 0894/3220] UPSTREAM: drivers/perf: arm-pmu: fix RCU usage on pmu resume from low-power (cherry picked from cbcc72e037b8a3eb1fad3c1ae22021df21c97a51) Commit da4e4f18afe0 ("drivers/perf: arm_pmu: implement CPU_PM notifier") added code in the arm perf infrastructure that allows the kernel to save/restore perf counters whenever the CPU enters a low-power state. The kernel saves/restores the counters for each active event through the armpmu_{stop/start} ARM pmu API, so that the low-power state enter/exit cycle is emulated through pmu start/stop operations for each event in use. However, calling armpmu_start() for each active event on power-up executes code that requires RCU locking (perf_event_update_userpage()) to be functional, so, given that the core may call the CPU_PM notifiers while running the idle thread in a quiescent RCU state, this is not allowed, as detected through the following splat when the kernel is run with CONFIG_PROVE_LOCKING enabled: [ 49.293286] [ 49.294761] =============================== [ 49.298895] [ INFO: suspicious RCU usage. ] [ 49.303031] 4.6.0-rc3+ #421 Not tainted [ 49.306821] ------------------------------- [ 49.310956] include/linux/rcupdate.h:872 rcu_read_lock() used illegally while idle!
[ 49.318530] [ 49.318530] other info that might help us debug this: [ 49.318530] [ 49.326451] [ 49.326451] RCU used illegally from idle CPU! [ 49.326451] rcu_scheduler_active = 1, debug_locks = 0 [ 49.337209] RCU used illegally from extended quiescent state! [ 49.342892] 2 locks held by swapper/2/0: [ 49.346768] #0: (cpu_pm_notifier_lock){......}, at: [] cpu_pm_exit+0x18/0x80 [ 49.355492] #1: (rcu_read_lock){......}, at: [] perf_event_update_userpage+0x0/0x260 This patch wraps the armpmu_start() call (that indirectly calls perf_event_update_userpage()) on CPU_PM notifier power state exit (or failed entry) within the RCU_NONIDLE() macro so that the RCU subsystem is made aware the calling cpu is not idle from an RCU perspective for the armpmu_start() call duration, therefore fixing the issue. Change-Id: I6352b3e7970a2116d23e258fa085f0c29982f789 Fixes: da4e4f18afe0 ("drivers/perf: arm_pmu: implement CPU_PM notifier") Signed-off-by: Lorenzo Pieralisi Reported-by: James Morse Suggested-by: Kevin Hilman Cc: Ashwin Chaugule Cc: Kevin Hilman Cc: Sudeep Holla Cc: Daniel Lezcano Cc: Mathieu Poirier Acked-by: Mark Rutland Acked-by: Paul E. McKenney Acked-by: Will Deacon Signed-off-by: Catalin Marinas --- drivers/perf/arm_pmu.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 85b445c50fee..64b07b2d47f8 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -745,8 +745,19 @@ static void cpu_pm_pmu_setup(struct arm_pmu *armpmu, unsigned long cmd) break; case CPU_PM_EXIT: case CPU_PM_ENTER_FAILED: - /* Restore and enable the counter */ - armpmu_start(event, PERF_EF_RELOAD); + /* + * Restore and enable the counter. + * armpmu_start() indirectly calls + * + * perf_event_update_userpage() + * + * that requires RCU read locking to be functional, + * wrap the call within RCU_NONIDLE to make the + * RCU subsystem aware this cpu is not idle from + * an RCU perspective for the armpmu_start() call + * duration. + */ + RCU_NONIDLE(armpmu_start(event, PERF_EF_RELOAD)); break; default: break; From ebca043d15b000bc243540cb93109f818766fdf5 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Tue, 20 Jun 2017 09:35:33 -0700 Subject: [PATCH 0895/3220] UPSTREAM: selinux: enable genfscon labeling for tracefs In kernel version 4.1, tracefs was separated from debugfs into its own filesystem. Prior to this split, files in /sys/kernel/debug/tracing could be labeled during filesystem creation using genfscon or later from userspace using setxattr. This change re-enables support for genfscon labeling. 
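With tracefs in the list, SE_SBGENFS is set when the filesystem is mounted, so a policy rule of the form "genfscon tracefs / <context>" once again labels trace files as it did before the tracefs/debugfs split; the context itself is defined by the installed policy, not by this patch.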
Signed-off-by: Jeff Vander Stoep Acked-by: Stephen Smalley Signed-off-by: Paul Moore (cherry picked from commit 6a3911837da0a90ed599fd0a9836472f5e7ddf1b) Change-Id: I98ad8c829302346705c1abcdc8f019f479fdefb6 Bug: 62413700 --- security/selinux/hooks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 2bc8b555bca9..34427384605d 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -740,6 +740,7 @@ static int selinux_set_mnt_opts(struct super_block *sb, sbsec->flags |= SE_SBPROC | SE_SBGENFS; if (!strcmp(sb->s_type->name, "debugfs") || + !strcmp(sb->s_type->name, "tracefs") || !strcmp(sb->s_type->name, "sysfs") || !strcmp(sb->s_type->name, "pstore")) sbsec->flags |= SE_SBGENFS; From 92cc35433ddd0cba7a03fc19e88f8c854ee02055 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 9 May 2017 12:30:56 +0800 Subject: [PATCH 0896/3220] ANDROID: sdcardfs: use mount_nodev and fix an issue in sdcardfs_kill_sb Use the VFS mount_nodev() instead of the customized mount_nodev_with_options(), and switch sdcardfs_kill_sb() from generic_shutdown_super() to kill_anon_super(), since a superblock created with set_anon_super() must be torn down with kill_anon_super(). Signed-off-by: Gao Xiang Change-Id: Ibe46647aa2ce49d79291aa9d0295e9625cfccd80 --- fs/sdcardfs/main.c | 47 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 3c5b51d49d21..80825b287836 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -364,41 +364,34 @@ out: return err; } -/* A feature which supports mount_nodev() with options */ -static struct dentry *mount_nodev_with_options(struct vfsmount *mnt, - struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, - int (*fill_super)(struct vfsmount *, struct super_block *, - const char *, void *, int)) +struct sdcardfs_mount_private { + struct vfsmount *mnt; + const char *dev_name; + void *raw_data; +}; +static int __sdcardfs_fill_super( + struct super_block *sb, + void *_priv, int silent) { - int error; - struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); + struct sdcardfs_mount_private *priv = _priv; - if (IS_ERR(s)) - return ERR_CAST(s); - - s->s_flags = flags; - - error = fill_super(mnt, s, dev_name, data, flags & MS_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - s->s_flags |= MS_ACTIVE; - return dget(s->s_root); + return sdcardfs_read_super(priv->mnt, + sb, priv->dev_name, priv->raw_data, silent); } static struct dentry *sdcardfs_mount(struct vfsmount *mnt, struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data) { - /* - * dev_name is a lower_path_name, - * raw_data is a option string.
*/ - return mount_nodev_with_options(mnt, fs_type, flags, dev_name, - raw_data, sdcardfs_read_super); + struct sdcardfs_mount_private priv = { + .mnt = mnt, + .dev_name = dev_name, + .raw_data = raw_data + }; + + return mount_nodev(fs_type, flags, + &priv, __sdcardfs_fill_super); } static struct dentry *sdcardfs_mount_wrn(struct file_system_type *fs_type, @@ -423,7 +416,7 @@ void sdcardfs_kill_sb(struct super_block *sb) list_del(&sbi->list); mutex_unlock(&sdcardfs_super_list_lock); } - generic_shutdown_super(sb); + kill_anon_super(sb); } static struct file_system_type sdcardfs_fs_type = { From 97841e574afb85dca93016347efee005d8150efc Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Tue, 11 Jul 2017 10:06:37 -0700 Subject: [PATCH 0897/3220] ANDROID: android-base.cfg: remove CONFIG_CGROUP_DEBUG This config option is not required by Android. Bug: 63578267 Change-Id: I163fa19183734a1a343d525e885a000a495c242e Signed-off-by: Steve Muckle --- android/configs/android-base.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index d675e712fe8c..28dce6c0516d 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -17,7 +17,6 @@ CONFIG_AUDIT=y CONFIG_BLK_DEV_INITRD=y CONFIG_CGROUPS=y CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_DEFAULT_SECURITY_SELINUX=y From d368c6faa19ba98d26e713d3bf258cc6466d8ac2 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Thu, 1 Jun 2017 10:59:12 -0700 Subject: [PATCH 0898/3220] sched: walt: fix window misalignment when HZ=300 Due to a rounding error, the hrtimer tick interval becomes 3333333 ns when HZ=300. Consequently, the tick timestamp nearest to WALT's default window size of 20ms will also be 19999998 ns (3333333 * 6). Change-Id: I08f9bd2dbecccbb683e4490d06d8b0da703d3ab2 Suggested-by: Joel Fernandes Signed-off-by: Joonwoo Park --- kernel/sched/walt.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 6e053bd9830c..92c3aae8e056 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -72,7 +72,15 @@ static cpumask_t mpc_mask = CPU_MASK_ALL; __read_mostly unsigned int walt_ravg_window = 20000000; /* Min window size (in ns) = 10ms */ +#ifdef CONFIG_HZ_300 +/* + * Tick interval becomes 3333333 ns due to + * rounding error when HZ=300. + */ +#define MIN_SCHED_RAVG_WINDOW (3333333 * 6) +#else #define MIN_SCHED_RAVG_WINDOW 10000000 +#endif /* Max window size (in ns) = 1s */ #define MAX_SCHED_RAVG_WINDOW 1000000000
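The arithmetic behind the new minimum: integer division truncates the HZ=300 tick period, and six such ticks land at 19999998 ns, just under the 20 ms default window. A standalone sketch checking the numbers (not part of the patch):

	#include <assert.h>

	#define NSEC_PER_SEC	1000000000L
	#define TICK_NSEC_HZ300	(NSEC_PER_SEC / 300)	/* truncates to 3333333 */

	static_assert(TICK_NSEC_HZ300 == 3333333, "tick period rounds down");
	static_assert(6 * TICK_NSEC_HZ300 == 19999998, "six ticks fall just short of 20 ms");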
Signed-off-by: Todd Kjos Change-Id: Ib507d62803f2beba7178c3f6f3f78bd1095b25b8 --- drivers/android/binder.c | 392 +++++++++++++-------------------------- 1 file changed, 133 insertions(+), 259 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 9cf4f9bbc711..08cde76875d2 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -18,7 +18,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include #include #include #include @@ -47,11 +46,19 @@ #include #include "binder_trace.h" +static DEFINE_MUTEX(binder_main_lock); +static DEFINE_MUTEX(binder_deferred_lock); +static DEFINE_MUTEX(binder_mmap_lock); + static HLIST_HEAD(binder_devices); +static HLIST_HEAD(binder_procs); +static HLIST_HEAD(binder_deferred_list); +static HLIST_HEAD(binder_dead_nodes); static struct dentry *binder_debugfs_dir_entry_root; static struct dentry *binder_debugfs_dir_entry_proc; -atomic_t binder_last_id; +static int binder_last_id; +static struct workqueue_struct *binder_deferred_workqueue; #define BINDER_DEBUG_ENTRY(name) \ static int binder_##name##_open(struct inode *inode, struct file *file) \ @@ -166,24 +173,20 @@ enum binder_stat_types { struct binder_stats { int br[_IOC_NR(BR_FAILED_REPLY) + 1]; int bc[_IOC_NR(BC_REPLY_SG) + 1]; + int obj_created[BINDER_STAT_COUNT]; + int obj_deleted[BINDER_STAT_COUNT]; }; -/* These are still global, since it's not always easy to get the context */ -struct binder_obj_stats { - atomic_t obj_created[BINDER_STAT_COUNT]; - atomic_t obj_deleted[BINDER_STAT_COUNT]; -}; - -static struct binder_obj_stats binder_obj_stats; +static struct binder_stats binder_stats; static inline void binder_stats_deleted(enum binder_stat_types type) { - atomic_inc(&binder_obj_stats.obj_deleted[type]); + binder_stats.obj_deleted[type]++; } static inline void binder_stats_created(enum binder_stat_types type) { - atomic_inc(&binder_obj_stats.obj_created[type]); + binder_stats.obj_created[type]++; } struct binder_transaction_log_entry { @@ -204,6 +207,8 @@ struct binder_transaction_log { int full; struct binder_transaction_log_entry entry[32]; }; +static struct binder_transaction_log binder_transaction_log; +static struct binder_transaction_log binder_transaction_log_failed; static struct binder_transaction_log_entry *binder_transaction_log_add( struct binder_transaction_log *log) @@ -224,21 +229,6 @@ struct binder_context { struct binder_node *binder_context_mgr_node; kuid_t binder_context_mgr_uid; const char *name; - - struct mutex binder_main_lock; - struct mutex binder_deferred_lock; - struct mutex binder_mmap_lock; - - struct hlist_head binder_procs; - struct hlist_head binder_dead_nodes; - struct hlist_head binder_deferred_list; - - struct work_struct deferred_work; - struct workqueue_struct *binder_deferred_workqueue; - struct binder_transaction_log transaction_log; - struct binder_transaction_log transaction_log_failed; - - struct binder_stats binder_stats; }; struct binder_device { @@ -461,18 +451,17 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd) return retval; } -static inline void binder_lock(struct binder_context *context, const char *tag) +static inline void binder_lock(const char *tag) { trace_binder_lock(tag); - mutex_lock(&context->binder_main_lock); + mutex_lock(&binder_main_lock); trace_binder_locked(tag); } -static inline void binder_unlock(struct binder_context *context, - const char *tag) +static inline void binder_unlock(const char *tag) { trace_binder_unlock(tag); - mutex_unlock(&context->binder_main_lock); 
+ mutex_unlock(&binder_main_lock); } static void binder_set_nice(long nice) @@ -957,7 +946,7 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, binder_stats_created(BINDER_STAT_NODE); rb_link_node(&node->rb_node, parent, p); rb_insert_color(&node->rb_node, &proc->nodes); - node->debug_id = atomic_inc_return(&binder_last_id); + node->debug_id = ++binder_last_id; node->proc = proc; node->ptr = ptr; node->cookie = cookie; @@ -1099,7 +1088,7 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, if (new_ref == NULL) return NULL; binder_stats_created(BINDER_STAT_REF); - new_ref->debug_id = atomic_inc_return(&binder_last_id); + new_ref->debug_id = ++binder_last_id; new_ref->proc = proc; new_ref->node = node; rb_link_node(&new_ref->rb_node_node, parent, p); @@ -1859,7 +1848,7 @@ static void binder_transaction(struct binder_proc *proc, binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; - e = binder_transaction_log_add(&context->transaction_log); + e = binder_transaction_log_add(&binder_transaction_log); e->call_type = reply ? 2 : !!(tr->flags & TF_ONE_WAY); e->from_proc = proc->pid; e->from_thread = thread->pid; @@ -1981,7 +1970,7 @@ static void binder_transaction(struct binder_proc *proc, } binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE); - t->debug_id = atomic_inc_return(&binder_last_id); + t->debug_id = ++binder_last_id; e->debug_id = t->debug_id; if (reply) @@ -2245,8 +2234,7 @@ err_no_context_mgr_node: { struct binder_transaction_log_entry *fe; - fe = binder_transaction_log_add( - &context->transaction_log_failed); + fe = binder_transaction_log_add(&binder_transaction_log_failed); *fe = *e; } @@ -2274,8 +2262,8 @@ static int binder_thread_write(struct binder_proc *proc, return -EFAULT; ptr += sizeof(uint32_t); trace_binder_command(cmd); - if (_IOC_NR(cmd) < ARRAY_SIZE(context->binder_stats.bc)) { - context->binder_stats.bc[_IOC_NR(cmd)]++; + if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.bc)) { + binder_stats.bc[_IOC_NR(cmd)]++; proc->stats.bc[_IOC_NR(cmd)]++; thread->stats.bc[_IOC_NR(cmd)]++; } @@ -2640,8 +2628,8 @@ static void binder_stat_br(struct binder_proc *proc, struct binder_thread *thread, uint32_t cmd) { trace_binder_return(cmd); - if (_IOC_NR(cmd) < ARRAY_SIZE(proc->stats.br)) { - proc->context->binder_stats.br[_IOC_NR(cmd)]++; + if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.br)) { + binder_stats.br[_IOC_NR(cmd)]++; proc->stats.br[_IOC_NR(cmd)]++; thread->stats.br[_IOC_NR(cmd)]++; } @@ -2705,7 +2693,7 @@ retry: if (wait_for_proc_work) proc->ready_threads++; - binder_unlock(proc->context, __func__); + binder_unlock(__func__); trace_binder_wait_for_work(wait_for_proc_work, !!thread->transaction_stack, @@ -2732,7 +2720,7 @@ retry: ret = wait_event_freezable(thread->wait, binder_has_thread_work(thread)); } - binder_lock(proc->context, __func__); + binder_lock(__func__); if (wait_for_proc_work) proc->ready_threads--; @@ -3119,14 +3107,14 @@ static unsigned int binder_poll(struct file *filp, struct binder_thread *thread = NULL; int wait_for_proc_work; - binder_lock(proc->context, __func__); + binder_lock(__func__); thread = binder_get_thread(proc); wait_for_proc_work = thread->transaction_stack == NULL && list_empty(&thread->todo) && thread->return_error == BR_OK; - binder_unlock(proc->context, __func__); + binder_unlock(__func__); if (wait_for_proc_work) { if (binder_has_proc_work(proc, thread)) @@ -3253,7 +3241,6 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int ret; 
struct binder_proc *proc = filp->private_data; - struct binder_context *context = proc->context; struct binder_thread *thread; unsigned int size = _IOC_SIZE(cmd); void __user *ubuf = (void __user *)arg; @@ -3267,7 +3254,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (ret) goto err_unlocked; - binder_lock(context, __func__); + binder_lock(__func__); thread = binder_get_thread(proc); if (thread == NULL) { ret = -ENOMEM; @@ -3319,7 +3306,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err: if (thread) thread->looper &= ~BINDER_LOOPER_STATE_NEED_RETURN; - binder_unlock(context, __func__); + binder_unlock(__func__); wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); if (ret && ret != -ERESTARTSYS) pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret); @@ -3391,7 +3378,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) } vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE; - mutex_lock(&proc->context->binder_mmap_lock); + mutex_lock(&binder_mmap_lock); if (proc->buffer) { ret = -EBUSY; failure_string = "already mapped"; @@ -3406,7 +3393,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) } proc->buffer = area->addr; proc->user_buffer_offset = vma->vm_start - (uintptr_t)proc->buffer; - mutex_unlock(&proc->context->binder_mmap_lock); + mutex_unlock(&binder_mmap_lock); #ifdef CONFIG_CPU_CACHE_VIPT if (cache_is_vipt_aliasing()) { @@ -3451,12 +3438,12 @@ err_alloc_small_buf_failed: kfree(proc->pages); proc->pages = NULL; err_alloc_pages_failed: - mutex_lock(&proc->context->binder_mmap_lock); + mutex_lock(&binder_mmap_lock); vfree(proc->buffer); proc->buffer = NULL; err_get_vm_area_failed: err_already_mapped: - mutex_unlock(&proc->context->binder_mmap_lock); + mutex_unlock(&binder_mmap_lock); err_bad_arg: pr_err("binder_mmap: %d %lx-%lx %s failed %d\n", proc->pid, vma->vm_start, vma->vm_end, failure_string, ret); @@ -3483,15 +3470,15 @@ static int binder_open(struct inode *nodp, struct file *filp) miscdev); proc->context = &binder_dev->context; - binder_lock(proc->context, __func__); + binder_lock(__func__); binder_stats_created(BINDER_STAT_PROC); - hlist_add_head(&proc->proc_node, &proc->context->binder_procs); + hlist_add_head(&proc->proc_node, &binder_procs); proc->pid = current->group_leader->pid; INIT_LIST_HEAD(&proc->delivered_death); filp->private_data = proc; - binder_unlock(proc->context, __func__); + binder_unlock(__func__); if (binder_debugfs_dir_entry_proc) { char strbuf[11]; @@ -3556,7 +3543,6 @@ static int binder_release(struct inode *nodp, struct file *filp) static int binder_node_release(struct binder_node *node, int refs) { struct binder_ref *ref; - struct binder_context *context = node->proc->context; int death = 0; list_del_init(&node->work.entry); @@ -3572,7 +3558,7 @@ static int binder_node_release(struct binder_node *node, int refs) node->proc = NULL; node->local_strong_refs = 0; node->local_weak_refs = 0; - hlist_add_head(&node->dead_node, &context->binder_dead_nodes); + hlist_add_head(&node->dead_node, &binder_dead_nodes); hlist_for_each_entry(ref, &node->refs, node_entry) { refs++; @@ -3637,8 +3623,7 @@ static void binder_deferred_release(struct binder_proc *proc) node = rb_entry(n, struct binder_node, rb_node); nodes++; rb_erase(&node->rb_node, &proc->nodes); - incoming_refs = binder_node_release(node, - incoming_refs); + incoming_refs = binder_node_release(node, incoming_refs); } 
outgoing_refs = 0; @@ -3710,16 +3695,14 @@ static void binder_deferred_func(struct work_struct *work) { struct binder_proc *proc; struct files_struct *files; - struct binder_context *context = - container_of(work, struct binder_context, deferred_work); int defer; do { - binder_lock(context, __func__); - mutex_lock(&context->binder_deferred_lock); - if (!hlist_empty(&context->binder_deferred_list)) { - proc = hlist_entry(context->binder_deferred_list.first, + binder_lock(__func__); + mutex_lock(&binder_deferred_lock); + if (!hlist_empty(&binder_deferred_list)) { + proc = hlist_entry(binder_deferred_list.first, struct binder_proc, deferred_work_node); hlist_del_init(&proc->deferred_work_node); defer = proc->deferred_work; @@ -3728,7 +3711,7 @@ static void binder_deferred_func(struct work_struct *work) proc = NULL; defer = 0; } - mutex_unlock(&context->binder_deferred_lock); + mutex_unlock(&binder_deferred_lock); files = NULL; if (defer & BINDER_DEFERRED_PUT_FILES) { @@ -3743,24 +3726,24 @@ static void binder_deferred_func(struct work_struct *work) if (defer & BINDER_DEFERRED_RELEASE) binder_deferred_release(proc); /* frees proc */ - binder_unlock(context, __func__); + binder_unlock(__func__); if (files) put_files_struct(files); } while (proc); } +static DECLARE_WORK(binder_deferred_work, binder_deferred_func); static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer) { - mutex_lock(&proc->context->binder_deferred_lock); + mutex_lock(&binder_deferred_lock); proc->deferred_work |= defer; if (hlist_unhashed(&proc->deferred_work_node)) { hlist_add_head(&proc->deferred_work_node, - &proc->context->binder_deferred_list); - queue_work(proc->context->binder_deferred_workqueue, - &proc->context->deferred_work); + &binder_deferred_list); + queue_work(binder_deferred_workqueue, &binder_deferred_work); } - mutex_unlock(&proc->context->binder_deferred_lock); + mutex_unlock(&binder_deferred_lock); } static void print_binder_transaction(struct seq_file *m, const char *prefix, @@ -3991,20 +3974,8 @@ static const char * const binder_objstat_strings[] = { "transaction_complete" }; -static void add_binder_stats(struct binder_stats *from, struct binder_stats *to) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(to->bc); i++) - to->bc[i] += from->bc[i]; - - for (i = 0; i < ARRAY_SIZE(to->br); i++) - to->br[i] += from->br[i]; -} - static void print_binder_stats(struct seq_file *m, const char *prefix, - struct binder_stats *stats, - struct binder_obj_stats *obj_stats) + struct binder_stats *stats) { int i; @@ -4024,21 +3995,16 @@ static void print_binder_stats(struct seq_file *m, const char *prefix, binder_return_strings[i], stats->br[i]); } - if (!obj_stats) - return; - - BUILD_BUG_ON(ARRAY_SIZE(obj_stats->obj_created) != + BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != ARRAY_SIZE(binder_objstat_strings)); - BUILD_BUG_ON(ARRAY_SIZE(obj_stats->obj_created) != - ARRAY_SIZE(obj_stats->obj_deleted)); - for (i = 0; i < ARRAY_SIZE(obj_stats->obj_created); i++) { - int obj_created = atomic_read(&obj_stats->obj_created[i]); - int obj_deleted = atomic_read(&obj_stats->obj_deleted[i]); - - if (obj_created || obj_deleted) + BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != + ARRAY_SIZE(stats->obj_deleted)); + for (i = 0; i < ARRAY_SIZE(stats->obj_created); i++) { + if (stats->obj_created[i] || stats->obj_deleted[i]) seq_printf(m, "%s%s: active %d total %d\n", prefix, - binder_objstat_strings[i], - obj_created - obj_deleted, obj_created); + binder_objstat_strings[i], + stats->obj_created[i] - 
stats->obj_deleted[i], + stats->obj_created[i]); } } @@ -4093,131 +4059,85 @@ static void print_binder_proc_stats(struct seq_file *m, } seq_printf(m, " pending transactions: %d\n", count); - print_binder_stats(m, " ", &proc->stats, NULL); + print_binder_stats(m, " ", &proc->stats); } static int binder_state_show(struct seq_file *m, void *unused) { - struct binder_device *device; - struct binder_context *context; struct binder_proc *proc; struct binder_node *node; int do_lock = !binder_debug_no_lock; - bool wrote_dead_nodes_header = false; + + if (do_lock) + binder_lock(__func__); seq_puts(m, "binder state:\n"); - hlist_for_each_entry(device, &binder_devices, hlist) { - context = &device->context; - if (do_lock) - binder_lock(context, __func__); - if (!wrote_dead_nodes_header && - !hlist_empty(&context->binder_dead_nodes)) { - seq_puts(m, "dead nodes:\n"); - wrote_dead_nodes_header = true; - } - hlist_for_each_entry(node, &context->binder_dead_nodes, - dead_node) - print_binder_node(m, node); + if (!hlist_empty(&binder_dead_nodes)) + seq_puts(m, "dead nodes:\n"); + hlist_for_each_entry(node, &binder_dead_nodes, dead_node) + print_binder_node(m, node); - if (do_lock) - binder_unlock(context, __func__); - } - - hlist_for_each_entry(device, &binder_devices, hlist) { - context = &device->context; - if (do_lock) - binder_lock(context, __func__); - - hlist_for_each_entry(proc, &context->binder_procs, proc_node) - print_binder_proc(m, proc, 1); - if (do_lock) - binder_unlock(context, __func__); - } + hlist_for_each_entry(proc, &binder_procs, proc_node) + print_binder_proc(m, proc, 1); + if (do_lock) + binder_unlock(__func__); return 0; } static int binder_stats_show(struct seq_file *m, void *unused) { - struct binder_device *device; - struct binder_context *context; struct binder_proc *proc; - struct binder_stats total_binder_stats; int do_lock = !binder_debug_no_lock; - memset(&total_binder_stats, 0, sizeof(struct binder_stats)); - - hlist_for_each_entry(device, &binder_devices, hlist) { - context = &device->context; - if (do_lock) - binder_lock(context, __func__); - - add_binder_stats(&context->binder_stats, &total_binder_stats); - - if (do_lock) - binder_unlock(context, __func__); - } + if (do_lock) + binder_lock(__func__); seq_puts(m, "binder stats:\n"); - print_binder_stats(m, "", &total_binder_stats, &binder_obj_stats); - hlist_for_each_entry(device, &binder_devices, hlist) { - context = &device->context; - if (do_lock) - binder_lock(context, __func__); + print_binder_stats(m, "", &binder_stats); - hlist_for_each_entry(proc, &context->binder_procs, proc_node) - print_binder_proc_stats(m, proc); - if (do_lock) - binder_unlock(context, __func__); - } + hlist_for_each_entry(proc, &binder_procs, proc_node) + print_binder_proc_stats(m, proc); + if (do_lock) + binder_unlock(__func__); return 0; } static int binder_transactions_show(struct seq_file *m, void *unused) { - struct binder_device *device; - struct binder_context *context; struct binder_proc *proc; int do_lock = !binder_debug_no_lock; - seq_puts(m, "binder transactions:\n"); - hlist_for_each_entry(device, &binder_devices, hlist) { - context = &device->context; - if (do_lock) - binder_lock(context, __func__); + if (do_lock) + binder_lock(__func__); - hlist_for_each_entry(proc, &context->binder_procs, proc_node) - print_binder_proc(m, proc, 0); - if (do_lock) - binder_unlock(context, __func__); - } + seq_puts(m, "binder transactions:\n"); + hlist_for_each_entry(proc, &binder_procs, proc_node) + print_binder_proc(m, proc, 0); + if (do_lock) 
+ binder_unlock(__func__); return 0; } static int binder_proc_show(struct seq_file *m, void *unused) { - struct binder_device *device; - struct binder_context *context; struct binder_proc *itr; int pid = (unsigned long)m->private; int do_lock = !binder_debug_no_lock; - hlist_for_each_entry(device, &binder_devices, hlist) { - context = &device->context; - if (do_lock) - binder_lock(context, __func__); + if (do_lock) + binder_lock(__func__); - hlist_for_each_entry(itr, &context->binder_procs, proc_node) { - if (itr->pid == pid) { - seq_puts(m, "binder proc state:\n"); - print_binder_proc(m, itr, 1); - } + hlist_for_each_entry(itr, &binder_procs, proc_node) { + if (itr->pid == pid) { + seq_puts(m, "binder proc state:\n"); + print_binder_proc(m, itr, 1); } - if (do_lock) - binder_unlock(context, __func__); } + if (do_lock) + binder_unlock(__func__); return 0; } @@ -4232,10 +4152,11 @@ static void print_binder_transaction_log_entry(struct seq_file *m, e->to_node, e->target_handle, e->data_size, e->offsets_size); } -static int print_binder_transaction_log(struct seq_file *m, - struct binder_transaction_log *log) +static int binder_transaction_log_show(struct seq_file *m, void *unused) { + struct binder_transaction_log *log = m->private; int i; + if (log->full) { for (i = log->next; i < ARRAY_SIZE(log->entry); i++) print_binder_transaction_log_entry(m, &log->entry[i]); @@ -4245,31 +4166,6 @@ static int print_binder_transaction_log(struct seq_file *m, return 0; } -static int binder_transaction_log_show(struct seq_file *m, void *unused) -{ - struct binder_device *device; - struct binder_context *context; - - hlist_for_each_entry(device, &binder_devices, hlist) { - context = &device->context; - print_binder_transaction_log(m, &context->transaction_log); - } - return 0; -} - -static int binder_failed_transaction_log_show(struct seq_file *m, void *unused) -{ - struct binder_device *device; - struct binder_context *context; - - hlist_for_each_entry(device, &binder_devices, hlist) { - context = &device->context; - print_binder_transaction_log(m, - &context->transaction_log_failed); - } - return 0; -} - static const struct file_operations binder_fops = { .owner = THIS_MODULE, .poll = binder_poll, @@ -4285,20 +4181,11 @@ BINDER_DEBUG_ENTRY(state); BINDER_DEBUG_ENTRY(stats); BINDER_DEBUG_ENTRY(transactions); BINDER_DEBUG_ENTRY(transaction_log); -BINDER_DEBUG_ENTRY(failed_transaction_log); - -static void __init free_binder_device(struct binder_device *device) -{ - if (device->context.binder_deferred_workqueue) - destroy_workqueue(device->context.binder_deferred_workqueue); - kfree(device); -} static int __init init_binder_device(const char *name) { int ret; struct binder_device *binder_device; - struct binder_context *context; binder_device = kzalloc(sizeof(*binder_device), GFP_KERNEL); if (!binder_device) @@ -4308,65 +4195,31 @@ static int __init init_binder_device(const char *name) binder_device->miscdev.minor = MISC_DYNAMIC_MINOR; binder_device->miscdev.name = name; - context = &binder_device->context; - context->binder_context_mgr_uid = INVALID_UID; - context->name = name; - - mutex_init(&context->binder_main_lock); - mutex_init(&context->binder_deferred_lock); - mutex_init(&context->binder_mmap_lock); - - context->binder_deferred_workqueue = - create_singlethread_workqueue(name); - - if (!context->binder_deferred_workqueue) { - ret = -ENOMEM; - goto err_create_singlethread_workqueue_failed; - } - - INIT_HLIST_HEAD(&context->binder_procs); - INIT_HLIST_HEAD(&context->binder_dead_nodes); - 
INIT_HLIST_HEAD(&context->binder_deferred_list); - INIT_WORK(&context->deferred_work, binder_deferred_func); + binder_device->context.binder_context_mgr_uid = INVALID_UID; + binder_device->context.name = name; ret = misc_register(&binder_device->miscdev); if (ret < 0) { - goto err_misc_register_failed; + kfree(binder_device); + return ret; } hlist_add_head(&binder_device->hlist, &binder_devices); - return ret; - -err_create_singlethread_workqueue_failed: -err_misc_register_failed: - free_binder_device(binder_device); return ret; } static int __init binder_init(void) { - int ret = 0; + int ret; char *device_name, *device_names; struct binder_device *device; struct hlist_node *tmp; - /* - * Copy the module_parameter string, because we don't want to - * tokenize it in-place. - */ - device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL); - if (!device_names) + binder_deferred_workqueue = create_singlethread_workqueue("binder"); + if (!binder_deferred_workqueue) return -ENOMEM; - strcpy(device_names, binder_devices_param); - - while ((device_name = strsep(&device_names, ","))) { - ret = init_binder_device(device_name); - if (ret) - goto err_init_binder_device_failed; - } - binder_debugfs_dir_entry_root = debugfs_create_dir("binder", NULL); if (binder_debugfs_dir_entry_root) binder_debugfs_dir_entry_proc = debugfs_create_dir("proc", @@ -4391,13 +4244,30 @@ static int __init binder_init(void) debugfs_create_file("transaction_log", S_IRUGO, binder_debugfs_dir_entry_root, - NULL, + &binder_transaction_log, &binder_transaction_log_fops); debugfs_create_file("failed_transaction_log", S_IRUGO, binder_debugfs_dir_entry_root, - NULL, - &binder_failed_transaction_log_fops); + &binder_transaction_log_failed, + &binder_transaction_log_fops); + } + + /* + * Copy the module_parameter string, because we don't want to + * tokenize it in-place. + */ + device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL); + if (!device_names) { + ret = -ENOMEM; + goto err_alloc_device_names_failed; + } + strcpy(device_names, binder_devices_param); + + while ((device_name = strsep(&device_names, ","))) { + ret = init_binder_device(device_name); + if (ret) + goto err_init_binder_device_failed; } return ret; @@ -4406,8 +4276,12 @@ err_init_binder_device_failed: hlist_for_each_entry_safe(device, tmp, &binder_devices, hlist) { misc_deregister(&device->miscdev); hlist_del(&device->hlist); - free_binder_device(device); + kfree(device); } +err_alloc_device_names_failed: + debugfs_remove_recursive(binder_debugfs_dir_entry_root); + + destroy_workqueue(binder_deferred_workqueue); return ret; } From 0cebb407b2f431d748d9cc85cf7e4232a9223342 Mon Sep 17 00:00:00 2001 From: Riley Andrews Date: Tue, 1 Sep 2015 12:42:07 -0700 Subject: [PATCH 0900/3220] FROMLIST: binder: Use wake up hint for synchronous transactions. (from https://patchwork.kernel.org/patch/9817747) Use wake_up_interruptible_sync() to hint to the scheduler binder transactions are synchronous wakeups. Disable preemption while waking to avoid ping-ponging on the binder lock. 
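The distinction the patch draws: for a synchronous transaction (a reply, or a call without TF_ONE_WAY set) the waker is about to block, so the sync wakeup variant lets the scheduler run the wakee on the current CPU immediately. Condensed from the diff below:

	if (target_wait) {
		if (reply || !(t->flags & TF_ONE_WAY))
			/* caller is about to block on the response */
			wake_up_interruptible_sync(target_wait);
		else
			/* one-way transaction: the caller keeps running */
			wake_up_interruptible(target_wait);
	}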
Change-Id: Ic406a232d0873662f80148e37acefe5243d912a0 Signed-off-by: Todd Kjos Git-commit: 443c026e90820170aa3db2c21d2933ae5922f900 Git-repo: https://android.googlesource.com/kernel/msm Signed-off-by: Omprakash Dhyade --- drivers/android/binder.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 08cde76875d2..211262071198 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2201,8 +2201,12 @@ static void binder_transaction(struct binder_proc *proc, list_add_tail(&t->work.entry, target_list); tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; list_add_tail(&tcomplete->entry, &thread->todo); - if (target_wait) - wake_up_interruptible(target_wait); + if (target_wait) { + if (reply || !(t->flags & TF_ONE_WAY)) + wake_up_interruptible_sync(target_wait); + else + wake_up_interruptible(target_wait); + } return; err_translate_failed: From 19a3948b3c02642a29df6339212a40424e24a570 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 10 Oct 2016 10:39:59 -0700 Subject: [PATCH 0901/3220] FROMLIST: binder: separate binder allocator structure from binder proc (from https://patchwork.kernel.org/patch/9817745/) The binder allocator is logically separate from the rest of the binder driver. Separate the data structures to prepare for splitting the allocator into its own file with its own locking. Change-Id: I5ca4df567ac4b8c8d6ee2116ae67c0c1d75c9747 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 216 ++++++++++++++++++++------------- drivers/android/binder_trace.h | 2 +- 2 files changed, 132 insertions(+), 86 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 211262071198..734edbd67410 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -319,6 +319,41 @@ enum binder_deferred_state { BINDER_DEFERRED_RELEASE = 0x04, }; +/** + * struct binder_alloc - per-binder proc state for binder allocator + * @vma: vm_area_struct passed to mmap_handler + * (invariant after mmap) + * @vma_vm_mm: copy of vma->vm_mm (invariant after mmap) + * @buffer: base of per-proc address space mapped via mmap + * @user_buffer_offset: offset between user and kernel VAs for buffer + * @buffers: list of all buffers for this proc + * @free_buffers: rb tree of buffers available for allocation + * sorted by size + * @allocated_buffers: rb tree of allocated buffers sorted by address + * @free_async_space: VA space available for async buffers. This is + * initialized at mmap time to 1/2 the full VA space + * @pages: array of physical page addresses for each page of + * mmap'd space + * @buffer_size: size of address space (could be less than requested) + * + * Bookkeeping structure for per-proc address space management for binder + * buffers. It is normally initialized during binder_init() and binder_mmap() + * calls.
The address space is used for both user-visible buffers and for + * struct binder_buffer objects used to track the user buffers + */ +struct binder_alloc { + struct vm_area_struct *vma; + struct mm_struct *vma_vm_mm; + void *buffer; + ptrdiff_t user_buffer_offset; + struct list_head buffers; + struct rb_root free_buffers; + struct rb_root allocated_buffers; + size_t free_async_space; + struct page **pages; + size_t buffer_size; +}; + struct binder_proc { struct hlist_node proc_node; struct rb_root threads; @@ -326,23 +361,11 @@ struct binder_proc { struct rb_root refs_by_desc; struct rb_root refs_by_node; int pid; - struct vm_area_struct *vma; - struct mm_struct *vma_vm_mm; struct task_struct *tsk; struct files_struct *files; struct hlist_node deferred_work_node; int deferred_work; - void *buffer; - ptrdiff_t user_buffer_offset; - struct list_head buffers; - struct rb_root free_buffers; - struct rb_root allocated_buffers; - size_t free_async_space; - - struct page **pages; - size_t buffer_size; - uint32_t buffer_free; struct list_head todo; wait_queue_head_t wait; struct binder_stats stats; @@ -353,6 +376,7 @@ struct binder_proc { int ready_threads; long default_priority; struct dentry *debugfs_entry; + struct binder_alloc alloc; struct binder_context *context; }; @@ -485,8 +509,10 @@ static void binder_set_nice(long nice) static size_t binder_buffer_size(struct binder_proc *proc, struct binder_buffer *buffer) { - if (list_is_last(&buffer->entry, &proc->buffers)) - return proc->buffer + proc->buffer_size - (void *)buffer->data; + if (list_is_last(&buffer->entry, &proc->alloc.buffers)) + return proc->alloc.buffer + + proc->alloc.buffer_size - + (void *)buffer->data; return (size_t)list_entry(buffer->entry.next, struct binder_buffer, entry) - (size_t)buffer->data; } @@ -494,7 +520,7 @@ static size_t binder_buffer_size(struct binder_proc *proc, static void binder_insert_free_buffer(struct binder_proc *proc, struct binder_buffer *new_buffer) { - struct rb_node **p = &proc->free_buffers.rb_node; + struct rb_node **p = &proc->alloc.free_buffers.rb_node; struct rb_node *parent = NULL; struct binder_buffer *buffer; size_t buffer_size; @@ -521,13 +547,13 @@ static void binder_insert_free_buffer(struct binder_proc *proc, p = &parent->rb_right; } rb_link_node(&new_buffer->rb_node, parent, p); - rb_insert_color(&new_buffer->rb_node, &proc->free_buffers); + rb_insert_color(&new_buffer->rb_node, &proc->alloc.free_buffers); } static void binder_insert_allocated_buffer(struct binder_proc *proc, struct binder_buffer *new_buffer) { - struct rb_node **p = &proc->allocated_buffers.rb_node; + struct rb_node **p = &proc->alloc.allocated_buffers.rb_node; struct rb_node *parent = NULL; struct binder_buffer *buffer; @@ -546,18 +572,19 @@ static void binder_insert_allocated_buffer(struct binder_proc *proc, BUG(); } rb_link_node(&new_buffer->rb_node, parent, p); - rb_insert_color(&new_buffer->rb_node, &proc->allocated_buffers); + rb_insert_color(&new_buffer->rb_node, &proc->alloc.allocated_buffers); } static struct binder_buffer *binder_buffer_lookup(struct binder_proc *proc, uintptr_t user_ptr) { - struct rb_node *n = proc->allocated_buffers.rb_node; + struct rb_node *n = proc->alloc.allocated_buffers.rb_node; struct binder_buffer *buffer; struct binder_buffer *kern_ptr; - kern_ptr = (struct binder_buffer *)(user_ptr - proc->user_buffer_offset - - offsetof(struct binder_buffer, data)); + kern_ptr = (struct binder_buffer *) + (user_ptr - proc->alloc.user_buffer_offset - + offsetof(struct binder_buffer, data)); while 
(n) { buffer = rb_entry(n, struct binder_buffer, rb_node); @@ -598,8 +625,8 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, if (mm) { down_write(&mm->mmap_sem); - vma = proc->vma; - if (vma && mm != proc->vma_vm_mm) { + vma = proc->alloc.vma; + if (vma && mm != proc->alloc.vma_vm_mm) { pr_err("%d: vma mm and task mm mismatch\n", proc->pid); vma = NULL; @@ -618,7 +645,8 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { int ret; - page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE]; + page = &proc->alloc.pages[ + (page_addr - proc->alloc.buffer) / PAGE_SIZE]; BUG_ON(*page); *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); @@ -637,7 +665,7 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, goto err_map_kernel_failed; } user_page_addr = - (uintptr_t)page_addr + proc->user_buffer_offset; + (uintptr_t)page_addr + proc->alloc.user_buffer_offset; ret = vm_insert_page(vma, user_page_addr, page[0]); if (ret) { pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n", @@ -655,10 +683,14 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, free_range: for (page_addr = end - PAGE_SIZE; page_addr >= start; page_addr -= PAGE_SIZE) { - page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE]; + + page = &proc->alloc.pages[ + (page_addr - proc->alloc.buffer) / PAGE_SIZE]; if (vma) - zap_page_range(vma, (uintptr_t)page_addr + - proc->user_buffer_offset, PAGE_SIZE, NULL); + zap_page_range(vma, + (uintptr_t)page_addr + + proc->alloc.user_buffer_offset, + PAGE_SIZE, NULL); err_vm_insert_page_failed: unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); err_map_kernel_failed: @@ -681,7 +713,7 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, size_t extra_buffers_size, int is_async) { - struct rb_node *n = proc->free_buffers.rb_node; + struct rb_node *n = proc->alloc.free_buffers.rb_node; struct binder_buffer *buffer; size_t buffer_size; struct rb_node *best_fit = NULL; @@ -689,7 +721,7 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, void *end_page_addr; size_t size, data_offsets_size; - if (proc->vma == NULL) { + if (proc->alloc.vma == NULL) { pr_err("%d: binder_alloc_buf, no vma\n", proc->pid); return NULL; @@ -710,7 +742,8 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, return NULL; } if (is_async && - proc->free_async_space < size + sizeof(struct binder_buffer)) { + proc->alloc.free_async_space < + size + sizeof(struct binder_buffer)) { binder_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: binder_alloc_buf size %zd failed, no async space left\n", proc->pid, size); @@ -762,7 +795,7 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL)) return NULL; - rb_erase(best_fit, &proc->free_buffers); + rb_erase(best_fit, &proc->alloc.free_buffers); buffer->free = 0; binder_insert_allocated_buffer(proc, buffer); if (buffer_size != size) { @@ -780,10 +813,11 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, buffer->extra_buffers_size = extra_buffers_size; buffer->async_transaction = is_async; if (is_async) { - proc->free_async_space -= size + sizeof(struct binder_buffer); + proc->alloc.free_async_space -= + size + sizeof(struct binder_buffer); binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, "%d: binder_alloc_buf size %zd async 
free %zd\n", - proc->pid, size, proc->free_async_space); + proc->pid, size, proc->alloc.free_async_space); } return buffer; @@ -806,7 +840,7 @@ static void binder_delete_free_buffer(struct binder_proc *proc, int free_page_end = 1; int free_page_start = 1; - BUG_ON(proc->buffers.next == &buffer->entry); + BUG_ON(proc->alloc.buffers.next == &buffer->entry); prev = list_entry(buffer->entry.prev, struct binder_buffer, entry); BUG_ON(!prev->free); if (buffer_end_page(prev) == buffer_start_page(buffer)) { @@ -818,7 +852,7 @@ static void binder_delete_free_buffer(struct binder_proc *proc, proc->pid, buffer, prev); } - if (!list_is_last(&buffer->entry, &proc->buffers)) { + if (!list_is_last(&buffer->entry, &proc->alloc.buffers)) { next = list_entry(buffer->entry.next, struct binder_buffer, entry); if (buffer_start_page(next) == buffer_end_page(buffer)) { @@ -862,39 +896,40 @@ static void binder_free_buf(struct binder_proc *proc, BUG_ON(buffer->free); BUG_ON(size > buffer_size); BUG_ON(buffer->transaction != NULL); - BUG_ON((void *)buffer < proc->buffer); - BUG_ON((void *)buffer > proc->buffer + proc->buffer_size); + BUG_ON((void *)buffer < proc->alloc.buffer); + BUG_ON((void *)buffer > proc->alloc.buffer + proc->alloc.buffer_size); if (buffer->async_transaction) { - proc->free_async_space += size + sizeof(struct binder_buffer); + proc->alloc.free_async_space += + size + sizeof(struct binder_buffer); binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, "%d: binder_free_buf size %zd async free %zd\n", - proc->pid, size, proc->free_async_space); + proc->pid, size, proc->alloc.free_async_space); } binder_update_page_range(proc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK), NULL); - rb_erase(&buffer->rb_node, &proc->allocated_buffers); + rb_erase(&buffer->rb_node, &proc->alloc.allocated_buffers); buffer->free = 1; - if (!list_is_last(&buffer->entry, &proc->buffers)) { + if (!list_is_last(&buffer->entry, &proc->alloc.buffers)) { struct binder_buffer *next = list_entry(buffer->entry.next, struct binder_buffer, entry); if (next->free) { - rb_erase(&next->rb_node, &proc->free_buffers); + rb_erase(&next->rb_node, &proc->alloc.free_buffers); binder_delete_free_buffer(proc, next); } } - if (proc->buffers.next != &buffer->entry) { + if (proc->alloc.buffers.next != &buffer->entry) { struct binder_buffer *prev = list_entry(buffer->entry.prev, struct binder_buffer, entry); if (prev->free) { binder_delete_free_buffer(proc, buffer); - rb_erase(&prev->rb_node, &proc->free_buffers); + rb_erase(&prev->rb_node, &proc->alloc.free_buffers); buffer = prev; } } @@ -1533,7 +1568,7 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, * back to kernel address space to access it */ parent_buffer = parent->buffer - - proc->user_buffer_offset; + proc->alloc.user_buffer_offset; fd_buf_size = sizeof(u32) * fda->num_fds; if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { @@ -1751,7 +1786,7 @@ static int binder_translate_fd_array(struct binder_fd_array_object *fda, * Since the parent was already fixed up, convert it * back to the kernel address space to access it */ - parent_buffer = parent->buffer - target_proc->user_buffer_offset; + parent_buffer = parent->buffer - target_proc->alloc.user_buffer_offset; fd_array = (u32 *)(parent_buffer + fda->parent_offset); if (!IS_ALIGNED((unsigned long)fd_array, sizeof(u32))) { binder_user_error("%d:%d parent offset not aligned correctly.\n", @@ -1819,7 +1854,7 @@ static int binder_fixup_parent(struct binder_transaction 
*t, return -EINVAL; } parent_buffer = (u8 *)(parent->buffer - - target_proc->user_buffer_offset); + target_proc->alloc.user_buffer_offset); *(binder_uintptr_t *)(parent_buffer + bp->parent_offset) = bp->buffer; return 0; @@ -2159,7 +2194,7 @@ static void binder_transaction(struct binder_proc *proc, } /* Fixup buffer pointer to target proc address space */ bp->buffer = (uintptr_t)sg_bufp + - target_proc->user_buffer_offset; + target_proc->alloc.user_buffer_offset; sg_bufp += ALIGN(bp->length, sizeof(u64)); ret = binder_fixup_parent(t, thread, bp, off_start, @@ -2921,7 +2956,7 @@ retry: tr.offsets_size = t->buffer->offsets_size; tr.data.ptr.buffer = (binder_uintptr_t)( (uintptr_t)t->buffer->data + - proc->user_buffer_offset); + proc->alloc.user_buffer_offset); tr.data.ptr.offsets = tr.data.ptr.buffer + ALIGN(t->buffer->data_size, sizeof(void *)); @@ -3339,8 +3374,8 @@ static void binder_vma_close(struct vm_area_struct *vma) proc->pid, vma->vm_start, vma->vm_end, (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, (unsigned long)pgprot_val(vma->vm_page_prot)); - proc->vma = NULL; - proc->vma_vm_mm = NULL; + proc->alloc.vma = NULL; + proc->alloc.vma_vm_mm = NULL; binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES); } @@ -3383,7 +3418,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE; mutex_lock(&binder_mmap_lock); - if (proc->buffer) { + if (proc->alloc.buffer) { ret = -EBUSY; failure_string = "already mapped"; goto err_already_mapped; @@ -3395,56 +3430,65 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) failure_string = "get_vm_area"; goto err_get_vm_area_failed; } - proc->buffer = area->addr; - proc->user_buffer_offset = vma->vm_start - (uintptr_t)proc->buffer; + proc->alloc.buffer = area->addr; + proc->alloc.user_buffer_offset = + vma->vm_start - (uintptr_t)proc->alloc.buffer; mutex_unlock(&binder_mmap_lock); #ifdef CONFIG_CPU_CACHE_VIPT if (cache_is_vipt_aliasing()) { - while (CACHE_COLOUR((vma->vm_start ^ (uint32_t)proc->buffer))) { - pr_info("binder_mmap: %d %lx-%lx maps %p bad alignment\n", proc->pid, vma->vm_start, vma->vm_end, proc->buffer); + while (CACHE_COLOUR( + (vma->vm_start ^ (uint32_t)proc->alloc.buffer))) { + pr_info("binder_mmap: %d %lx-%lx maps %pK bad alignment\n", + proc->pid, vma->vm_start, + vma->vm_end, proc->alloc.buffer); vma->vm_start += PAGE_SIZE; } } #endif - proc->pages = kzalloc(sizeof(proc->pages[0]) * ((vma->vm_end - vma->vm_start) / PAGE_SIZE), GFP_KERNEL); - if (proc->pages == NULL) { + proc->alloc.pages = + kzalloc(sizeof(proc->alloc.pages[0]) * + ((vma->vm_end - vma->vm_start) / PAGE_SIZE), + GFP_KERNEL); + if (proc->alloc.pages == NULL) { ret = -ENOMEM; failure_string = "alloc page array"; goto err_alloc_pages_failed; } - proc->buffer_size = vma->vm_end - vma->vm_start; + proc->alloc.buffer_size = vma->vm_end - vma->vm_start; vma->vm_ops = &binder_vm_ops; vma->vm_private_data = proc; - if (binder_update_page_range(proc, 1, proc->buffer, proc->buffer + PAGE_SIZE, vma)) { + if (binder_update_page_range(proc, 1, proc->alloc.buffer, + proc->alloc.buffer + PAGE_SIZE, vma)) { ret = -ENOMEM; failure_string = "alloc small buf"; goto err_alloc_small_buf_failed; } - buffer = proc->buffer; - INIT_LIST_HEAD(&proc->buffers); - list_add(&buffer->entry, &proc->buffers); + buffer = proc->alloc.buffer; + INIT_LIST_HEAD(&proc->alloc.buffers); + list_add(&buffer->entry, &proc->alloc.buffers); buffer->free = 1; binder_insert_free_buffer(proc, buffer); - 
proc->free_async_space = proc->buffer_size / 2; + proc->alloc.free_async_space = proc->alloc.buffer_size / 2; barrier(); proc->files = get_files_struct(current); - proc->vma = vma; - proc->vma_vm_mm = vma->vm_mm; + proc->alloc.vma = vma; + proc->alloc.vma_vm_mm = vma->vm_mm; - /*pr_info("binder_mmap: %d %lx-%lx maps %p\n", - proc->pid, vma->vm_start, vma->vm_end, proc->buffer);*/ + /*pr_info("binder_mmap: %d %lx-%lx maps %pK\n", + * proc->pid, vma->vm_start, vma->vm_end, proc->alloc.buffer); + */ return 0; err_alloc_small_buf_failed: - kfree(proc->pages); - proc->pages = NULL; + kfree(proc->alloc.pages); + proc->alloc.pages = NULL; err_alloc_pages_failed: mutex_lock(&binder_mmap_lock); - vfree(proc->buffer); - proc->buffer = NULL; + vfree(proc->alloc.buffer); + proc->alloc.buffer = NULL; err_get_vm_area_failed: err_already_mapped: mutex_unlock(&binder_mmap_lock); @@ -3596,7 +3640,7 @@ static void binder_deferred_release(struct binder_proc *proc) int threads, nodes, incoming_refs, outgoing_refs, buffers, active_transactions, page_count; - BUG_ON(proc->vma); + BUG_ON(proc->alloc.vma); BUG_ON(proc->files); hlist_del(&proc->proc_node); @@ -3643,7 +3687,7 @@ static void binder_deferred_release(struct binder_proc *proc) binder_release_work(&proc->delivered_death); buffers = 0; - while ((n = rb_first(&proc->allocated_buffers))) { + while ((n = rb_first(&proc->alloc.allocated_buffers))) { struct binder_buffer *buffer; buffer = rb_entry(n, struct binder_buffer, rb_node); @@ -3664,25 +3708,25 @@ static void binder_deferred_release(struct binder_proc *proc) binder_stats_deleted(BINDER_STAT_PROC); page_count = 0; - if (proc->pages) { + if (proc->alloc.pages) { int i; - for (i = 0; i < proc->buffer_size / PAGE_SIZE; i++) { + for (i = 0; i < proc->alloc.buffer_size / PAGE_SIZE; i++) { void *page_addr; - if (!proc->pages[i]) + if (!proc->alloc.pages[i]) continue; - page_addr = proc->buffer + i * PAGE_SIZE; + page_addr = proc->alloc.buffer + i * PAGE_SIZE; binder_debug(BINDER_DEBUG_BUFFER_ALLOC, "%s: %d: page %d at %p not freed\n", __func__, proc->pid, i, page_addr); unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); - __free_page(proc->pages[i]); + __free_page(proc->alloc.pages[i]); page_count++; } - kfree(proc->pages); - vfree(proc->buffer); + kfree(proc->alloc.pages); + vfree(proc->alloc.buffer); } put_task_struct(proc->tsk); @@ -3912,7 +3956,8 @@ static void print_binder_proc(struct seq_file *m, print_binder_ref(m, rb_entry(n, struct binder_ref, rb_node_desc)); } - for (n = rb_first(&proc->allocated_buffers); n != NULL; n = rb_next(n)) + for (n = rb_first(&proc->alloc.allocated_buffers); + n != NULL; n = rb_next(n)) print_binder_buffer(m, " buffer", rb_entry(n, struct binder_buffer, rb_node)); list_for_each_entry(w, &proc->todo, entry) @@ -4029,7 +4074,7 @@ static void print_binder_proc_stats(struct seq_file *m, " ready threads %d\n" " free async space %zd\n", proc->requested_threads, proc->requested_threads_started, proc->max_threads, - proc->ready_threads, proc->free_async_space); + proc->ready_threads, proc->alloc.free_async_space); count = 0; for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) count++; @@ -4047,7 +4092,8 @@ static void print_binder_proc_stats(struct seq_file *m, seq_printf(m, " refs: %d s %d w %d\n", count, strong, weak); count = 0; - for (n = rb_first(&proc->allocated_buffers); n != NULL; n = rb_next(n)) + for (n = rb_first(&proc->alloc.allocated_buffers); + n != NULL; n = rb_next(n)) count++; seq_printf(m, " buffers: %d\n", count); diff --git 
a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h
index 7f20f3dc8369..c835f09656c1 100644
--- a/drivers/android/binder_trace.h
+++ b/drivers/android/binder_trace.h
@@ -280,7 +280,7 @@ TRACE_EVENT(binder_update_page_range,
 	TP_fast_assign(
 		__entry->proc = proc->pid;
 		__entry->allocate = allocate;
-		__entry->offset = start - proc->buffer;
+		__entry->offset = start - proc->alloc.buffer;
 		__entry->size = end - start;
 	),
 	TP_printk("proc=%d allocate=%d offset=%zu size=%zu",

From b582e88aa5c657c588da128821f434991f3ba86b Mon Sep 17 00:00:00 2001
From: Todd Kjos
Date: Fri, 21 Apr 2017 11:18:12 -0700
Subject: [PATCH 0902/3220] FROMLIST: binder: remove unneeded cleanup code

(from https://patchwork.kernel.org/patch/9817817/)

The buffer's transaction has already been freed before
binder_deferred_release. No need to do it again.

Change-Id: I412709c23879c5e45a6c7304a588bba0a5223fc3
Signed-off-by: Todd Kjos
---
 drivers/android/binder.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 734edbd67410..07be2833250d 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -3634,7 +3634,6 @@ static int binder_node_release(struct binder_node *node, int refs)
 static void binder_deferred_release(struct binder_proc *proc)
 {
-	struct binder_transaction *t;
 	struct binder_context *context = proc->context;
 	struct rb_node *n;
 	int threads, nodes, incoming_refs, outgoing_refs, buffers,
@@ -3692,14 +3691,8 @@ static void binder_deferred_release(struct binder_proc *proc)
 		buffer = rb_entry(n, struct binder_buffer, rb_node);
-		t = buffer->transaction;
-		if (t) {
-			t->buffer = NULL;
-			buffer->transaction = NULL;
-			pr_err("release proc %d, transaction %d, not freed\n",
-			       proc->pid, t->debug_id);
-			/*BUG();*/
-		}
+		/* Transaction should already have been freed */
+		BUG_ON(buffer->transaction);
 		binder_free_buf(proc, buffer);
 		buffers++;

From 467545d8424090762c38d0bb2ea88da35b94fb09 Mon Sep 17 00:00:00 2001
From: Todd Kjos
Date: Mon, 10 Oct 2016 10:40:53 -0700
Subject: [PATCH 0903/3220] FROMLIST: binder: separate out binder_alloc functions

(from https://patchwork.kernel.org/patch/9817753/)

Continuation of splitting the binder allocator from the binder
driver. Separate binder_alloc functions from normal binder
functions. Protect the allocator with a separate mutex.
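Every entry point introduced here follows the same shape: an internal
*_locked() helper that assumes alloc->mutex is held, wrapped by a public
function that takes and releases the mutex around a single call to the
locked variant. A minimal user-space sketch of that shape follows; the
names are illustrative only, and pthread_mutex_t stands in for the
kernel's struct mutex:

#include <pthread.h>
#include <stddef.h>

struct sample_alloc {
	pthread_mutex_t mutex;		/* protects the fields below */
	size_t free_async_space;
};

/* mirrors binder_alloc_init(): set up the allocator's own lock */
void sample_alloc_init(struct sample_alloc *alloc)
{
	pthread_mutex_init(&alloc->mutex, NULL);
	alloc->free_async_space = 0;
}

/* internal variant: caller must already hold alloc->mutex */
static size_t sample_free_async_space_locked(struct sample_alloc *alloc)
{
	return alloc->free_async_space;
}

/* public entry point: takes the allocator mutex around the locked helper */
size_t sample_get_free_async_space(struct sample_alloc *alloc)
{
	size_t space;

	pthread_mutex_lock(&alloc->mutex);
	space = sample_free_async_space_locked(alloc);
	pthread_mutex_unlock(&alloc->mutex);
	return space;
}

In the patch itself this shape shows up as binder_alloc_new_buf() over
binder_alloc_new_buf_locked(), binder_alloc_buffer_lookup() over
binder_alloc_buffer_lookup_locked(), and the
binder_alloc_get_free_async_space() accessor, which keeps the
allocator's locking self-contained ahead of its move into a separate
file in the following patch.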
Bug: 33250092 32225111 Change-Id: I634637415aa03c74145159d84f1749bbe785a8f3 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 655 ++++++++++++++++++++------------- drivers/android/binder_trace.h | 9 +- 2 files changed, 412 insertions(+), 252 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 07be2833250d..e73481a8ea2c 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -48,7 +48,7 @@ static DEFINE_MUTEX(binder_main_lock); static DEFINE_MUTEX(binder_deferred_lock); -static DEFINE_MUTEX(binder_mmap_lock); +static DEFINE_MUTEX(binder_alloc_mmap_lock); static HLIST_HEAD(binder_devices); static HLIST_HEAD(binder_procs); @@ -104,9 +104,7 @@ enum { BINDER_DEBUG_TRANSACTION_COMPLETE = 1U << 10, BINDER_DEBUG_FREE_BUFFER = 1U << 11, BINDER_DEBUG_INTERNAL_REFS = 1U << 12, - BINDER_DEBUG_BUFFER_ALLOC = 1U << 13, - BINDER_DEBUG_PRIORITY_CAP = 1U << 14, - BINDER_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 15, + BINDER_DEBUG_PRIORITY_CAP = 1U << 13, }; static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR | BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION; @@ -159,6 +157,27 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, #define to_binder_fd_array_object(hdr) \ container_of(hdr, struct binder_fd_array_object, hdr) +/* + * debug declarations for binder_alloc. To be + * moved to binder_alloc.c + */ +enum { + BINDER_ALLOC_DEBUG_OPEN_CLOSE = 1U << 1, + BINDER_ALLOC_DEBUG_BUFFER_ALLOC = 1U << 2, + BINDER_ALLOC_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 3, +}; +static uint32_t binder_alloc_debug_mask; + +module_param_named(alloc_debug_mask, binder_alloc_debug_mask, + uint, S_IWUSR | S_IRUGO); + +#define binder_alloc_debug(mask, x...) \ + do { \ + if (binder_alloc_debug_mask & mask) \ + pr_info(x); \ + } while (0) +/* end of binder_alloc debug declarations */ + enum binder_stat_types { BINDER_STAT_PROC, BINDER_STAT_THREAD, @@ -342,6 +361,8 @@ enum binder_deferred_state { * struct binder_buffer objects used to track the user buffers */ struct binder_alloc { + struct mutex mutex; + struct task_struct *tsk; struct vm_area_struct *vma; struct mm_struct *vma_vm_mm; void *buffer; @@ -352,6 +373,7 @@ struct binder_alloc { size_t free_async_space; struct page **pages; size_t buffer_size; + int pid; }; struct binder_proc { @@ -423,6 +445,56 @@ struct binder_transaction { kuid_t sender_euid; }; +/* + * Forward declarations of binder_alloc functions. + * These will be moved to binder_alloc.h when + * binder_alloc is moved to its own files. 
+ */ +extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, + size_t data_size, + size_t offsets_size, + size_t extra_buffers_size, + int is_async); +extern void binder_alloc_init(struct binder_alloc *alloc); +extern void binder_alloc_vma_close(struct binder_alloc *alloc); +extern struct binder_buffer * +binder_alloc_buffer_lookup(struct binder_alloc *alloc, + uintptr_t user_ptr); +extern void binder_alloc_free_buf(struct binder_alloc *alloc, + struct binder_buffer *buffer); +extern int binder_alloc_mmap_handler(struct binder_alloc *alloc, + struct vm_area_struct *vma); +extern void binder_alloc_deferred_release(struct binder_alloc *alloc); +extern int binder_alloc_get_allocated_count(struct binder_alloc *alloc); +extern void binder_alloc_print_allocated(struct seq_file *m, + struct binder_alloc *alloc); + +static inline size_t +binder_alloc_get_free_async_space(struct binder_alloc *alloc) +{ + size_t free_async_space; + + mutex_lock(&alloc->mutex); + free_async_space = alloc->free_async_space; + mutex_unlock(&alloc->mutex); + return free_async_space; +} + +static inline ptrdiff_t +binder_alloc_get_user_buffer_offset(struct binder_alloc *alloc) +{ + /* + * user_buffer_offset is constant if vma is set and + * undefined if vma is not set. It is possible to + * get here with !alloc->vma if the target process + * is dying while a transaction is being initiated. + * Returning the old value is ok in this case and + * the transaction will fail. + */ + return alloc->user_buffer_offset; +} +/* end of binder_alloc declarations */ + static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); @@ -506,21 +578,20 @@ static void binder_set_nice(long nice) binder_user_error("%d RLIMIT_NICE not set\n", current->pid); } -static size_t binder_buffer_size(struct binder_proc *proc, - struct binder_buffer *buffer) +static size_t binder_alloc_buffer_size(struct binder_alloc *alloc, + struct binder_buffer *buffer) { - if (list_is_last(&buffer->entry, &proc->alloc.buffers)) - return proc->alloc.buffer + - proc->alloc.buffer_size - - (void *)buffer->data; + if (list_is_last(&buffer->entry, &alloc->buffers)) + return alloc->buffer + + alloc->buffer_size - (void *)buffer->data; return (size_t)list_entry(buffer->entry.next, struct binder_buffer, entry) - (size_t)buffer->data; } -static void binder_insert_free_buffer(struct binder_proc *proc, +static void binder_insert_free_buffer(struct binder_alloc *alloc, struct binder_buffer *new_buffer) { - struct rb_node **p = &proc->alloc.free_buffers.rb_node; + struct rb_node **p = &alloc->free_buffers.rb_node; struct rb_node *parent = NULL; struct binder_buffer *buffer; size_t buffer_size; @@ -528,18 +599,18 @@ static void binder_insert_free_buffer(struct binder_proc *proc, BUG_ON(!new_buffer->free); - new_buffer_size = binder_buffer_size(proc, new_buffer); + new_buffer_size = binder_alloc_buffer_size(alloc, new_buffer); - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: add free buffer, size %zd, at %p\n", - proc->pid, new_buffer_size, new_buffer); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: add free buffer, size %zd, at %pK\n", + alloc->pid, new_buffer_size, new_buffer); while (*p) { parent = *p; buffer = rb_entry(parent, struct binder_buffer, rb_node); BUG_ON(!buffer->free); - buffer_size = binder_buffer_size(proc, buffer); + buffer_size = binder_alloc_buffer_size(alloc, buffer); if (new_buffer_size < buffer_size) p = &parent->rb_left; @@ -547,13 +618,13 @@ static void binder_insert_free_buffer(struct 
binder_proc *proc, p = &parent->rb_right; } rb_link_node(&new_buffer->rb_node, parent, p); - rb_insert_color(&new_buffer->rb_node, &proc->alloc.free_buffers); + rb_insert_color(&new_buffer->rb_node, &alloc->free_buffers); } -static void binder_insert_allocated_buffer(struct binder_proc *proc, +static void binder_insert_allocated_buffer(struct binder_alloc *alloc, struct binder_buffer *new_buffer) { - struct rb_node **p = &proc->alloc.allocated_buffers.rb_node; + struct rb_node **p = &alloc->allocated_buffers.rb_node; struct rb_node *parent = NULL; struct binder_buffer *buffer; @@ -572,19 +643,19 @@ static void binder_insert_allocated_buffer(struct binder_proc *proc, BUG(); } rb_link_node(&new_buffer->rb_node, parent, p); - rb_insert_color(&new_buffer->rb_node, &proc->alloc.allocated_buffers); + rb_insert_color(&new_buffer->rb_node, &alloc->allocated_buffers); } -static struct binder_buffer *binder_buffer_lookup(struct binder_proc *proc, - uintptr_t user_ptr) +static struct binder_buffer *binder_alloc_buffer_lookup_locked( + struct binder_alloc *alloc, + uintptr_t user_ptr) { - struct rb_node *n = proc->alloc.allocated_buffers.rb_node; + struct rb_node *n = alloc->allocated_buffers.rb_node; struct binder_buffer *buffer; struct binder_buffer *kern_ptr; - kern_ptr = (struct binder_buffer *) - (user_ptr - proc->alloc.user_buffer_offset - - offsetof(struct binder_buffer, data)); + kern_ptr = (struct binder_buffer *)(user_ptr - alloc->user_buffer_offset + - offsetof(struct binder_buffer, data)); while (n) { buffer = rb_entry(n, struct binder_buffer, rb_node); @@ -600,7 +671,18 @@ static struct binder_buffer *binder_buffer_lookup(struct binder_proc *proc, return NULL; } -static int binder_update_page_range(struct binder_proc *proc, int allocate, +struct binder_buffer *binder_alloc_buffer_lookup(struct binder_alloc *alloc, + uintptr_t user_ptr) +{ + struct binder_buffer *buffer; + + mutex_lock(&alloc->mutex); + buffer = binder_alloc_buffer_lookup_locked(alloc, user_ptr); + mutex_unlock(&alloc->mutex); + return buffer; +} + +static int binder_update_page_range(struct binder_alloc *alloc, int allocate, void *start, void *end, struct vm_area_struct *vma) { @@ -609,26 +691,26 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, struct page **page; struct mm_struct *mm; - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: %s pages %p-%p\n", proc->pid, + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: %s pages %pK-%pK\n", alloc->pid, allocate ? 
"allocate" : "free", start, end); if (end <= start) return 0; - trace_binder_update_page_range(proc, allocate, start, end); + trace_binder_update_page_range(alloc, allocate, start, end); if (vma) mm = NULL; else - mm = get_task_mm(proc->tsk); + mm = get_task_mm(alloc->tsk); if (mm) { down_write(&mm->mmap_sem); - vma = proc->alloc.vma; - if (vma && mm != proc->alloc.vma_vm_mm) { + vma = alloc->vma; + if (vma && mm != alloc->vma_vm_mm) { pr_err("%d: vma mm and task mm mismatch\n", - proc->pid); + alloc->pid); vma = NULL; } } @@ -638,21 +720,20 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, if (vma == NULL) { pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n", - proc->pid); + alloc->pid); goto err_no_vma; } for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { int ret; - page = &proc->alloc.pages[ - (page_addr - proc->alloc.buffer) / PAGE_SIZE]; + page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; BUG_ON(*page); *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); if (*page == NULL) { - pr_err("%d: binder_alloc_buf failed for page at %p\n", - proc->pid, page_addr); + pr_err("%d: binder_alloc_buf failed for page at %pK\n", + alloc->pid, page_addr); goto err_alloc_page_failed; } ret = map_kernel_range_noflush((unsigned long)page_addr, @@ -660,16 +741,16 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, flush_cache_vmap((unsigned long)page_addr, (unsigned long)page_addr + PAGE_SIZE); if (ret != 1) { - pr_err("%d: binder_alloc_buf failed to map page at %p in kernel\n", - proc->pid, page_addr); + pr_err("%d: binder_alloc_buf failed to map page at %pK in kernel\n", + alloc->pid, page_addr); goto err_map_kernel_failed; } user_page_addr = - (uintptr_t)page_addr + proc->alloc.user_buffer_offset; + (uintptr_t)page_addr + alloc->user_buffer_offset; ret = vm_insert_page(vma, user_page_addr, page[0]); if (ret) { pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n", - proc->pid, user_page_addr); + alloc->pid, user_page_addr); goto err_vm_insert_page_failed; } /* vm_insert_page does not seem to increment the refcount */ @@ -683,14 +764,10 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, free_range: for (page_addr = end - PAGE_SIZE; page_addr >= start; page_addr -= PAGE_SIZE) { - - page = &proc->alloc.pages[ - (page_addr - proc->alloc.buffer) / PAGE_SIZE]; + page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; if (vma) - zap_page_range(vma, - (uintptr_t)page_addr + - proc->alloc.user_buffer_offset, - PAGE_SIZE, NULL); + zap_page_range(vma, (uintptr_t)page_addr + + alloc->user_buffer_offset, PAGE_SIZE, NULL); err_vm_insert_page_failed: unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); err_map_kernel_failed: @@ -707,13 +784,11 @@ err_no_vma: return -ENOMEM; } -static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, - size_t data_size, - size_t offsets_size, - size_t extra_buffers_size, - int is_async) +static struct binder_buffer *binder_alloc_new_buf_locked( + struct binder_alloc *alloc, size_t data_size, + size_t offsets_size, size_t extra_buffers_size, int is_async) { - struct rb_node *n = proc->alloc.free_buffers.rb_node; + struct rb_node *n = alloc->free_buffers.rb_node; struct binder_buffer *buffer; size_t buffer_size; struct rb_node *best_fit = NULL; @@ -721,9 +796,9 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, void *end_page_addr; size_t size, data_offsets_size; - if 
(proc->alloc.vma == NULL) { + if (alloc->vma == NULL) { pr_err("%d: binder_alloc_buf, no vma\n", - proc->pid); + alloc->pid); return NULL; } @@ -731,29 +806,30 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, ALIGN(offsets_size, sizeof(void *)); if (data_offsets_size < data_size || data_offsets_size < offsets_size) { - binder_user_error("%d: got transaction with invalid size %zd-%zd\n", - proc->pid, data_size, offsets_size); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: got transaction with invalid size %zd-%zd\n", + alloc->pid, data_size, offsets_size); return NULL; } size = data_offsets_size + ALIGN(extra_buffers_size, sizeof(void *)); if (size < data_offsets_size || size < extra_buffers_size) { - binder_user_error("%d: got transaction with invalid extra_buffers_size %zd\n", - proc->pid, extra_buffers_size); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: got transaction with invalid extra_buffers_size %zd\n", + alloc->pid, extra_buffers_size); return NULL; } if (is_async && - proc->alloc.free_async_space < - size + sizeof(struct binder_buffer)) { - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, + alloc->free_async_space < size + sizeof(struct binder_buffer)) { + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, "%d: binder_alloc_buf size %zd failed, no async space left\n", - proc->pid, size); + alloc->pid, size); return NULL; } while (n) { buffer = rb_entry(n, struct binder_buffer, rb_node); BUG_ON(!buffer->free); - buffer_size = binder_buffer_size(proc, buffer); + buffer_size = binder_alloc_buffer_size(alloc, buffer); if (size < buffer_size) { best_fit = n; @@ -767,17 +843,17 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, } if (best_fit == NULL) { pr_err("%d: binder_alloc_buf size %zd failed, no address space\n", - proc->pid, size); + alloc->pid, size); return NULL; } if (n == NULL) { buffer = rb_entry(best_fit, struct binder_buffer, rb_node); - buffer_size = binder_buffer_size(proc, buffer); + buffer_size = binder_alloc_buffer_size(alloc, buffer); } - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: binder_alloc_buf size %zd got buffer %p size %zd\n", - proc->pid, size, buffer, buffer_size); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: binder_alloc_buf size %zd got buffer %pK size %zd\n", + alloc->pid, size, buffer, buffer_size); has_page_addr = (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK); @@ -791,38 +867,52 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, (void *)PAGE_ALIGN((uintptr_t)buffer->data + buffer_size); if (end_page_addr > has_page_addr) end_page_addr = has_page_addr; - if (binder_update_page_range(proc, 1, + if (binder_update_page_range(alloc, 1, (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL)) return NULL; - rb_erase(best_fit, &proc->alloc.free_buffers); + rb_erase(best_fit, &alloc->free_buffers); buffer->free = 0; - binder_insert_allocated_buffer(proc, buffer); + binder_insert_allocated_buffer(alloc, buffer); if (buffer_size != size) { struct binder_buffer *new_buffer = (void *)buffer->data + size; list_add(&new_buffer->entry, &buffer->entry); new_buffer->free = 1; - binder_insert_free_buffer(proc, new_buffer); + binder_insert_free_buffer(alloc, new_buffer); } - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: binder_alloc_buf size %zd got %p\n", - proc->pid, size, buffer); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: binder_alloc_buf size %zd got %pK\n", + alloc->pid, size, buffer); buffer->data_size = 
data_size; buffer->offsets_size = offsets_size; - buffer->extra_buffers_size = extra_buffers_size; buffer->async_transaction = is_async; + buffer->extra_buffers_size = extra_buffers_size; if (is_async) { - proc->alloc.free_async_space -= - size + sizeof(struct binder_buffer); - binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, + alloc->free_async_space -= size + sizeof(struct binder_buffer); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC_ASYNC, "%d: binder_alloc_buf size %zd async free %zd\n", - proc->pid, size, proc->alloc.free_async_space); + alloc->pid, size, alloc->free_async_space); } return buffer; } +struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, + size_t data_size, + size_t offsets_size, + size_t extra_buffers_size, + int is_async) +{ + struct binder_buffer *buffer; + + mutex_lock(&alloc->mutex); + buffer = binder_alloc_new_buf_locked(alloc, data_size, offsets_size, + extra_buffers_size, is_async); + mutex_unlock(&alloc->mutex); + return buffer; +} + static void *buffer_start_page(struct binder_buffer *buffer) { return (void *)((uintptr_t)buffer & PAGE_MASK); @@ -833,26 +923,26 @@ static void *buffer_end_page(struct binder_buffer *buffer) return (void *)(((uintptr_t)(buffer + 1) - 1) & PAGE_MASK); } -static void binder_delete_free_buffer(struct binder_proc *proc, +static void binder_delete_free_buffer(struct binder_alloc *alloc, struct binder_buffer *buffer) { struct binder_buffer *prev, *next = NULL; int free_page_end = 1; int free_page_start = 1; - BUG_ON(proc->alloc.buffers.next == &buffer->entry); + BUG_ON(alloc->buffers.next == &buffer->entry); prev = list_entry(buffer->entry.prev, struct binder_buffer, entry); BUG_ON(!prev->free); if (buffer_end_page(prev) == buffer_start_page(buffer)) { free_page_start = 0; if (buffer_end_page(prev) == buffer_end_page(buffer)) free_page_end = 0; - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %p share page with %p\n", - proc->pid, buffer, prev); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %pK share page with %pK\n", + alloc->pid, buffer, prev); } - if (!list_is_last(&buffer->entry, &proc->alloc.buffers)) { + if (!list_is_last(&buffer->entry, &alloc->buffers)) { next = list_entry(buffer->entry.next, struct binder_buffer, entry); if (buffer_start_page(next) == buffer_end_page(buffer)) { @@ -860,80 +950,88 @@ static void binder_delete_free_buffer(struct binder_proc *proc, if (buffer_start_page(next) == buffer_start_page(buffer)) free_page_start = 0; - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %p share page with %p\n", - proc->pid, buffer, prev); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %pK share page with %pK\n", + alloc->pid, buffer, prev); } } list_del(&buffer->entry); if (free_page_start || free_page_end) { - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %p do not share page%s%s with %p or %p\n", - proc->pid, buffer, free_page_start ? "" : " end", + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %pK do not share page%s%s with %pK or %pK\n", + alloc->pid, buffer, free_page_start ? "" : " end", free_page_end ? "" : " start", prev, next); - binder_update_page_range(proc, 0, free_page_start ? + binder_update_page_range(alloc, 0, free_page_start ? buffer_start_page(buffer) : buffer_end_page(buffer), (free_page_end ? 
buffer_end_page(buffer) : buffer_start_page(buffer)) + PAGE_SIZE, NULL); } } -static void binder_free_buf(struct binder_proc *proc, - struct binder_buffer *buffer) +static void binder_free_buf_locked(struct binder_alloc *alloc, + struct binder_buffer *buffer) { size_t size, buffer_size; - buffer_size = binder_buffer_size(proc, buffer); + buffer_size = binder_alloc_buffer_size(alloc, buffer); size = ALIGN(buffer->data_size, sizeof(void *)) + ALIGN(buffer->offsets_size, sizeof(void *)) + ALIGN(buffer->extra_buffers_size, sizeof(void *)); - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: binder_free_buf %p size %zd buffer_size %zd\n", - proc->pid, buffer, size, buffer_size); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: binder_free_buf %pK size %zd buffer_size %zd\n", + alloc->pid, buffer, size, buffer_size); BUG_ON(buffer->free); BUG_ON(size > buffer_size); BUG_ON(buffer->transaction != NULL); - BUG_ON((void *)buffer < proc->alloc.buffer); - BUG_ON((void *)buffer > proc->alloc.buffer + proc->alloc.buffer_size); + BUG_ON((void *)buffer < alloc->buffer); + BUG_ON((void *)buffer > alloc->buffer + alloc->buffer_size); if (buffer->async_transaction) { - proc->alloc.free_async_space += - size + sizeof(struct binder_buffer); + alloc->free_async_space += size + sizeof(struct binder_buffer); - binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC_ASYNC, "%d: binder_free_buf size %zd async free %zd\n", - proc->pid, size, proc->alloc.free_async_space); + alloc->pid, size, alloc->free_async_space); } - binder_update_page_range(proc, 0, + binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK), NULL); - rb_erase(&buffer->rb_node, &proc->alloc.allocated_buffers); + + rb_erase(&buffer->rb_node, &alloc->allocated_buffers); buffer->free = 1; - if (!list_is_last(&buffer->entry, &proc->alloc.buffers)) { + if (!list_is_last(&buffer->entry, &alloc->buffers)) { struct binder_buffer *next = list_entry(buffer->entry.next, struct binder_buffer, entry); if (next->free) { - rb_erase(&next->rb_node, &proc->alloc.free_buffers); - binder_delete_free_buffer(proc, next); + rb_erase(&next->rb_node, &alloc->free_buffers); + binder_delete_free_buffer(alloc, next); } } - if (proc->alloc.buffers.next != &buffer->entry) { + if (alloc->buffers.next != &buffer->entry) { struct binder_buffer *prev = list_entry(buffer->entry.prev, struct binder_buffer, entry); if (prev->free) { - binder_delete_free_buffer(proc, buffer); - rb_erase(&prev->rb_node, &proc->alloc.free_buffers); + binder_delete_free_buffer(alloc, buffer); + rb_erase(&prev->rb_node, &alloc->free_buffers); buffer = prev; } } - binder_insert_free_buffer(proc, buffer); + binder_insert_free_buffer(alloc, buffer); +} + +void binder_alloc_free_buf(struct binder_alloc *alloc, + struct binder_buffer *buffer) +{ + mutex_lock(&alloc->mutex); + binder_free_buf_locked(alloc, buffer); + mutex_unlock(&alloc->mutex); } static struct binder_node *binder_get_node(struct binder_proc *proc, @@ -1568,7 +1666,8 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, * back to kernel address space to access it */ parent_buffer = parent->buffer - - proc->alloc.user_buffer_offset; + binder_alloc_get_user_buffer_offset( + &proc->alloc); fd_buf_size = sizeof(u32) * fda->num_fds; if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { @@ -1786,7 +1885,8 @@ static int binder_translate_fd_array(struct binder_fd_array_object *fda, * Since the parent 
was already fixed up, convert it * back to the kernel address space to access it */ - parent_buffer = parent->buffer - target_proc->alloc.user_buffer_offset; + parent_buffer = parent->buffer - + binder_alloc_get_user_buffer_offset(&target_proc->alloc); fd_array = (u32 *)(parent_buffer + fda->parent_offset); if (!IS_ALIGNED((unsigned long)fd_array, sizeof(u32))) { binder_user_error("%d:%d parent offset not aligned correctly.\n", @@ -1854,7 +1954,8 @@ static int binder_fixup_parent(struct binder_transaction *t, return -EINVAL; } parent_buffer = (u8 *)(parent->buffer - - target_proc->alloc.user_buffer_offset); + binder_alloc_get_user_buffer_offset( + &target_proc->alloc)); *(binder_uintptr_t *)(parent_buffer + bp->parent_offset) = bp->buffer; return 0; @@ -2040,7 +2141,7 @@ static void binder_transaction(struct binder_proc *proc, trace_binder_transaction(reply, t, target_node); - t->buffer = binder_alloc_buf(target_proc, tr->data_size, + t->buffer = binder_alloc_new_buf(&target_proc->alloc, tr->data_size, tr->offsets_size, extra_buffers_size, !reply && (t->flags & TF_ONE_WAY)); if (t->buffer == NULL) { @@ -2194,7 +2295,8 @@ static void binder_transaction(struct binder_proc *proc, } /* Fixup buffer pointer to target proc address space */ bp->buffer = (uintptr_t)sg_bufp + - target_proc->alloc.user_buffer_offset; + binder_alloc_get_user_buffer_offset( + &target_proc->alloc); sg_bufp += ALIGN(bp->length, sizeof(u64)); ret = binder_fixup_parent(t, thread, bp, off_start, @@ -2252,7 +2354,7 @@ err_copy_data_failed: trace_binder_transaction_failed_buffer_release(t->buffer); binder_transaction_buffer_release(target_proc, t->buffer, offp); t->buffer->transaction = NULL; - binder_free_buf(target_proc, t->buffer); + binder_alloc_free_buf(&target_proc->alloc, t->buffer); err_binder_alloc_buf_failed: kfree(tcomplete); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); @@ -2432,7 +2534,8 @@ static int binder_thread_write(struct binder_proc *proc, return -EFAULT; ptr += sizeof(binder_uintptr_t); - buffer = binder_buffer_lookup(proc, data_ptr); + buffer = binder_alloc_buffer_lookup(&proc->alloc, + data_ptr); if (buffer == NULL) { binder_user_error("%d:%d BC_FREE_BUFFER u%016llx no match\n", proc->pid, thread->pid, (u64)data_ptr); @@ -2462,7 +2565,7 @@ static int binder_thread_write(struct binder_proc *proc, } trace_binder_transaction_buffer_release(buffer); binder_transaction_buffer_release(proc, buffer, NULL); - binder_free_buf(proc, buffer); + binder_alloc_free_buf(&proc->alloc, buffer); break; } @@ -2954,9 +3057,9 @@ retry: tr.data_size = t->buffer->data_size; tr.offsets_size = t->buffer->offsets_size; - tr.data.ptr.buffer = (binder_uintptr_t)( - (uintptr_t)t->buffer->data + - proc->alloc.user_buffer_offset); + tr.data.ptr.buffer = (binder_uintptr_t) + ((uintptr_t)t->buffer->data + + binder_alloc_get_user_buffer_offset(&proc->alloc)); tr.data.ptr.offsets = tr.data.ptr.buffer + ALIGN(t->buffer->data_size, sizeof(void *)); @@ -3365,6 +3468,12 @@ static void binder_vma_open(struct vm_area_struct *vma) (unsigned long)pgprot_val(vma->vm_page_prot)); } +void binder_alloc_vma_close(struct binder_alloc *alloc) +{ + WRITE_ONCE(alloc->vma, NULL); + WRITE_ONCE(alloc->vma_vm_mm, NULL); +} + static void binder_vma_close(struct vm_area_struct *vma) { struct binder_proc *proc = vma->vm_private_data; @@ -3374,8 +3483,7 @@ static void binder_vma_close(struct vm_area_struct *vma) proc->pid, vma->vm_start, vma->vm_end, (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, (unsigned long)pgprot_val(vma->vm_page_prot)); - 
proc->alloc.vma = NULL; - proc->alloc.vma_vm_mm = NULL; + binder_alloc_vma_close(&proc->alloc); binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES); } @@ -3390,35 +3498,16 @@ static const struct vm_operations_struct binder_vm_ops = { .fault = binder_vm_fault, }; -static int binder_mmap(struct file *filp, struct vm_area_struct *vma) +int binder_alloc_mmap_handler(struct binder_alloc *alloc, + struct vm_area_struct *vma) { int ret; struct vm_struct *area; - struct binder_proc *proc = filp->private_data; const char *failure_string; struct binder_buffer *buffer; - if (proc->tsk != current->group_leader) - return -EINVAL; - - if ((vma->vm_end - vma->vm_start) > SZ_4M) - vma->vm_end = vma->vm_start + SZ_4M; - - binder_debug(BINDER_DEBUG_OPEN_CLOSE, - "binder_mmap: %d %lx-%lx (%ld K) vma %lx pagep %lx\n", - proc->pid, vma->vm_start, vma->vm_end, - (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, - (unsigned long)pgprot_val(vma->vm_page_prot)); - - if (vma->vm_flags & FORBIDDEN_MMAP_FLAGS) { - ret = -EPERM; - failure_string = "bad vm_flags"; - goto err_bad_arg; - } - vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE; - - mutex_lock(&binder_mmap_lock); - if (proc->alloc.buffer) { + mutex_lock(&binder_alloc_mmap_lock); + if (alloc->buffer) { ret = -EBUSY; failure_string = "already mapped"; goto err_already_mapped; @@ -3430,74 +3519,111 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) failure_string = "get_vm_area"; goto err_get_vm_area_failed; } - proc->alloc.buffer = area->addr; - proc->alloc.user_buffer_offset = - vma->vm_start - (uintptr_t)proc->alloc.buffer; - mutex_unlock(&binder_mmap_lock); + alloc->buffer = area->addr; + alloc->user_buffer_offset = + vma->vm_start - (uintptr_t)alloc->buffer; + mutex_unlock(&binder_alloc_mmap_lock); #ifdef CONFIG_CPU_CACHE_VIPT if (cache_is_vipt_aliasing()) { while (CACHE_COLOUR( - (vma->vm_start ^ (uint32_t)proc->alloc.buffer))) { + (vma->vm_start ^ (uint32_t)alloc->buffer))) { pr_info("binder_mmap: %d %lx-%lx maps %pK bad alignment\n", - proc->pid, vma->vm_start, - vma->vm_end, proc->alloc.buffer); + alloc->pid, vma->vm_start, vma->vm_end, + alloc->buffer); vma->vm_start += PAGE_SIZE; } } #endif - proc->alloc.pages = - kzalloc(sizeof(proc->alloc.pages[0]) * - ((vma->vm_end - vma->vm_start) / PAGE_SIZE), - GFP_KERNEL); - if (proc->alloc.pages == NULL) { + alloc->pages = kzalloc(sizeof(alloc->pages[0]) * + ((vma->vm_end - vma->vm_start) / PAGE_SIZE), + GFP_KERNEL); + if (alloc->pages == NULL) { ret = -ENOMEM; failure_string = "alloc page array"; goto err_alloc_pages_failed; } - proc->alloc.buffer_size = vma->vm_end - vma->vm_start; + alloc->buffer_size = vma->vm_end - vma->vm_start; - vma->vm_ops = &binder_vm_ops; - vma->vm_private_data = proc; - - if (binder_update_page_range(proc, 1, proc->alloc.buffer, - proc->alloc.buffer + PAGE_SIZE, vma)) { + if (binder_update_page_range(alloc, 1, alloc->buffer, + alloc->buffer + PAGE_SIZE, vma)) { ret = -ENOMEM; failure_string = "alloc small buf"; goto err_alloc_small_buf_failed; } - buffer = proc->alloc.buffer; - INIT_LIST_HEAD(&proc->alloc.buffers); - list_add(&buffer->entry, &proc->alloc.buffers); + buffer = alloc->buffer; + INIT_LIST_HEAD(&alloc->buffers); + list_add(&buffer->entry, &alloc->buffers); buffer->free = 1; - binder_insert_free_buffer(proc, buffer); - proc->alloc.free_async_space = proc->alloc.buffer_size / 2; + binder_insert_free_buffer(alloc, buffer); + alloc->free_async_space = alloc->buffer_size / 2; barrier(); - proc->files = get_files_struct(current); - 
proc->alloc.vma = vma; - proc->alloc.vma_vm_mm = vma->vm_mm; + alloc->vma = vma; + alloc->vma_vm_mm = vma->vm_mm; - /*pr_info("binder_mmap: %d %lx-%lx maps %pK\n", - * proc->pid, vma->vm_start, vma->vm_end, proc->alloc.buffer); - */ return 0; err_alloc_small_buf_failed: - kfree(proc->alloc.pages); - proc->alloc.pages = NULL; + kfree(alloc->pages); + alloc->pages = NULL; err_alloc_pages_failed: - mutex_lock(&binder_mmap_lock); - vfree(proc->alloc.buffer); - proc->alloc.buffer = NULL; + mutex_lock(&binder_alloc_mmap_lock); + vfree(alloc->buffer); + alloc->buffer = NULL; err_get_vm_area_failed: err_already_mapped: - mutex_unlock(&binder_mmap_lock); + mutex_unlock(&binder_alloc_mmap_lock); + pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, + alloc->pid, vma->vm_start, vma->vm_end, failure_string, ret); + return ret; +} + +static int binder_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int ret; + struct binder_proc *proc = filp->private_data; + const char *failure_string; + + if (proc->tsk != current->group_leader) + return -EINVAL; + + if ((vma->vm_end - vma->vm_start) > SZ_4M) + vma->vm_end = vma->vm_start + SZ_4M; + + binder_debug(BINDER_DEBUG_OPEN_CLOSE, + "%s: %d %lx-%lx (%ld K) vma %lx pagep %lx\n", + __func__, proc->pid, vma->vm_start, vma->vm_end, + (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, + (unsigned long)pgprot_val(vma->vm_page_prot)); + + if (vma->vm_flags & FORBIDDEN_MMAP_FLAGS) { + ret = -EPERM; + failure_string = "bad vm_flags"; + goto err_bad_arg; + } + vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE; + vma->vm_ops = &binder_vm_ops; + vma->vm_private_data = proc; + + ret = binder_alloc_mmap_handler(&proc->alloc, vma); + if (ret) + return ret; + proc->files = get_files_struct(current); + return 0; + err_bad_arg: pr_err("binder_mmap: %d %lx-%lx %s failed %d\n", proc->pid, vma->vm_start, vma->vm_end, failure_string, ret); return ret; } +void binder_alloc_init(struct binder_alloc *alloc) +{ + alloc->tsk = current->group_leader; + alloc->pid = current->group_leader->pid; + mutex_init(&alloc->mutex); +} + static int binder_open(struct inode *nodp, struct file *filp) { struct binder_proc *proc; @@ -3517,6 +3643,7 @@ static int binder_open(struct inode *nodp, struct file *filp) binder_dev = container_of(filp->private_data, struct binder_device, miscdev); proc->context = &binder_dev->context; + binder_alloc_init(&proc->alloc); binder_lock(__func__); @@ -3632,14 +3759,61 @@ static int binder_node_release(struct binder_node *node, int refs) return refs; } +void binder_alloc_deferred_release(struct binder_alloc *alloc) +{ + struct rb_node *n; + int buffers, page_count; + + BUG_ON(alloc->vma); + + buffers = 0; + mutex_lock(&alloc->mutex); + while ((n = rb_first(&alloc->allocated_buffers))) { + struct binder_buffer *buffer; + + buffer = rb_entry(n, struct binder_buffer, rb_node); + + /* Transaction should already have been freed */ + BUG_ON(buffer->transaction); + + binder_free_buf_locked(alloc, buffer); + buffers++; + } + + page_count = 0; + if (alloc->pages) { + int i; + + for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { + void *page_addr; + + if (!alloc->pages[i]) + continue; + + page_addr = alloc->buffer + i * PAGE_SIZE; + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%s: %d: page %d at %pK not freed\n", + __func__, alloc->pid, i, page_addr); + unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); + __free_page(alloc->pages[i]); + page_count++; + } + kfree(alloc->pages); + vfree(alloc->buffer); + } + mutex_unlock(&alloc->mutex); + + 
binder_alloc_debug(BINDER_ALLOC_DEBUG_OPEN_CLOSE, + "%s: %d buffers %d, pages %d\n", + __func__, alloc->pid, buffers, page_count); +} + static void binder_deferred_release(struct binder_proc *proc) { struct binder_context *context = proc->context; struct rb_node *n; - int threads, nodes, incoming_refs, outgoing_refs, buffers, - active_transactions, page_count; + int threads, nodes, incoming_refs, outgoing_refs, active_transactions; - BUG_ON(proc->alloc.vma); BUG_ON(proc->files); hlist_del(&proc->proc_node); @@ -3685,49 +3859,15 @@ static void binder_deferred_release(struct binder_proc *proc) binder_release_work(&proc->todo); binder_release_work(&proc->delivered_death); - buffers = 0; - while ((n = rb_first(&proc->alloc.allocated_buffers))) { - struct binder_buffer *buffer; - - buffer = rb_entry(n, struct binder_buffer, rb_node); - - /* Transaction should already have been freed */ - BUG_ON(buffer->transaction); - - binder_free_buf(proc, buffer); - buffers++; - } - + binder_alloc_deferred_release(&proc->alloc); binder_stats_deleted(BINDER_STAT_PROC); - page_count = 0; - if (proc->alloc.pages) { - int i; - - for (i = 0; i < proc->alloc.buffer_size / PAGE_SIZE; i++) { - void *page_addr; - - if (!proc->alloc.pages[i]) - continue; - - page_addr = proc->alloc.buffer + i * PAGE_SIZE; - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%s: %d: page %d at %p not freed\n", - __func__, proc->pid, i, page_addr); - unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); - __free_page(proc->alloc.pages[i]); - page_count++; - } - kfree(proc->alloc.pages); - vfree(proc->alloc.buffer); - } - put_task_struct(proc->tsk); binder_debug(BINDER_DEBUG_OPEN_CLOSE, - "%s: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d, buffers %d, pages %d\n", + "%s: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d\n", __func__, proc->pid, threads, nodes, incoming_refs, - outgoing_refs, active_transactions, buffers, page_count); + outgoing_refs, active_transactions); kfree(proc); } @@ -3810,15 +3950,6 @@ static void print_binder_transaction(struct seq_file *m, const char *prefix, t->buffer->data); } -static void print_binder_buffer(struct seq_file *m, const char *prefix, - struct binder_buffer *buffer) -{ - seq_printf(m, "%s %d: %p size %zd:%zd %s\n", - prefix, buffer->debug_id, buffer->data, - buffer->data_size, buffer->offsets_size, - buffer->transaction ? "active" : "delivered"); -} - static void print_binder_work(struct seq_file *m, const char *prefix, const char *transaction_prefix, struct binder_work *w) @@ -3921,6 +4052,27 @@ static void print_binder_ref(struct seq_file *m, struct binder_ref *ref) ref->node->debug_id, ref->strong, ref->weak, ref->death); } +static void print_binder_buffer(struct seq_file *m, const char *prefix, + struct binder_buffer *buffer) +{ + seq_printf(m, "%s %d: %pK size %zd:%zd %s\n", + prefix, buffer->debug_id, buffer->data, + buffer->data_size, buffer->offsets_size, + buffer->transaction ? 
"active" : "delivered"); +} + +void binder_alloc_print_allocated(struct seq_file *m, + struct binder_alloc *alloc) +{ + struct rb_node *n; + + mutex_lock(&alloc->mutex); + for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n)) + print_binder_buffer(m, " buffer", + rb_entry(n, struct binder_buffer, rb_node)); + mutex_unlock(&alloc->mutex); +} + static void print_binder_proc(struct seq_file *m, struct binder_proc *proc, int print_all) { @@ -3949,10 +4101,7 @@ static void print_binder_proc(struct seq_file *m, print_binder_ref(m, rb_entry(n, struct binder_ref, rb_node_desc)); } - for (n = rb_first(&proc->alloc.allocated_buffers); - n != NULL; n = rb_next(n)) - print_binder_buffer(m, " buffer", - rb_entry(n, struct binder_buffer, rb_node)); + binder_alloc_print_allocated(m, &proc->alloc); list_for_each_entry(w, &proc->todo, entry) print_binder_work(m, " ", " pending transaction", w); list_for_each_entry(w, &proc->delivered_death, entry) { @@ -4050,6 +4199,18 @@ static void print_binder_stats(struct seq_file *m, const char *prefix, } } +int binder_alloc_get_allocated_count(struct binder_alloc *alloc) +{ + struct rb_node *n; + int count = 0; + + mutex_lock(&alloc->mutex); + for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n)) + count++; + mutex_unlock(&alloc->mutex); + return count; +} + static void print_binder_proc_stats(struct seq_file *m, struct binder_proc *proc) { @@ -4067,7 +4228,8 @@ static void print_binder_proc_stats(struct seq_file *m, " ready threads %d\n" " free async space %zd\n", proc->requested_threads, proc->requested_threads_started, proc->max_threads, - proc->ready_threads, proc->alloc.free_async_space); + proc->ready_threads, + binder_alloc_get_free_async_space(&proc->alloc)); count = 0; for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) count++; @@ -4084,10 +4246,7 @@ static void print_binder_proc_stats(struct seq_file *m, } seq_printf(m, " refs: %d s %d w %d\n", count, strong, weak); - count = 0; - for (n = rb_first(&proc->alloc.allocated_buffers); - n != NULL; n = rb_next(n)) - count++; + count = binder_alloc_get_allocated_count(&proc->alloc); seq_printf(m, " buffers: %d\n", count); count = 0; diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h index c835f09656c1..50b0d21f42cf 100644 --- a/drivers/android/binder_trace.h +++ b/drivers/android/binder_trace.h @@ -23,6 +23,7 @@ struct binder_buffer; struct binder_node; struct binder_proc; +struct binder_alloc; struct binder_ref; struct binder_thread; struct binder_transaction; @@ -268,9 +269,9 @@ DEFINE_EVENT(binder_buffer_class, binder_transaction_failed_buffer_release, TP_ARGS(buffer)); TRACE_EVENT(binder_update_page_range, - TP_PROTO(struct binder_proc *proc, bool allocate, + TP_PROTO(struct binder_alloc *alloc, bool allocate, void *start, void *end), - TP_ARGS(proc, allocate, start, end), + TP_ARGS(alloc, allocate, start, end), TP_STRUCT__entry( __field(int, proc) __field(bool, allocate) @@ -278,9 +279,9 @@ TRACE_EVENT(binder_update_page_range, __field(size_t, size) ), TP_fast_assign( - __entry->proc = proc->pid; + __entry->proc = alloc->pid; __entry->allocate = allocate; - __entry->offset = start - proc->alloc.buffer; + __entry->offset = start - alloc->buffer; __entry->size = end - start; ), TP_printk("proc=%d allocate=%d offset=%zu size=%zu", From 2324f70c5aab8b8e5f4c100a80d1dd4999991285 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 10 Oct 2016 10:40:53 -0700 Subject: [PATCH 0904/3220] FROMLIST: binder: move binder_alloc to separate file (from 
https://patchwork.kernel.org/patch/9817753/) Move the binder allocator functionality to its own file Continuation of splitting the binder allocator from the binder driver. Split binder_alloc functions from normal binder functions. Add kernel doc comments to functions declared extern in binder_alloc.h Change-Id: I8f1a967375359078b8e63c7b6b88a752c374a64a Signed-off-by: Todd Kjos --- drivers/android/Makefile | 2 +- drivers/android/binder.c | 763 +-------------------------------- drivers/android/binder_alloc.c | 759 ++++++++++++++++++++++++++++++++ drivers/android/binder_alloc.h | 162 +++++++ 4 files changed, 923 insertions(+), 763 deletions(-) create mode 100644 drivers/android/binder_alloc.c create mode 100644 drivers/android/binder_alloc.h diff --git a/drivers/android/Makefile b/drivers/android/Makefile index 3b7e4b072c58..4b7c726bb560 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -1,3 +1,3 @@ ccflags-y += -I$(src) # needed for trace events -obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o +obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o diff --git a/drivers/android/binder.c b/drivers/android/binder.c index e73481a8ea2c..3251b7cdd717 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -34,8 +33,6 @@ #include #include #include -#include -#include #include #include @@ -44,11 +41,11 @@ #endif #include +#include "binder_alloc.h" #include "binder_trace.h" static DEFINE_MUTEX(binder_main_lock); static DEFINE_MUTEX(binder_deferred_lock); -static DEFINE_MUTEX(binder_alloc_mmap_lock); static HLIST_HEAD(binder_devices); static HLIST_HEAD(binder_procs); @@ -157,27 +154,6 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, #define to_binder_fd_array_object(hdr) \ container_of(hdr, struct binder_fd_array_object, hdr) -/* - * debug declarations for binder_alloc. To be - * moved to binder_alloc.c - */ -enum { - BINDER_ALLOC_DEBUG_OPEN_CLOSE = 1U << 1, - BINDER_ALLOC_DEBUG_BUFFER_ALLOC = 1U << 2, - BINDER_ALLOC_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 3, -}; -static uint32_t binder_alloc_debug_mask; - -module_param_named(alloc_debug_mask, binder_alloc_debug_mask, - uint, S_IWUSR | S_IRUGO); - -#define binder_alloc_debug(mask, x...) 
\ - do { \ - if (binder_alloc_debug_mask & mask) \ - pr_info(x); \ - } while (0) -/* end of binder_alloc debug declarations */ - enum binder_stat_types { BINDER_STAT_PROC, BINDER_STAT_THREAD, @@ -314,68 +290,12 @@ struct binder_ref { struct binder_ref_death *death; }; -struct binder_buffer { - struct list_head entry; /* free and allocated entries by address */ - struct rb_node rb_node; /* free entry by size or allocated entry */ - /* by address */ - unsigned free:1; - unsigned allow_user_free:1; - unsigned async_transaction:1; - unsigned debug_id:29; - - struct binder_transaction *transaction; - - struct binder_node *target_node; - size_t data_size; - size_t offsets_size; - size_t extra_buffers_size; - uint8_t data[0]; -}; - enum binder_deferred_state { BINDER_DEFERRED_PUT_FILES = 0x01, BINDER_DEFERRED_FLUSH = 0x02, BINDER_DEFERRED_RELEASE = 0x04, }; -/** - * struct binder_alloc - per-binder proc state for binder allocator - * @vma: vm_area_struct passed to mmap_handler - * (invarient after mmap) - * @vma_vm_mm: copy of vma->vm_mm (invarient after mmap) - * @buffer: base of per-proc address space mapped via mmap - * @user_buffer_offset: offset between user and kernel VAs for buffer - * @buffers: list of all buffers for this proc - * @free_buffers: rb tree of buffers available for allocation - * sorted by size - * @allocated_buffers: rb tree of allocated buffers sorted by address - * @free_async_space: VA space available for async buffers. This is - * initialized at mmap time to 1/2 the full VA space - * @pages: array of physical page addresses for each page of - * mmap'd space - * @buffer_size: size of address space (could be less than requested) - * - * Bookkeeping structure for per-proc address space management for binder - * buffers. It is normally initialized during binder_init() and binder_mmap() - * calls. The address space is used for both user-visible buffers and for - * struct binder_buffer objects used to track the user buffers - */ -struct binder_alloc { - struct mutex mutex; - struct task_struct *tsk; - struct vm_area_struct *vma; - struct mm_struct *vma_vm_mm; - void *buffer; - ptrdiff_t user_buffer_offset; - struct list_head buffers; - struct rb_root free_buffers; - struct rb_root allocated_buffers; - size_t free_async_space; - struct page **pages; - size_t buffer_size; - int pid; -}; - struct binder_proc { struct hlist_node proc_node; struct rb_root threads; @@ -445,56 +365,6 @@ struct binder_transaction { kuid_t sender_euid; }; -/* - * Forward declarations of binder_alloc functions. - * These will be moved to binder_alloc.h when - * binder_alloc is moved to its own files. 
- */ -extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, - size_t data_size, - size_t offsets_size, - size_t extra_buffers_size, - int is_async); -extern void binder_alloc_init(struct binder_alloc *alloc); -extern void binder_alloc_vma_close(struct binder_alloc *alloc); -extern struct binder_buffer * -binder_alloc_buffer_lookup(struct binder_alloc *alloc, - uintptr_t user_ptr); -extern void binder_alloc_free_buf(struct binder_alloc *alloc, - struct binder_buffer *buffer); -extern int binder_alloc_mmap_handler(struct binder_alloc *alloc, - struct vm_area_struct *vma); -extern void binder_alloc_deferred_release(struct binder_alloc *alloc); -extern int binder_alloc_get_allocated_count(struct binder_alloc *alloc); -extern void binder_alloc_print_allocated(struct seq_file *m, - struct binder_alloc *alloc); - -static inline size_t -binder_alloc_get_free_async_space(struct binder_alloc *alloc) -{ - size_t free_async_space; - - mutex_lock(&alloc->mutex); - free_async_space = alloc->free_async_space; - mutex_unlock(&alloc->mutex); - return free_async_space; -} - -static inline ptrdiff_t -binder_alloc_get_user_buffer_offset(struct binder_alloc *alloc) -{ - /* - * user_buffer_offset is constant if vma is set and - * undefined if vma is not set. It is possible to - * get here with !alloc->vma if the target process - * is dying while a transaction is being initiated. - * Returning the old value is ok in this case and - * the transaction will fail. - */ - return alloc->user_buffer_offset; -} -/* end of binder_alloc declarations */ - static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); @@ -578,462 +448,6 @@ static void binder_set_nice(long nice) binder_user_error("%d RLIMIT_NICE not set\n", current->pid); } -static size_t binder_alloc_buffer_size(struct binder_alloc *alloc, - struct binder_buffer *buffer) -{ - if (list_is_last(&buffer->entry, &alloc->buffers)) - return alloc->buffer + - alloc->buffer_size - (void *)buffer->data; - return (size_t)list_entry(buffer->entry.next, - struct binder_buffer, entry) - (size_t)buffer->data; -} - -static void binder_insert_free_buffer(struct binder_alloc *alloc, - struct binder_buffer *new_buffer) -{ - struct rb_node **p = &alloc->free_buffers.rb_node; - struct rb_node *parent = NULL; - struct binder_buffer *buffer; - size_t buffer_size; - size_t new_buffer_size; - - BUG_ON(!new_buffer->free); - - new_buffer_size = binder_alloc_buffer_size(alloc, new_buffer); - - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: add free buffer, size %zd, at %pK\n", - alloc->pid, new_buffer_size, new_buffer); - - while (*p) { - parent = *p; - buffer = rb_entry(parent, struct binder_buffer, rb_node); - BUG_ON(!buffer->free); - - buffer_size = binder_alloc_buffer_size(alloc, buffer); - - if (new_buffer_size < buffer_size) - p = &parent->rb_left; - else - p = &parent->rb_right; - } - rb_link_node(&new_buffer->rb_node, parent, p); - rb_insert_color(&new_buffer->rb_node, &alloc->free_buffers); -} - -static void binder_insert_allocated_buffer(struct binder_alloc *alloc, - struct binder_buffer *new_buffer) -{ - struct rb_node **p = &alloc->allocated_buffers.rb_node; - struct rb_node *parent = NULL; - struct binder_buffer *buffer; - - BUG_ON(new_buffer->free); - - while (*p) { - parent = *p; - buffer = rb_entry(parent, struct binder_buffer, rb_node); - BUG_ON(buffer->free); - - if (new_buffer < buffer) - p = &parent->rb_left; - else if (new_buffer > buffer) - p = &parent->rb_right; - else - BUG(); - } - 
rb_link_node(&new_buffer->rb_node, parent, p); - rb_insert_color(&new_buffer->rb_node, &alloc->allocated_buffers); -} - -static struct binder_buffer *binder_alloc_buffer_lookup_locked( - struct binder_alloc *alloc, - uintptr_t user_ptr) -{ - struct rb_node *n = alloc->allocated_buffers.rb_node; - struct binder_buffer *buffer; - struct binder_buffer *kern_ptr; - - kern_ptr = (struct binder_buffer *)(user_ptr - alloc->user_buffer_offset - - offsetof(struct binder_buffer, data)); - - while (n) { - buffer = rb_entry(n, struct binder_buffer, rb_node); - BUG_ON(buffer->free); - - if (kern_ptr < buffer) - n = n->rb_left; - else if (kern_ptr > buffer) - n = n->rb_right; - else - return buffer; - } - return NULL; -} - -struct binder_buffer *binder_alloc_buffer_lookup(struct binder_alloc *alloc, - uintptr_t user_ptr) -{ - struct binder_buffer *buffer; - - mutex_lock(&alloc->mutex); - buffer = binder_alloc_buffer_lookup_locked(alloc, user_ptr); - mutex_unlock(&alloc->mutex); - return buffer; -} - -static int binder_update_page_range(struct binder_alloc *alloc, int allocate, - void *start, void *end, - struct vm_area_struct *vma) -{ - void *page_addr; - unsigned long user_page_addr; - struct page **page; - struct mm_struct *mm; - - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: %s pages %pK-%pK\n", alloc->pid, - allocate ? "allocate" : "free", start, end); - - if (end <= start) - return 0; - - trace_binder_update_page_range(alloc, allocate, start, end); - - if (vma) - mm = NULL; - else - mm = get_task_mm(alloc->tsk); - - if (mm) { - down_write(&mm->mmap_sem); - vma = alloc->vma; - if (vma && mm != alloc->vma_vm_mm) { - pr_err("%d: vma mm and task mm mismatch\n", - alloc->pid); - vma = NULL; - } - } - - if (allocate == 0) - goto free_range; - - if (vma == NULL) { - pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n", - alloc->pid); - goto err_no_vma; - } - - for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { - int ret; - - page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; - - BUG_ON(*page); - *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); - if (*page == NULL) { - pr_err("%d: binder_alloc_buf failed for page at %pK\n", - alloc->pid, page_addr); - goto err_alloc_page_failed; - } - ret = map_kernel_range_noflush((unsigned long)page_addr, - PAGE_SIZE, PAGE_KERNEL, page); - flush_cache_vmap((unsigned long)page_addr, - (unsigned long)page_addr + PAGE_SIZE); - if (ret != 1) { - pr_err("%d: binder_alloc_buf failed to map page at %pK in kernel\n", - alloc->pid, page_addr); - goto err_map_kernel_failed; - } - user_page_addr = - (uintptr_t)page_addr + alloc->user_buffer_offset; - ret = vm_insert_page(vma, user_page_addr, page[0]); - if (ret) { - pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n", - alloc->pid, user_page_addr); - goto err_vm_insert_page_failed; - } - /* vm_insert_page does not seem to increment the refcount */ - } - if (mm) { - up_write(&mm->mmap_sem); - mmput(mm); - } - return 0; - -free_range: - for (page_addr = end - PAGE_SIZE; page_addr >= start; - page_addr -= PAGE_SIZE) { - page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; - if (vma) - zap_page_range(vma, (uintptr_t)page_addr + - alloc->user_buffer_offset, PAGE_SIZE, NULL); -err_vm_insert_page_failed: - unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); -err_map_kernel_failed: - __free_page(*page); - *page = NULL; -err_alloc_page_failed: - ; - } -err_no_vma: - if (mm) { - up_write(&mm->mmap_sem); - mmput(mm); - } - return 
-ENOMEM; -} - -static struct binder_buffer *binder_alloc_new_buf_locked( - struct binder_alloc *alloc, size_t data_size, - size_t offsets_size, size_t extra_buffers_size, int is_async) -{ - struct rb_node *n = alloc->free_buffers.rb_node; - struct binder_buffer *buffer; - size_t buffer_size; - struct rb_node *best_fit = NULL; - void *has_page_addr; - void *end_page_addr; - size_t size, data_offsets_size; - - if (alloc->vma == NULL) { - pr_err("%d: binder_alloc_buf, no vma\n", - alloc->pid); - return NULL; - } - - data_offsets_size = ALIGN(data_size, sizeof(void *)) + - ALIGN(offsets_size, sizeof(void *)); - - if (data_offsets_size < data_size || data_offsets_size < offsets_size) { - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: got transaction with invalid size %zd-%zd\n", - alloc->pid, data_size, offsets_size); - return NULL; - } - size = data_offsets_size + ALIGN(extra_buffers_size, sizeof(void *)); - if (size < data_offsets_size || size < extra_buffers_size) { - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: got transaction with invalid extra_buffers_size %zd\n", - alloc->pid, extra_buffers_size); - return NULL; - } - if (is_async && - alloc->free_async_space < size + sizeof(struct binder_buffer)) { - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: binder_alloc_buf size %zd failed, no async space left\n", - alloc->pid, size); - return NULL; - } - - while (n) { - buffer = rb_entry(n, struct binder_buffer, rb_node); - BUG_ON(!buffer->free); - buffer_size = binder_alloc_buffer_size(alloc, buffer); - - if (size < buffer_size) { - best_fit = n; - n = n->rb_left; - } else if (size > buffer_size) - n = n->rb_right; - else { - best_fit = n; - break; - } - } - if (best_fit == NULL) { - pr_err("%d: binder_alloc_buf size %zd failed, no address space\n", - alloc->pid, size); - return NULL; - } - if (n == NULL) { - buffer = rb_entry(best_fit, struct binder_buffer, rb_node); - buffer_size = binder_alloc_buffer_size(alloc, buffer); - } - - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: binder_alloc_buf size %zd got buffer %pK size %zd\n", - alloc->pid, size, buffer, buffer_size); - - has_page_addr = - (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK); - if (n == NULL) { - if (size + sizeof(struct binder_buffer) + 4 >= buffer_size) - buffer_size = size; /* no room for other buffers */ - else - buffer_size = size + sizeof(struct binder_buffer); - } - end_page_addr = - (void *)PAGE_ALIGN((uintptr_t)buffer->data + buffer_size); - if (end_page_addr > has_page_addr) - end_page_addr = has_page_addr; - if (binder_update_page_range(alloc, 1, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL)) - return NULL; - - rb_erase(best_fit, &alloc->free_buffers); - buffer->free = 0; - binder_insert_allocated_buffer(alloc, buffer); - if (buffer_size != size) { - struct binder_buffer *new_buffer = (void *)buffer->data + size; - - list_add(&new_buffer->entry, &buffer->entry); - new_buffer->free = 1; - binder_insert_free_buffer(alloc, new_buffer); - } - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: binder_alloc_buf size %zd got %pK\n", - alloc->pid, size, buffer); - buffer->data_size = data_size; - buffer->offsets_size = offsets_size; - buffer->async_transaction = is_async; - buffer->extra_buffers_size = extra_buffers_size; - if (is_async) { - alloc->free_async_space -= size + sizeof(struct binder_buffer); - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC_ASYNC, - "%d: binder_alloc_buf size %zd async free %zd\n", - alloc->pid, size, 
alloc->free_async_space); - } - - return buffer; -} - -struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, - size_t data_size, - size_t offsets_size, - size_t extra_buffers_size, - int is_async) -{ - struct binder_buffer *buffer; - - mutex_lock(&alloc->mutex); - buffer = binder_alloc_new_buf_locked(alloc, data_size, offsets_size, - extra_buffers_size, is_async); - mutex_unlock(&alloc->mutex); - return buffer; -} - -static void *buffer_start_page(struct binder_buffer *buffer) -{ - return (void *)((uintptr_t)buffer & PAGE_MASK); -} - -static void *buffer_end_page(struct binder_buffer *buffer) -{ - return (void *)(((uintptr_t)(buffer + 1) - 1) & PAGE_MASK); -} - -static void binder_delete_free_buffer(struct binder_alloc *alloc, - struct binder_buffer *buffer) -{ - struct binder_buffer *prev, *next = NULL; - int free_page_end = 1; - int free_page_start = 1; - - BUG_ON(alloc->buffers.next == &buffer->entry); - prev = list_entry(buffer->entry.prev, struct binder_buffer, entry); - BUG_ON(!prev->free); - if (buffer_end_page(prev) == buffer_start_page(buffer)) { - free_page_start = 0; - if (buffer_end_page(prev) == buffer_end_page(buffer)) - free_page_end = 0; - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %pK share page with %pK\n", - alloc->pid, buffer, prev); - } - - if (!list_is_last(&buffer->entry, &alloc->buffers)) { - next = list_entry(buffer->entry.next, - struct binder_buffer, entry); - if (buffer_start_page(next) == buffer_end_page(buffer)) { - free_page_end = 0; - if (buffer_start_page(next) == - buffer_start_page(buffer)) - free_page_start = 0; - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %pK share page with %pK\n", - alloc->pid, buffer, prev); - } - } - list_del(&buffer->entry); - if (free_page_start || free_page_end) { - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %pK do not share page%s%s with %pK or %pK\n", - alloc->pid, buffer, free_page_start ? "" : " end", - free_page_end ? "" : " start", prev, next); - binder_update_page_range(alloc, 0, free_page_start ? - buffer_start_page(buffer) : buffer_end_page(buffer), - (free_page_end ? 
buffer_end_page(buffer) : - buffer_start_page(buffer)) + PAGE_SIZE, NULL); - } -} - -static void binder_free_buf_locked(struct binder_alloc *alloc, - struct binder_buffer *buffer) -{ - size_t size, buffer_size; - - buffer_size = binder_alloc_buffer_size(alloc, buffer); - - size = ALIGN(buffer->data_size, sizeof(void *)) + - ALIGN(buffer->offsets_size, sizeof(void *)) + - ALIGN(buffer->extra_buffers_size, sizeof(void *)); - - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: binder_free_buf %pK size %zd buffer_size %zd\n", - alloc->pid, buffer, size, buffer_size); - - BUG_ON(buffer->free); - BUG_ON(size > buffer_size); - BUG_ON(buffer->transaction != NULL); - BUG_ON((void *)buffer < alloc->buffer); - BUG_ON((void *)buffer > alloc->buffer + alloc->buffer_size); - - if (buffer->async_transaction) { - alloc->free_async_space += size + sizeof(struct binder_buffer); - - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC_ASYNC, - "%d: binder_free_buf size %zd async free %zd\n", - alloc->pid, size, alloc->free_async_space); - } - - binder_update_page_range(alloc, 0, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), - (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK), - NULL); - - rb_erase(&buffer->rb_node, &alloc->allocated_buffers); - buffer->free = 1; - if (!list_is_last(&buffer->entry, &alloc->buffers)) { - struct binder_buffer *next = list_entry(buffer->entry.next, - struct binder_buffer, entry); - - if (next->free) { - rb_erase(&next->rb_node, &alloc->free_buffers); - binder_delete_free_buffer(alloc, next); - } - } - if (alloc->buffers.next != &buffer->entry) { - struct binder_buffer *prev = list_entry(buffer->entry.prev, - struct binder_buffer, entry); - - if (prev->free) { - binder_delete_free_buffer(alloc, buffer); - rb_erase(&prev->rb_node, &alloc->free_buffers); - buffer = prev; - } - } - binder_insert_free_buffer(alloc, buffer); -} - -void binder_alloc_free_buf(struct binder_alloc *alloc, - struct binder_buffer *buffer) -{ - mutex_lock(&alloc->mutex); - binder_free_buf_locked(alloc, buffer); - mutex_unlock(&alloc->mutex); -} - static struct binder_node *binder_get_node(struct binder_proc *proc, binder_uintptr_t ptr) { @@ -3468,12 +2882,6 @@ static void binder_vma_open(struct vm_area_struct *vma) (unsigned long)pgprot_val(vma->vm_page_prot)); } -void binder_alloc_vma_close(struct binder_alloc *alloc) -{ - WRITE_ONCE(alloc->vma, NULL); - WRITE_ONCE(alloc->vma_vm_mm, NULL); -} - static void binder_vma_close(struct vm_area_struct *vma) { struct binder_proc *proc = vma->vm_private_data; @@ -3498,86 +2906,6 @@ static const struct vm_operations_struct binder_vm_ops = { .fault = binder_vm_fault, }; -int binder_alloc_mmap_handler(struct binder_alloc *alloc, - struct vm_area_struct *vma) -{ - int ret; - struct vm_struct *area; - const char *failure_string; - struct binder_buffer *buffer; - - mutex_lock(&binder_alloc_mmap_lock); - if (alloc->buffer) { - ret = -EBUSY; - failure_string = "already mapped"; - goto err_already_mapped; - } - - area = get_vm_area(vma->vm_end - vma->vm_start, VM_IOREMAP); - if (area == NULL) { - ret = -ENOMEM; - failure_string = "get_vm_area"; - goto err_get_vm_area_failed; - } - alloc->buffer = area->addr; - alloc->user_buffer_offset = - vma->vm_start - (uintptr_t)alloc->buffer; - mutex_unlock(&binder_alloc_mmap_lock); - -#ifdef CONFIG_CPU_CACHE_VIPT - if (cache_is_vipt_aliasing()) { - while (CACHE_COLOUR( - (vma->vm_start ^ (uint32_t)alloc->buffer))) { - pr_info("binder_mmap: %d %lx-%lx maps %pK bad alignment\n", - alloc->pid, vma->vm_start, 
vma->vm_end, - alloc->buffer); - vma->vm_start += PAGE_SIZE; - } - } -#endif - alloc->pages = kzalloc(sizeof(alloc->pages[0]) * - ((vma->vm_end - vma->vm_start) / PAGE_SIZE), - GFP_KERNEL); - if (alloc->pages == NULL) { - ret = -ENOMEM; - failure_string = "alloc page array"; - goto err_alloc_pages_failed; - } - alloc->buffer_size = vma->vm_end - vma->vm_start; - - if (binder_update_page_range(alloc, 1, alloc->buffer, - alloc->buffer + PAGE_SIZE, vma)) { - ret = -ENOMEM; - failure_string = "alloc small buf"; - goto err_alloc_small_buf_failed; - } - buffer = alloc->buffer; - INIT_LIST_HEAD(&alloc->buffers); - list_add(&buffer->entry, &alloc->buffers); - buffer->free = 1; - binder_insert_free_buffer(alloc, buffer); - alloc->free_async_space = alloc->buffer_size / 2; - barrier(); - alloc->vma = vma; - alloc->vma_vm_mm = vma->vm_mm; - - return 0; - -err_alloc_small_buf_failed: - kfree(alloc->pages); - alloc->pages = NULL; -err_alloc_pages_failed: - mutex_lock(&binder_alloc_mmap_lock); - vfree(alloc->buffer); - alloc->buffer = NULL; -err_get_vm_area_failed: -err_already_mapped: - mutex_unlock(&binder_alloc_mmap_lock); - pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, - alloc->pid, vma->vm_start, vma->vm_end, failure_string, ret); - return ret; -} - static int binder_mmap(struct file *filp, struct vm_area_struct *vma) { int ret; @@ -3617,13 +2945,6 @@ err_bad_arg: return ret; } -void binder_alloc_init(struct binder_alloc *alloc) -{ - alloc->tsk = current->group_leader; - alloc->pid = current->group_leader->pid; - mutex_init(&alloc->mutex); -} - static int binder_open(struct inode *nodp, struct file *filp) { struct binder_proc *proc; @@ -3759,55 +3080,6 @@ static int binder_node_release(struct binder_node *node, int refs) return refs; } -void binder_alloc_deferred_release(struct binder_alloc *alloc) -{ - struct rb_node *n; - int buffers, page_count; - - BUG_ON(alloc->vma); - - buffers = 0; - mutex_lock(&alloc->mutex); - while ((n = rb_first(&alloc->allocated_buffers))) { - struct binder_buffer *buffer; - - buffer = rb_entry(n, struct binder_buffer, rb_node); - - /* Transaction should already have been freed */ - BUG_ON(buffer->transaction); - - binder_free_buf_locked(alloc, buffer); - buffers++; - } - - page_count = 0; - if (alloc->pages) { - int i; - - for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { - void *page_addr; - - if (!alloc->pages[i]) - continue; - - page_addr = alloc->buffer + i * PAGE_SIZE; - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%s: %d: page %d at %pK not freed\n", - __func__, alloc->pid, i, page_addr); - unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); - __free_page(alloc->pages[i]); - page_count++; - } - kfree(alloc->pages); - vfree(alloc->buffer); - } - mutex_unlock(&alloc->mutex); - - binder_alloc_debug(BINDER_ALLOC_DEBUG_OPEN_CLOSE, - "%s: %d buffers %d, pages %d\n", - __func__, alloc->pid, buffers, page_count); -} - static void binder_deferred_release(struct binder_proc *proc) { struct binder_context *context = proc->context; @@ -4052,27 +3324,6 @@ static void print_binder_ref(struct seq_file *m, struct binder_ref *ref) ref->node->debug_id, ref->strong, ref->weak, ref->death); } -static void print_binder_buffer(struct seq_file *m, const char *prefix, - struct binder_buffer *buffer) -{ - seq_printf(m, "%s %d: %pK size %zd:%zd %s\n", - prefix, buffer->debug_id, buffer->data, - buffer->data_size, buffer->offsets_size, - buffer->transaction ? 
"active" : "delivered"); -} - -void binder_alloc_print_allocated(struct seq_file *m, - struct binder_alloc *alloc) -{ - struct rb_node *n; - - mutex_lock(&alloc->mutex); - for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n)) - print_binder_buffer(m, " buffer", - rb_entry(n, struct binder_buffer, rb_node)); - mutex_unlock(&alloc->mutex); -} - static void print_binder_proc(struct seq_file *m, struct binder_proc *proc, int print_all) { @@ -4199,18 +3450,6 @@ static void print_binder_stats(struct seq_file *m, const char *prefix, } } -int binder_alloc_get_allocated_count(struct binder_alloc *alloc) -{ - struct rb_node *n; - int count = 0; - - mutex_lock(&alloc->mutex); - for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n)) - count++; - mutex_unlock(&alloc->mutex); - return count; -} - static void print_binder_proc_stats(struct seq_file *m, struct binder_proc *proc) { diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c new file mode 100644 index 000000000000..eaef8f8fb859 --- /dev/null +++ b/drivers/android/binder_alloc.c @@ -0,0 +1,759 @@ +/* binder_alloc.c + * + * Android IPC Subsystem + * + * Copyright (C) 2007-2017 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "binder_alloc.h" +#include "binder_trace.h" + +static DEFINE_MUTEX(binder_alloc_mmap_lock); + +enum { + BINDER_DEBUG_OPEN_CLOSE = 1U << 1, + BINDER_DEBUG_BUFFER_ALLOC = 1U << 2, + BINDER_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 3, +}; +static uint32_t binder_alloc_debug_mask; + +module_param_named(debug_mask, binder_alloc_debug_mask, + uint, S_IWUSR | S_IRUGO); + +#define binder_alloc_debug(mask, x...) 
\ + do { \ + if (binder_alloc_debug_mask & mask) \ + pr_info(x); \ + } while (0) + +static size_t binder_alloc_buffer_size(struct binder_alloc *alloc, + struct binder_buffer *buffer) +{ + if (list_is_last(&buffer->entry, &alloc->buffers)) + return alloc->buffer + + alloc->buffer_size - (void *)buffer->data; + return (size_t)list_entry(buffer->entry.next, + struct binder_buffer, entry) - (size_t)buffer->data; +} + +static void binder_insert_free_buffer(struct binder_alloc *alloc, + struct binder_buffer *new_buffer) +{ + struct rb_node **p = &alloc->free_buffers.rb_node; + struct rb_node *parent = NULL; + struct binder_buffer *buffer; + size_t buffer_size; + size_t new_buffer_size; + + BUG_ON(!new_buffer->free); + + new_buffer_size = binder_alloc_buffer_size(alloc, new_buffer); + + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: add free buffer, size %zd, at %pK\n", + alloc->pid, new_buffer_size, new_buffer); + + while (*p) { + parent = *p; + buffer = rb_entry(parent, struct binder_buffer, rb_node); + BUG_ON(!buffer->free); + + buffer_size = binder_alloc_buffer_size(alloc, buffer); + + if (new_buffer_size < buffer_size) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&new_buffer->rb_node, parent, p); + rb_insert_color(&new_buffer->rb_node, &alloc->free_buffers); +} + +static void binder_insert_allocated_buffer_locked( + struct binder_alloc *alloc, struct binder_buffer *new_buffer) +{ + struct rb_node **p = &alloc->allocated_buffers.rb_node; + struct rb_node *parent = NULL; + struct binder_buffer *buffer; + + BUG_ON(new_buffer->free); + + while (*p) { + parent = *p; + buffer = rb_entry(parent, struct binder_buffer, rb_node); + BUG_ON(buffer->free); + + if (new_buffer < buffer) + p = &parent->rb_left; + else if (new_buffer > buffer) + p = &parent->rb_right; + else + BUG(); + } + rb_link_node(&new_buffer->rb_node, parent, p); + rb_insert_color(&new_buffer->rb_node, &alloc->allocated_buffers); +} + +static struct binder_buffer *binder_alloc_buffer_lookup_locked( + struct binder_alloc *alloc, + uintptr_t user_ptr) +{ + struct rb_node *n = alloc->allocated_buffers.rb_node; + struct binder_buffer *buffer; + struct binder_buffer *kern_ptr; + + kern_ptr = (struct binder_buffer *)(user_ptr - alloc->user_buffer_offset + - offsetof(struct binder_buffer, data)); + + while (n) { + buffer = rb_entry(n, struct binder_buffer, rb_node); + BUG_ON(buffer->free); + + if (kern_ptr < buffer) + n = n->rb_left; + else if (kern_ptr > buffer) + n = n->rb_right; + else + return buffer; + } + return NULL; +} + +/** + * binder_alloc_buffer_lookup() - get buffer given user ptr + * @alloc: binder_alloc for this proc + * @user_ptr: User pointer to buffer data + * + * Validate userspace pointer to buffer data and return buffer corresponding to + * that user pointer. Search the rb tree for buffer that matches user data + * pointer. + * + * Return: Pointer to buffer or NULL + */ +struct binder_buffer *binder_alloc_buffer_lookup(struct binder_alloc *alloc, + uintptr_t user_ptr) +{ + struct binder_buffer *buffer; + + mutex_lock(&alloc->mutex); + buffer = binder_alloc_buffer_lookup_locked(alloc, user_ptr); + mutex_unlock(&alloc->mutex); + return buffer; +} + +static int binder_update_page_range(struct binder_alloc *alloc, int allocate, + void *start, void *end, + struct vm_area_struct *vma) +{ + void *page_addr; + unsigned long user_page_addr; + struct page **page; + struct mm_struct *mm; + + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: %s pages %pK-%pK\n", alloc->pid, + allocate ? 
"allocate" : "free", start, end); + + if (end <= start) + return 0; + + trace_binder_update_page_range(alloc, allocate, start, end); + + if (vma) + mm = NULL; + else + mm = get_task_mm(alloc->tsk); + + if (mm) { + down_write(&mm->mmap_sem); + vma = alloc->vma; + if (vma && mm != alloc->vma_vm_mm) { + pr_err("%d: vma mm and task mm mismatch\n", + alloc->pid); + vma = NULL; + } + } + + if (allocate == 0) + goto free_range; + + if (vma == NULL) { + pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n", + alloc->pid); + goto err_no_vma; + } + + for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { + int ret; + + page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; + + BUG_ON(*page); + *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); + if (*page == NULL) { + pr_err("%d: binder_alloc_buf failed for page at %pK\n", + alloc->pid, page_addr); + goto err_alloc_page_failed; + } + ret = map_kernel_range_noflush((unsigned long)page_addr, + PAGE_SIZE, PAGE_KERNEL, page); + flush_cache_vmap((unsigned long)page_addr, + (unsigned long)page_addr + PAGE_SIZE); + if (ret != 1) { + pr_err("%d: binder_alloc_buf failed to map page at %pK in kernel\n", + alloc->pid, page_addr); + goto err_map_kernel_failed; + } + user_page_addr = + (uintptr_t)page_addr + alloc->user_buffer_offset; + ret = vm_insert_page(vma, user_page_addr, page[0]); + if (ret) { + pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n", + alloc->pid, user_page_addr); + goto err_vm_insert_page_failed; + } + /* vm_insert_page does not seem to increment the refcount */ + } + if (mm) { + up_write(&mm->mmap_sem); + mmput(mm); + } + return 0; + +free_range: + for (page_addr = end - PAGE_SIZE; page_addr >= start; + page_addr -= PAGE_SIZE) { + page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; + if (vma) + zap_page_range(vma, (uintptr_t)page_addr + + alloc->user_buffer_offset, PAGE_SIZE, NULL); +err_vm_insert_page_failed: + unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); +err_map_kernel_failed: + __free_page(*page); + *page = NULL; +err_alloc_page_failed: + ; + } +err_no_vma: + if (mm) { + up_write(&mm->mmap_sem); + mmput(mm); + } + return -ENOMEM; +} + +struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, + size_t data_size, + size_t offsets_size, + size_t extra_buffers_size, + int is_async) +{ + struct rb_node *n = alloc->free_buffers.rb_node; + struct binder_buffer *buffer; + size_t buffer_size; + struct rb_node *best_fit = NULL; + void *has_page_addr; + void *end_page_addr; + size_t size, data_offsets_size; + + if (alloc->vma == NULL) { + pr_err("%d: binder_alloc_buf, no vma\n", + alloc->pid); + return NULL; + } + + data_offsets_size = ALIGN(data_size, sizeof(void *)) + + ALIGN(offsets_size, sizeof(void *)); + + if (data_offsets_size < data_size || data_offsets_size < offsets_size) { + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: got transaction with invalid size %zd-%zd\n", + alloc->pid, data_size, offsets_size); + return NULL; + } + size = data_offsets_size + ALIGN(extra_buffers_size, sizeof(void *)); + if (size < data_offsets_size || size < extra_buffers_size) { + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: got transaction with invalid extra_buffers_size %zd\n", + alloc->pid, extra_buffers_size); + return NULL; + } + if (is_async && + alloc->free_async_space < size + sizeof(struct binder_buffer)) { + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: binder_alloc_buf size %zd failed, no async space left\n", + 
alloc->pid, size); + return NULL; + } + + while (n) { + buffer = rb_entry(n, struct binder_buffer, rb_node); + BUG_ON(!buffer->free); + buffer_size = binder_alloc_buffer_size(alloc, buffer); + + if (size < buffer_size) { + best_fit = n; + n = n->rb_left; + } else if (size > buffer_size) + n = n->rb_right; + else { + best_fit = n; + break; + } + } + if (best_fit == NULL) { + pr_err("%d: binder_alloc_buf size %zd failed, no address space\n", + alloc->pid, size); + return NULL; + } + if (n == NULL) { + buffer = rb_entry(best_fit, struct binder_buffer, rb_node); + buffer_size = binder_alloc_buffer_size(alloc, buffer); + } + + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: binder_alloc_buf size %zd got buffer %pK size %zd\n", + alloc->pid, size, buffer, buffer_size); + + has_page_addr = + (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK); + if (n == NULL) { + if (size + sizeof(struct binder_buffer) + 4 >= buffer_size) + buffer_size = size; /* no room for other buffers */ + else + buffer_size = size + sizeof(struct binder_buffer); + } + end_page_addr = + (void *)PAGE_ALIGN((uintptr_t)buffer->data + buffer_size); + if (end_page_addr > has_page_addr) + end_page_addr = has_page_addr; + if (binder_update_page_range(alloc, 1, + (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL)) + return NULL; + + rb_erase(best_fit, &alloc->free_buffers); + buffer->free = 0; + binder_insert_allocated_buffer_locked(alloc, buffer); + if (buffer_size != size) { + struct binder_buffer *new_buffer = (void *)buffer->data + size; + + list_add(&new_buffer->entry, &buffer->entry); + new_buffer->free = 1; + binder_insert_free_buffer(alloc, new_buffer); + } + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: binder_alloc_buf size %zd got %pK\n", + alloc->pid, size, buffer); + buffer->data_size = data_size; + buffer->offsets_size = offsets_size; + buffer->async_transaction = is_async; + buffer->extra_buffers_size = extra_buffers_size; + if (is_async) { + alloc->free_async_space -= size + sizeof(struct binder_buffer); + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, + "%d: binder_alloc_buf size %zd async free %zd\n", + alloc->pid, size, alloc->free_async_space); + } + return buffer; +} + +/** + * binder_alloc_new_buf() - Allocate a new binder buffer + * @alloc: binder_alloc for this proc + * @data_size: size of user data buffer + * @offsets_size: user specified buffer offset + * @extra_buffers_size: size of extra space for meta-data (eg, security context) + * @is_async: buffer for async transaction + * + * Allocate a new buffer given the requested sizes. Returns + * the kernel version of the buffer pointer. 
The size allocated + * is the sum of the three given sizes (each rounded up to + * pointer-sized boundary) + * + * Return: The allocated buffer or %NULL if error + */ +struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, + size_t data_size, + size_t offsets_size, + size_t extra_buffers_size, + int is_async) +{ + struct binder_buffer *buffer; + + mutex_lock(&alloc->mutex); + buffer = binder_alloc_new_buf_locked(alloc, data_size, offsets_size, + extra_buffers_size, is_async); + mutex_unlock(&alloc->mutex); + return buffer; +} + +static void *buffer_start_page(struct binder_buffer *buffer) +{ + return (void *)((uintptr_t)buffer & PAGE_MASK); +} + +static void *buffer_end_page(struct binder_buffer *buffer) +{ + return (void *)(((uintptr_t)(buffer + 1) - 1) & PAGE_MASK); +} + +static void binder_delete_free_buffer(struct binder_alloc *alloc, + struct binder_buffer *buffer) +{ + struct binder_buffer *prev, *next = NULL; + int free_page_end = 1; + int free_page_start = 1; + + BUG_ON(alloc->buffers.next == &buffer->entry); + prev = list_entry(buffer->entry.prev, struct binder_buffer, entry); + BUG_ON(!prev->free); + if (buffer_end_page(prev) == buffer_start_page(buffer)) { + free_page_start = 0; + if (buffer_end_page(prev) == buffer_end_page(buffer)) + free_page_end = 0; + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %pK share page with %pK\n", + alloc->pid, buffer, prev); + } + + if (!list_is_last(&buffer->entry, &alloc->buffers)) { + next = list_entry(buffer->entry.next, + struct binder_buffer, entry); + if (buffer_start_page(next) == buffer_end_page(buffer)) { + free_page_end = 0; + if (buffer_start_page(next) == + buffer_start_page(buffer)) + free_page_start = 0; + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %pK share page with %pK\n", + alloc->pid, buffer, prev); + } + } + list_del(&buffer->entry); + if (free_page_start || free_page_end) { + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %pK do not share page%s%s with %pK or %pK\n", + alloc->pid, buffer, free_page_start ? "" : " end", + free_page_end ? "" : " start", prev, next); + binder_update_page_range(alloc, 0, free_page_start ? + buffer_start_page(buffer) : buffer_end_page(buffer), + (free_page_end ? 
buffer_end_page(buffer) : + buffer_start_page(buffer)) + PAGE_SIZE, NULL); + } +} + +static void binder_free_buf_locked(struct binder_alloc *alloc, + struct binder_buffer *buffer) +{ + size_t size, buffer_size; + + buffer_size = binder_alloc_buffer_size(alloc, buffer); + + size = ALIGN(buffer->data_size, sizeof(void *)) + + ALIGN(buffer->offsets_size, sizeof(void *)) + + ALIGN(buffer->extra_buffers_size, sizeof(void *)); + + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: binder_free_buf %pK size %zd buffer_size %zd\n", + alloc->pid, buffer, size, buffer_size); + + BUG_ON(buffer->free); + BUG_ON(size > buffer_size); + BUG_ON(buffer->transaction != NULL); + BUG_ON((void *)buffer < alloc->buffer); + BUG_ON((void *)buffer > alloc->buffer + alloc->buffer_size); + + if (buffer->async_transaction) { + alloc->free_async_space += size + sizeof(struct binder_buffer); + + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, + "%d: binder_free_buf size %zd async free %zd\n", + alloc->pid, size, alloc->free_async_space); + } + + binder_update_page_range(alloc, 0, + (void *)PAGE_ALIGN((uintptr_t)buffer->data), + (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK), + NULL); + + rb_erase(&buffer->rb_node, &alloc->allocated_buffers); + buffer->free = 1; + if (!list_is_last(&buffer->entry, &alloc->buffers)) { + struct binder_buffer *next = list_entry(buffer->entry.next, + struct binder_buffer, entry); + + if (next->free) { + rb_erase(&next->rb_node, &alloc->free_buffers); + binder_delete_free_buffer(alloc, next); + } + } + if (alloc->buffers.next != &buffer->entry) { + struct binder_buffer *prev = list_entry(buffer->entry.prev, + struct binder_buffer, entry); + + if (prev->free) { + binder_delete_free_buffer(alloc, buffer); + rb_erase(&prev->rb_node, &alloc->free_buffers); + buffer = prev; + } + } + binder_insert_free_buffer(alloc, buffer); +} + +/** + * binder_alloc_free_buf() - free a binder buffer + * @alloc: binder_alloc for this proc + * @buffer: kernel pointer to buffer + * + * Free the buffer allocated via binder_alloc_new_buffer() + */ +void binder_alloc_free_buf(struct binder_alloc *alloc, + struct binder_buffer *buffer) +{ + mutex_lock(&alloc->mutex); + binder_free_buf_locked(alloc, buffer); + mutex_unlock(&alloc->mutex); +} + +/** + * binder_alloc_mmap_handler() - map virtual address space for proc + * @alloc: alloc structure for this proc + * @vma: vma passed to mmap() + * + * Called by binder_mmap() to initialize the space specified in + * vma for allocating binder buffers + * + * Return: + * 0 = success + * -EBUSY = address space already mapped + * -ENOMEM = failed to map memory to given address space + */ +int binder_alloc_mmap_handler(struct binder_alloc *alloc, + struct vm_area_struct *vma) +{ + int ret; + struct vm_struct *area; + const char *failure_string; + struct binder_buffer *buffer; + + mutex_lock(&binder_alloc_mmap_lock); + if (alloc->buffer) { + ret = -EBUSY; + failure_string = "already mapped"; + goto err_already_mapped; + } + + area = get_vm_area(vma->vm_end - vma->vm_start, VM_IOREMAP); + if (area == NULL) { + ret = -ENOMEM; + failure_string = "get_vm_area"; + goto err_get_vm_area_failed; + } + alloc->buffer = area->addr; + alloc->user_buffer_offset = + vma->vm_start - (uintptr_t)alloc->buffer; + mutex_unlock(&binder_alloc_mmap_lock); + +#ifdef CONFIG_CPU_CACHE_VIPT + if (cache_is_vipt_aliasing()) { + while (CACHE_COLOUR( + (vma->vm_start ^ (uint32_t)alloc->buffer))) { + pr_info("binder_mmap: %d %lx-%lx maps %pK bad alignment\n", + alloc->pid, vma->vm_start, 
vma->vm_end, + alloc->buffer); + vma->vm_start += PAGE_SIZE; + } + } +#endif + alloc->pages = kzalloc(sizeof(alloc->pages[0]) * + ((vma->vm_end - vma->vm_start) / PAGE_SIZE), + GFP_KERNEL); + if (alloc->pages == NULL) { + ret = -ENOMEM; + failure_string = "alloc page array"; + goto err_alloc_pages_failed; + } + alloc->buffer_size = vma->vm_end - vma->vm_start; + + if (binder_update_page_range(alloc, 1, alloc->buffer, + alloc->buffer + PAGE_SIZE, vma)) { + ret = -ENOMEM; + failure_string = "alloc small buf"; + goto err_alloc_small_buf_failed; + } + buffer = alloc->buffer; + INIT_LIST_HEAD(&alloc->buffers); + list_add(&buffer->entry, &alloc->buffers); + buffer->free = 1; + binder_insert_free_buffer(alloc, buffer); + alloc->free_async_space = alloc->buffer_size / 2; + barrier(); + alloc->vma = vma; + alloc->vma_vm_mm = vma->vm_mm; + + return 0; + +err_alloc_small_buf_failed: + kfree(alloc->pages); + alloc->pages = NULL; +err_alloc_pages_failed: + mutex_lock(&binder_alloc_mmap_lock); + vfree(alloc->buffer); + alloc->buffer = NULL; +err_get_vm_area_failed: +err_already_mapped: + mutex_unlock(&binder_alloc_mmap_lock); + pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, + alloc->pid, vma->vm_start, vma->vm_end, failure_string, ret); + return ret; +} + + +void binder_alloc_deferred_release(struct binder_alloc *alloc) +{ + struct rb_node *n; + int buffers, page_count; + + BUG_ON(alloc->vma); + + buffers = 0; + mutex_lock(&alloc->mutex); + while ((n = rb_first(&alloc->allocated_buffers))) { + struct binder_buffer *buffer; + + buffer = rb_entry(n, struct binder_buffer, rb_node); + + /* Transaction should already have been freed */ + BUG_ON(buffer->transaction); + + binder_free_buf_locked(alloc, buffer); + buffers++; + } + + page_count = 0; + if (alloc->pages) { + int i; + + for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { + void *page_addr; + + if (!alloc->pages[i]) + continue; + + page_addr = alloc->buffer + i * PAGE_SIZE; + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%s: %d: page %d at %pK not freed\n", + __func__, alloc->pid, i, page_addr); + unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); + __free_page(alloc->pages[i]); + page_count++; + } + kfree(alloc->pages); + vfree(alloc->buffer); + } + mutex_unlock(&alloc->mutex); + + binder_alloc_debug(BINDER_DEBUG_OPEN_CLOSE, + "%s: %d buffers %d, pages %d\n", + __func__, alloc->pid, buffers, page_count); +} + +static void print_binder_buffer(struct seq_file *m, const char *prefix, + struct binder_buffer *buffer) +{ + seq_printf(m, "%s %d: %pK size %zd:%zd %s\n", + prefix, buffer->debug_id, buffer->data, + buffer->data_size, buffer->offsets_size, + buffer->transaction ? 
"active" : "delivered"); +} + +/** + * binder_alloc_print_allocated() - print buffer info + * @m: seq_file for output via seq_printf() + * @alloc: binder_alloc for this proc + * + * Prints information about every buffer associated with + * the binder_alloc state to the given seq_file + */ +void binder_alloc_print_allocated(struct seq_file *m, + struct binder_alloc *alloc) +{ + struct rb_node *n; + + mutex_lock(&alloc->mutex); + for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n)) + print_binder_buffer(m, " buffer", + rb_entry(n, struct binder_buffer, rb_node)); + mutex_unlock(&alloc->mutex); +} + +/** + * binder_alloc_get_allocated_count() - return count of buffers + * @alloc: binder_alloc for this proc + * + * Return: count of allocated buffers + */ +int binder_alloc_get_allocated_count(struct binder_alloc *alloc) +{ + struct rb_node *n; + int count = 0; + + mutex_lock(&alloc->mutex); + for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n)) + count++; + mutex_unlock(&alloc->mutex); + return count; +} + + +/** + * binder_alloc_vma_close() - invalidate address space + * @alloc: binder_alloc for this proc + * + * Called from binder_vma_close() when releasing address space. + * Clears alloc->vma to prevent new incoming transactions from + * allocating more buffers. + */ +void binder_alloc_vma_close(struct binder_alloc *alloc) +{ + WRITE_ONCE(alloc->vma, NULL); + WRITE_ONCE(alloc->vma_vm_mm, NULL); +} + +/** + * binder_alloc_init() - called by binder_open() for per-proc initialization + * @alloc: binder_alloc for this proc + * + * Called from binder_open() to initialize binder_alloc fields for + * new binder proc + */ +void binder_alloc_init(struct binder_alloc *alloc) +{ + alloc->tsk = current->group_leader; + alloc->pid = current->group_leader->pid; + mutex_init(&alloc->mutex); +} + diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h new file mode 100644 index 000000000000..721c511431f9 --- /dev/null +++ b/drivers/android/binder_alloc.h @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2017 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ */
+
+#ifndef _LINUX_BINDER_ALLOC_H
+#define _LINUX_BINDER_ALLOC_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct binder_transaction;
+
+/**
+ * struct binder_buffer - buffer used for binder transactions
+ * @entry:              entry in alloc->buffers
+ * @rb_node:            node for allocated_buffers/free_buffers rb trees
+ * @free:               true if buffer is free
+ * @allow_user_free:    true if userspace may free the buffer via
+ *                      BC_FREE_BUFFER
+ * @async_transaction:  true if buffer is in use for an async transaction
+ * @debug_id:           unique ID for debugging
+ * @transaction:        pointer to the associated struct binder_transaction
+ * @target_node:        struct binder_node associated with this buffer
+ * @data_size:          size of @data area
+ * @offsets_size:       size of the array of offsets into @data
+ * @extra_buffers_size: size of space for other objects (such as sg lists)
+ * @data:               buffer data (flexible array at end of struct)
+ *
+ * Bookkeeping structure for binder transaction buffers
+ */
+struct binder_buffer {
+	struct list_head entry; /* free and allocated entries by address */
+	struct rb_node rb_node; /* free entry by size or allocated entry */
+				/* by address */
+	unsigned free:1;
+	unsigned allow_user_free:1;
+	unsigned async_transaction:1;
+	unsigned debug_id:29;
+
+	struct binder_transaction *transaction;
+
+	struct binder_node *target_node;
+	size_t data_size;
+	size_t offsets_size;
+	size_t extra_buffers_size;
+	uint8_t data[0];
+};
+
+/**
+ * struct binder_alloc - per-binder proc state for binder allocator
+ * @vma:                vm_area_struct passed to mmap_handler
+ *                      (invariant after mmap)
+ * @tsk:                task_struct for the task that called init for this
+ *                      proc (invariant after init)
+ * @vma_vm_mm:          copy of vma->vm_mm (invariant after mmap)
+ * @buffer:             base of per-proc address space mapped via mmap
+ * @user_buffer_offset: offset between user and kernel VAs for buffer
+ * @buffers:            list of all buffers for this proc
+ * @free_buffers:       rb tree of buffers available for allocation
+ *                      sorted by size
+ * @allocated_buffers:  rb tree of allocated buffers sorted by address
+ * @free_async_space:   VA space available for async buffers. This is
+ *                      initialized at mmap time to 1/2 the full VA space
+ * @pages:              array of physical page addresses for each
+ *                      page of mmap'd space
+ * @buffer_size:        size of address space specified via mmap
+ * @pid:                pid for associated binder_proc (invariant after init)
+ *
+ * Bookkeeping structure for per-proc address space management for binder
+ * buffers. It is normally initialized during binder_init() and binder_mmap()
+ * calls.
+ * The address space is used for both user-visible buffers and for
+ * struct binder_buffer objects used to track the user buffers
+ */
+struct binder_alloc {
+	struct mutex mutex;
+	struct task_struct *tsk;
+	struct vm_area_struct *vma;
+	struct mm_struct *vma_vm_mm;
+	void *buffer;
+	ptrdiff_t user_buffer_offset;
+	struct list_head buffers;
+	struct rb_root free_buffers;
+	struct rb_root allocated_buffers;
+	size_t free_async_space;
+	struct page **pages;
+	size_t buffer_size;
+	uint32_t buffer_free;
+	int pid;
+};
+
+extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc,
+						  size_t data_size,
+						  size_t offsets_size,
+						  size_t extra_buffers_size,
+						  int is_async);
+extern void binder_alloc_init(struct binder_alloc *alloc);
+extern void binder_alloc_vma_close(struct binder_alloc *alloc);
+extern struct binder_buffer *
+binder_alloc_buffer_lookup(struct binder_alloc *alloc,
+			   uintptr_t user_ptr);
+extern void binder_alloc_free_buf(struct binder_alloc *alloc,
+				  struct binder_buffer *buffer);
+extern int binder_alloc_mmap_handler(struct binder_alloc *alloc,
+				     struct vm_area_struct *vma);
+extern void binder_alloc_deferred_release(struct binder_alloc *alloc);
+extern int binder_alloc_get_allocated_count(struct binder_alloc *alloc);
+extern void binder_alloc_print_allocated(struct seq_file *m,
+					 struct binder_alloc *alloc);
+
+/**
+ * binder_alloc_get_free_async_space() - get free space available for async
+ * @alloc:	binder_alloc for this proc
+ *
+ * Return: the bytes remaining in the address space for async transactions
+ */
+static inline size_t
+binder_alloc_get_free_async_space(struct binder_alloc *alloc)
+{
+	size_t free_async_space;
+
+	mutex_lock(&alloc->mutex);
+	free_async_space = alloc->free_async_space;
+	mutex_unlock(&alloc->mutex);
+	return free_async_space;
+}
+
+/**
+ * binder_alloc_get_user_buffer_offset() - get offset between kernel/user addrs
+ * @alloc:	binder_alloc for this proc
+ *
+ * Return: the offset between kernel and user-space addresses to use for
+ * virtual address conversion
+ */
+static inline ptrdiff_t
+binder_alloc_get_user_buffer_offset(struct binder_alloc *alloc)
+{
+	/*
+	 * user_buffer_offset is constant if vma is set and
+	 * undefined if vma is not set. It is possible to
+	 * get here with !alloc->vma if the target process
+	 * is dying while a transaction is being initiated.
+	 * Returning the old value is ok in this case and
+	 * the transaction will fail.
+	 */
+	return alloc->user_buffer_offset;
+}
+
+#endif /* _LINUX_BINDER_ALLOC_H */

From a19f3efc4f54593d9e17f3fe7afc1ea78e0ae5f0 Mon Sep 17 00:00:00 2001
From: Todd Kjos
Date: Wed, 24 May 2017 11:53:13 -0700
Subject: [PATCH 0905/3220] FROMLIST: binder: remove binder_debug_no_lock
 mechanism

(from https://patchwork.kernel.org/patch/9817811/)

With the global lock, there was a mechanism to access binder driver
debugging information with the global lock disabled to debug deadlocks
or other issues. This mechanism is rarely (if ever) used anymore and
wasn't needed during the development of fine-grained locking in the
binder driver. Removing it.
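For context, every debugfs show function currently open-codes the
pattern below (a sketch reconstructed from the diff that follows, using
binder_state_show() as the example); the patch replaces each instance
with an unconditional binder_lock()/binder_unlock() pair:

	static int binder_state_show(struct seq_file *m, void *unused)
	{
		struct binder_proc *proc;
		struct binder_node *node;
		/* skip locking when the proc_no_lock module param is set */
		int do_lock = !binder_debug_no_lock;

		if (do_lock)
			binder_lock(__func__);

		/* ... print dead nodes and per-proc state ... */

		if (do_lock)
			binder_unlock(__func__);
		return 0;
	}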
Change-Id: Ie1cf45748cefa433607a97c2ef322e7906efe0f7 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 3251b7cdd717..9fdf1ed9d7f6 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -107,9 +107,6 @@ static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR | BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION; module_param_named(debug_mask, binder_debug_mask, uint, S_IWUSR | S_IRUGO); -static bool binder_debug_no_lock; -module_param_named(proc_no_lock, binder_debug_no_lock, bool, S_IWUSR | S_IRUGO); - static char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; module_param_named(devices, binder_devices_param, charp, S_IRUGO); @@ -3508,10 +3505,8 @@ static int binder_state_show(struct seq_file *m, void *unused) { struct binder_proc *proc; struct binder_node *node; - int do_lock = !binder_debug_no_lock; - if (do_lock) - binder_lock(__func__); + binder_lock(__func__); seq_puts(m, "binder state:\n"); @@ -3522,18 +3517,15 @@ static int binder_state_show(struct seq_file *m, void *unused) hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, 1); - if (do_lock) - binder_unlock(__func__); + binder_unlock(__func__); return 0; } static int binder_stats_show(struct seq_file *m, void *unused) { struct binder_proc *proc; - int do_lock = !binder_debug_no_lock; - if (do_lock) - binder_lock(__func__); + binder_lock(__func__); seq_puts(m, "binder stats:\n"); @@ -3541,24 +3533,20 @@ static int binder_stats_show(struct seq_file *m, void *unused) hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc_stats(m, proc); - if (do_lock) - binder_unlock(__func__); + binder_unlock(__func__); return 0; } static int binder_transactions_show(struct seq_file *m, void *unused) { struct binder_proc *proc; - int do_lock = !binder_debug_no_lock; - if (do_lock) - binder_lock(__func__); + binder_lock(__func__); seq_puts(m, "binder transactions:\n"); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, 0); - if (do_lock) - binder_unlock(__func__); + binder_unlock(__func__); return 0; } @@ -3566,10 +3554,8 @@ static int binder_proc_show(struct seq_file *m, void *unused) { struct binder_proc *itr; int pid = (unsigned long)m->private; - int do_lock = !binder_debug_no_lock; - if (do_lock) - binder_lock(__func__); + binder_lock(__func__); hlist_for_each_entry(itr, &binder_procs, proc_node) { if (itr->pid == pid) { @@ -3577,8 +3563,7 @@ static int binder_proc_show(struct seq_file *m, void *unused) print_binder_proc(m, itr, 1); } } - if (do_lock) - binder_unlock(__func__); + binder_unlock(__func__); return 0; } From 3490fdcb77030b103e6a709a63191e09c6e64ac0 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 17 Oct 2016 12:33:15 -0700 Subject: [PATCH 0906/3220] FROMLIST: binder: add protection for non-perf cases (from https://patchwork.kernel.org/patch/9817749/) Add binder_dead_nodes_lock, binder_procs_lock, and binder_context_mgr_node_lock to protect the associated global lists Bug: 33250092 32225111 Change-Id: I9d1158536783763e0fa93b18e19fba2c488d8cfc Signed-off-by: Todd Kjos --- drivers/android/binder.c | 81 +++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 18 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 9fdf1ed9d7f6..4b9816409b54 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ 
-45,12 +45,16 @@ #include "binder_trace.h" static DEFINE_MUTEX(binder_main_lock); + +static HLIST_HEAD(binder_deferred_list); static DEFINE_MUTEX(binder_deferred_lock); static HLIST_HEAD(binder_devices); static HLIST_HEAD(binder_procs); -static HLIST_HEAD(binder_deferred_list); +static DEFINE_MUTEX(binder_procs_lock); + static HLIST_HEAD(binder_dead_nodes); +static DEFINE_SPINLOCK(binder_dead_nodes_lock); static struct dentry *binder_debugfs_dir_entry_root; static struct dentry *binder_debugfs_dir_entry_proc; @@ -219,6 +223,8 @@ static struct binder_transaction_log_entry *binder_transaction_log_add( struct binder_context { struct binder_node *binder_context_mgr_node; + struct mutex context_mgr_node_lock; + kuid_t binder_context_mgr_uid; const char *name; }; @@ -571,7 +577,9 @@ static int binder_dec_node(struct binder_node *node, int strong, int internal) "refless node %d deleted\n", node->debug_id); } else { + spin_lock(&binder_dead_nodes_lock); hlist_del(&node->dead_node); + spin_unlock(&binder_dead_nodes_lock); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "dead node %d deleted\n", node->debug_id); @@ -1455,11 +1463,14 @@ static void binder_transaction(struct binder_proc *proc, } target_node = ref->node; } else { + mutex_lock(&context->context_mgr_node_lock); target_node = context->binder_context_mgr_node; if (target_node == NULL) { return_error = BR_DEAD_REPLY; + mutex_unlock(&context->context_mgr_node_lock); goto err_no_context_mgr_node; } + mutex_unlock(&context->context_mgr_node_lock); } e->to_node = target_node->debug_id; target_proc = target_node->proc; @@ -1825,22 +1836,31 @@ static int binder_thread_write(struct binder_proc *proc, case BC_RELEASE: case BC_DECREFS: { uint32_t target; - struct binder_ref *ref; + struct binder_ref *ref = NULL; const char *debug_string; if (get_user(target, (uint32_t __user *)ptr)) return -EFAULT; + ptr += sizeof(uint32_t); - if (target == 0 && context->binder_context_mgr_node && + if (target == 0 && (cmd == BC_INCREFS || cmd == BC_ACQUIRE)) { - ref = binder_get_ref_for_node(proc, - context->binder_context_mgr_node); - if (ref->desc != target) { - binder_user_error("%d:%d tried to acquire reference to desc 0, got %d instead\n", - proc->pid, thread->pid, - ref->desc); + struct binder_node *ctx_mgr_node; + + mutex_lock(&context->context_mgr_node_lock); + ctx_mgr_node = context->binder_context_mgr_node; + if (ctx_mgr_node) { + ref = binder_get_ref_for_node(proc, + ctx_mgr_node); + if (ref && ref->desc != target) { + binder_user_error("%d:%d tried to acquire reference to desc 0, got %d instead\n", + proc->pid, thread->pid, + ref->desc); + } } - } else + mutex_unlock(&context->context_mgr_node_lock); + } + if (ref == NULL) ref = binder_get_ref(proc, target, cmd == BC_ACQUIRE || cmd == BC_RELEASE); @@ -2754,9 +2774,10 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) int ret = 0; struct binder_proc *proc = filp->private_data; struct binder_context *context = proc->context; - + struct binder_node *new_node; kuid_t curr_euid = current_euid(); + mutex_lock(&context->context_mgr_node_lock); if (context->binder_context_mgr_node) { pr_err("BINDER_SET_CONTEXT_MGR already set\n"); ret = -EBUSY; @@ -2777,16 +2798,18 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) } else { context->binder_context_mgr_uid = curr_euid; } - context->binder_context_mgr_node = binder_new_node(proc, 0, 0); - if (!context->binder_context_mgr_node) { + new_node = binder_new_node(proc, 0, 0); + if (!new_node) { ret = -ENOMEM; goto out; } - 
context->binder_context_mgr_node->local_weak_refs++; - context->binder_context_mgr_node->local_strong_refs++; - context->binder_context_mgr_node->has_strong_ref = 1; - context->binder_context_mgr_node->has_weak_ref = 1; + new_node->local_weak_refs++; + new_node->local_strong_refs++; + new_node->has_strong_ref = 1; + new_node->has_weak_ref = 1; + context->binder_context_mgr_node = new_node; out: + mutex_unlock(&context->context_mgr_node_lock); return ret; } @@ -2966,13 +2989,16 @@ static int binder_open(struct inode *nodp, struct file *filp) binder_lock(__func__); binder_stats_created(BINDER_STAT_PROC); - hlist_add_head(&proc->proc_node, &binder_procs); proc->pid = current->group_leader->pid; INIT_LIST_HEAD(&proc->delivered_death); filp->private_data = proc; binder_unlock(__func__); + mutex_lock(&binder_procs_lock); + hlist_add_head(&proc->proc_node, &binder_procs); + mutex_unlock(&binder_procs_lock); + if (binder_debugfs_dir_entry_proc) { char strbuf[11]; @@ -3051,7 +3077,10 @@ static int binder_node_release(struct binder_node *node, int refs) node->proc = NULL; node->local_strong_refs = 0; node->local_weak_refs = 0; + + spin_lock(&binder_dead_nodes_lock); hlist_add_head(&node->dead_node, &binder_dead_nodes); + spin_unlock(&binder_dead_nodes_lock); hlist_for_each_entry(ref, &node->refs, node_entry) { refs++; @@ -3085,8 +3114,11 @@ static void binder_deferred_release(struct binder_proc *proc) BUG_ON(proc->files); + mutex_lock(&binder_procs_lock); hlist_del(&proc->proc_node); + mutex_unlock(&binder_procs_lock); + mutex_lock(&context->context_mgr_node_lock); if (context->binder_context_mgr_node && context->binder_context_mgr_node->proc == proc) { binder_debug(BINDER_DEBUG_DEAD_BINDER, @@ -3094,6 +3126,7 @@ static void binder_deferred_release(struct binder_proc *proc) __func__, proc->pid); context->binder_context_mgr_node = NULL; } + mutex_unlock(&context->context_mgr_node_lock); threads = 0; active_transactions = 0; @@ -3510,13 +3543,17 @@ static int binder_state_show(struct seq_file *m, void *unused) seq_puts(m, "binder state:\n"); + spin_lock(&binder_dead_nodes_lock); if (!hlist_empty(&binder_dead_nodes)) seq_puts(m, "dead nodes:\n"); hlist_for_each_entry(node, &binder_dead_nodes, dead_node) print_binder_node(m, node); + spin_unlock(&binder_dead_nodes_lock); + mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, 1); + mutex_unlock(&binder_procs_lock); binder_unlock(__func__); return 0; } @@ -3531,8 +3568,10 @@ static int binder_stats_show(struct seq_file *m, void *unused) print_binder_stats(m, "", &binder_stats); + mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc_stats(m, proc); + mutex_unlock(&binder_procs_lock); binder_unlock(__func__); return 0; } @@ -3544,8 +3583,10 @@ static int binder_transactions_show(struct seq_file *m, void *unused) binder_lock(__func__); seq_puts(m, "binder transactions:\n"); + mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, 0); + mutex_unlock(&binder_procs_lock); binder_unlock(__func__); return 0; } @@ -3557,12 +3598,15 @@ static int binder_proc_show(struct seq_file *m, void *unused) binder_lock(__func__); + mutex_lock(&binder_procs_lock); hlist_for_each_entry(itr, &binder_procs, proc_node) { if (itr->pid == pid) { seq_puts(m, "binder proc state:\n"); print_binder_proc(m, itr, 1); } } + mutex_unlock(&binder_procs_lock); + binder_unlock(__func__); return 0; } @@ -3623,6 +3667,7 @@ static 
int __init init_binder_device(const char *name) binder_device->context.binder_context_mgr_uid = INVALID_UID; binder_device->context.name = name; + mutex_init(&binder_device->context.context_mgr_node_lock); ret = misc_register(&binder_device->miscdev); if (ret < 0) { From f716ecfc028f7733e8549cb2cb1f2851bc471253 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Thu, 13 Oct 2016 16:36:15 -0700 Subject: [PATCH 0907/3220] FROMLIST: binder: change binder_stats to atomics (from https://patchwork.kernel.org/patch/9817755/) Use atomics for stats to avoid needing to lock for increments/decrements Bug: 33250092 32225111 Change-Id: I13e69b7f0485ccf16673e25091455781e1933a98 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 48 +++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 4b9816409b54..0b6ff5f84322 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -167,22 +167,22 @@ enum binder_stat_types { }; struct binder_stats { - int br[_IOC_NR(BR_FAILED_REPLY) + 1]; - int bc[_IOC_NR(BC_REPLY_SG) + 1]; - int obj_created[BINDER_STAT_COUNT]; - int obj_deleted[BINDER_STAT_COUNT]; + atomic_t br[_IOC_NR(BR_FAILED_REPLY) + 1]; + atomic_t bc[_IOC_NR(BC_REPLY_SG) + 1]; + atomic_t obj_created[BINDER_STAT_COUNT]; + atomic_t obj_deleted[BINDER_STAT_COUNT]; }; static struct binder_stats binder_stats; static inline void binder_stats_deleted(enum binder_stat_types type) { - binder_stats.obj_deleted[type]++; + atomic_inc(&binder_stats.obj_deleted[type]); } static inline void binder_stats_created(enum binder_stat_types type) { - binder_stats.obj_created[type]++; + atomic_inc(&binder_stats.obj_created[type]); } struct binder_transaction_log_entry { @@ -1826,9 +1826,9 @@ static int binder_thread_write(struct binder_proc *proc, ptr += sizeof(uint32_t); trace_binder_command(cmd); if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.bc)) { - binder_stats.bc[_IOC_NR(cmd)]++; - proc->stats.bc[_IOC_NR(cmd)]++; - thread->stats.bc[_IOC_NR(cmd)]++; + atomic_inc(&binder_stats.bc[_IOC_NR(cmd)]); + atomic_inc(&proc->stats.bc[_IOC_NR(cmd)]); + atomic_inc(&thread->stats.bc[_IOC_NR(cmd)]); } switch (cmd) { case BC_INCREFS: @@ -2202,9 +2202,9 @@ static void binder_stat_br(struct binder_proc *proc, { trace_binder_return(cmd); if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.br)) { - binder_stats.br[_IOC_NR(cmd)]++; - proc->stats.br[_IOC_NR(cmd)]++; - thread->stats.br[_IOC_NR(cmd)]++; + atomic_inc(&binder_stats.br[_IOC_NR(cmd)]); + atomic_inc(&proc->stats.br[_IOC_NR(cmd)]); + atomic_inc(&thread->stats.br[_IOC_NR(cmd)]); } } @@ -3454,17 +3454,21 @@ static void print_binder_stats(struct seq_file *m, const char *prefix, BUILD_BUG_ON(ARRAY_SIZE(stats->bc) != ARRAY_SIZE(binder_command_strings)); for (i = 0; i < ARRAY_SIZE(stats->bc); i++) { - if (stats->bc[i]) + int temp = atomic_read(&stats->bc[i]); + + if (temp) seq_printf(m, "%s%s: %d\n", prefix, - binder_command_strings[i], stats->bc[i]); + binder_command_strings[i], temp); } BUILD_BUG_ON(ARRAY_SIZE(stats->br) != ARRAY_SIZE(binder_return_strings)); for (i = 0; i < ARRAY_SIZE(stats->br); i++) { - if (stats->br[i]) + int temp = atomic_read(&stats->br[i]); + + if (temp) seq_printf(m, "%s%s: %d\n", prefix, - binder_return_strings[i], stats->br[i]); + binder_return_strings[i], temp); } BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != @@ -3472,11 +3476,15 @@ static void print_binder_stats(struct seq_file *m, const char *prefix, BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != 
ARRAY_SIZE(stats->obj_deleted)); for (i = 0; i < ARRAY_SIZE(stats->obj_created); i++) { - if (stats->obj_created[i] || stats->obj_deleted[i]) - seq_printf(m, "%s%s: active %d total %d\n", prefix, + int created = atomic_read(&stats->obj_created[i]); + int deleted = atomic_read(&stats->obj_deleted[i]); + + if (created || deleted) + seq_printf(m, "%s%s: active %d total %d\n", + prefix, binder_objstat_strings[i], - stats->obj_created[i] - stats->obj_deleted[i], - stats->obj_created[i]); + created - deleted, + created); } } From be4dde1f05cda42d74035c50d944a6ec0ef365d7 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 25 May 2017 10:56:00 -0700 Subject: [PATCH 0908/3220] FROMLIST: binder: make binder_last_id an atomic (from https://patchwork.kernel.org/patch/9817809/) Change-Id: I12a505091d377ca9034861317b7e68c2e75f7256 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 0b6ff5f84322..edbd729f2f43 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -58,7 +58,7 @@ static DEFINE_SPINLOCK(binder_dead_nodes_lock); static struct dentry *binder_debugfs_dir_entry_root; static struct dentry *binder_debugfs_dir_entry_proc; -static int binder_last_id; +static atomic_t binder_last_id; static struct workqueue_struct *binder_deferred_workqueue; #define BINDER_DEBUG_ENTRY(name) \ @@ -496,7 +496,7 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, binder_stats_created(BINDER_STAT_NODE); rb_link_node(&node->rb_node, parent, p); rb_insert_color(&node->rb_node, &proc->nodes); - node->debug_id = ++binder_last_id; + node->debug_id = atomic_inc_return(&binder_last_id); node->proc = proc; node->ptr = ptr; node->cookie = cookie; @@ -640,7 +640,7 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, if (new_ref == NULL) return NULL; binder_stats_created(BINDER_STAT_REF); - new_ref->debug_id = ++binder_last_id; + new_ref->debug_id = atomic_inc_return(&binder_last_id); new_ref->proc = proc; new_ref->node = node; rb_link_node(&new_ref->rb_node_node, parent, p); @@ -1528,7 +1528,7 @@ static void binder_transaction(struct binder_proc *proc, } binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE); - t->debug_id = ++binder_last_id; + t->debug_id = atomic_inc_return(&binder_last_id); e->debug_id = t->debug_id; if (reply) From 0a0fdc1fdc20beec7da852d2392f3111e3555983 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 22 Mar 2017 17:19:52 -0700 Subject: [PATCH 0909/3220] FROMLIST: binder: add log information for binder transaction failures (from https://patchwork.kernel.org/patch/9817751/) Add additional information to determine the cause of binder failures. 
Adds the following to failed transaction log and kernel messages: return_error : value returned for transaction return_error_param : errno returned by binder allocator return_error_line : line number where error detected Also, return BR_DEAD_REPLY if an allocation error indicates a dead proc (-ESRCH) Bug: 36406078 Change-Id: Ifc8881fa5adfcced3f2d67f9030fbd3efa3e2cab Test: tested manually Signed-off-by: Todd Kjos --- drivers/android/binder.c | 87 ++++++++++++++++++++++++++++++---- drivers/android/binder_alloc.c | 20 ++++---- 2 files changed, 88 insertions(+), 19 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index edbd729f2f43..5990f7fbcbd3 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -196,6 +196,9 @@ struct binder_transaction_log_entry { int to_node; int data_size; int offsets_size; + int return_error_line; + uint32_t return_error; + uint32_t return_error_param; const char *context_name; }; struct binder_transaction_log { @@ -1143,7 +1146,7 @@ static int binder_translate_binder(struct flat_binder_object *fp, ref = binder_get_ref_for_node(target_proc, node); if (!ref) - return -EINVAL; + return -ENOMEM; if (fp->hdr.type == BINDER_TYPE_BINDER) fp->hdr.type = BINDER_TYPE_HANDLE; @@ -1200,7 +1203,7 @@ static int binder_translate_handle(struct flat_binder_object *fp, new_ref = binder_get_ref_for_node(target_proc, ref->node); if (!new_ref) - return -EINVAL; + return -ENOMEM; fp->binder = 0; fp->handle = new_ref->desc; @@ -1398,7 +1401,9 @@ static void binder_transaction(struct binder_proc *proc, wait_queue_head_t *target_wait; struct binder_transaction *in_reply_to = NULL; struct binder_transaction_log_entry *e; - uint32_t return_error; + uint32_t return_error = 0; + uint32_t return_error_param = 0; + uint32_t return_error_line = 0; struct binder_buffer_object *last_fixup_obj = NULL; binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; @@ -1418,6 +1423,8 @@ static void binder_transaction(struct binder_proc *proc, binder_user_error("%d:%d got reply transaction with no transaction stack\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; + return_error_param = -EPROTO; + return_error_line = __LINE__; goto err_empty_call_stack; } binder_set_nice(in_reply_to->saved_priority); @@ -1429,6 +1436,8 @@ static void binder_transaction(struct binder_proc *proc, in_reply_to->to_thread ? 
in_reply_to->to_thread->pid : 0); return_error = BR_FAILED_REPLY; + return_error_param = -EPROTO; + return_error_line = __LINE__; in_reply_to = NULL; goto err_bad_call_stack; } @@ -1436,6 +1445,7 @@ static void binder_transaction(struct binder_proc *proc, target_thread = in_reply_to->from; if (target_thread == NULL) { return_error = BR_DEAD_REPLY; + return_error_line = __LINE__; goto err_dead_binder; } if (target_thread->transaction_stack != in_reply_to) { @@ -1445,6 +1455,8 @@ static void binder_transaction(struct binder_proc *proc, target_thread->transaction_stack->debug_id : 0, in_reply_to->debug_id); return_error = BR_FAILED_REPLY; + return_error_param = -EPROTO; + return_error_line = __LINE__; in_reply_to = NULL; target_thread = NULL; goto err_dead_binder; @@ -1459,6 +1471,8 @@ static void binder_transaction(struct binder_proc *proc, binder_user_error("%d:%d got transaction to invalid handle\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; + return_error_param = -EINVAL; + return_error_line = __LINE__; goto err_invalid_target_handle; } target_node = ref->node; @@ -1468,6 +1482,7 @@ static void binder_transaction(struct binder_proc *proc, if (target_node == NULL) { return_error = BR_DEAD_REPLY; mutex_unlock(&context->context_mgr_node_lock); + return_error_line = __LINE__; goto err_no_context_mgr_node; } mutex_unlock(&context->context_mgr_node_lock); @@ -1476,11 +1491,14 @@ static void binder_transaction(struct binder_proc *proc, target_proc = target_node->proc; if (target_proc == NULL) { return_error = BR_DEAD_REPLY; + return_error_line = __LINE__; goto err_dead_binder; } if (security_binder_transaction(proc->tsk, target_proc->tsk) < 0) { return_error = BR_FAILED_REPLY; + return_error_param = -EPERM; + return_error_line = __LINE__; goto err_invalid_target_handle; } if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) { @@ -1494,6 +1512,8 @@ static void binder_transaction(struct binder_proc *proc, tmp->to_thread ? tmp->to_thread->pid : 0); return_error = BR_FAILED_REPLY; + return_error_param = -EPROTO; + return_error_line = __LINE__; goto err_bad_call_stack; } while (tmp) { @@ -1517,6 +1537,8 @@ static void binder_transaction(struct binder_proc *proc, t = kzalloc(sizeof(*t), GFP_KERNEL); if (t == NULL) { return_error = BR_FAILED_REPLY; + return_error_param = -ENOMEM; + return_error_line = __LINE__; goto err_alloc_t_failed; } binder_stats_created(BINDER_STAT_TRANSACTION); @@ -1524,6 +1546,8 @@ static void binder_transaction(struct binder_proc *proc, tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL); if (tcomplete == NULL) { return_error = BR_FAILED_REPLY; + return_error_param = -ENOMEM; + return_error_line = __LINE__; goto err_alloc_tcomplete_failed; } binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE); @@ -1566,8 +1590,15 @@ static void binder_transaction(struct binder_proc *proc, t->buffer = binder_alloc_new_buf(&target_proc->alloc, tr->data_size, tr->offsets_size, extra_buffers_size, !reply && (t->flags & TF_ONE_WAY)); - if (t->buffer == NULL) { - return_error = BR_FAILED_REPLY; + if (IS_ERR(t->buffer)) { + /* + * -ESRCH indicates VMA cleared. The target is dying. + */ + return_error_param = PTR_ERR(t->buffer); + return_error = return_error_param == -ESRCH ? 
+ BR_DEAD_REPLY : BR_FAILED_REPLY; + return_error_line = __LINE__; + t->buffer = NULL; goto err_binder_alloc_buf_failed; } t->buffer->allow_user_free = 0; @@ -1587,6 +1618,8 @@ static void binder_transaction(struct binder_proc *proc, binder_user_error("%d:%d got transaction with invalid data ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; + return_error_param = -EFAULT; + return_error_line = __LINE__; goto err_copy_data_failed; } if (copy_from_user(offp, (const void __user *)(uintptr_t) @@ -1594,12 +1627,16 @@ static void binder_transaction(struct binder_proc *proc, binder_user_error("%d:%d got transaction with invalid offsets ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; + return_error_param = -EFAULT; + return_error_line = __LINE__; goto err_copy_data_failed; } if (!IS_ALIGNED(tr->offsets_size, sizeof(binder_size_t))) { binder_user_error("%d:%d got transaction with invalid offsets size, %lld\n", proc->pid, thread->pid, (u64)tr->offsets_size); return_error = BR_FAILED_REPLY; + return_error_param = -EINVAL; + return_error_line = __LINE__; goto err_bad_offset; } if (!IS_ALIGNED(extra_buffers_size, sizeof(u64))) { @@ -1607,6 +1644,8 @@ static void binder_transaction(struct binder_proc *proc, proc->pid, thread->pid, (u64)extra_buffers_size); return_error = BR_FAILED_REPLY; + return_error_param = -EINVAL; + return_error_line = __LINE__; goto err_bad_offset; } off_end = (void *)off_start + tr->offsets_size; @@ -1623,6 +1662,8 @@ static void binder_transaction(struct binder_proc *proc, (u64)off_min, (u64)t->buffer->data_size); return_error = BR_FAILED_REPLY; + return_error_param = -EINVAL; + return_error_line = __LINE__; goto err_bad_offset; } @@ -1637,6 +1678,8 @@ static void binder_transaction(struct binder_proc *proc, ret = binder_translate_binder(fp, t, thread); if (ret < 0) { return_error = BR_FAILED_REPLY; + return_error_param = ret; + return_error_line = __LINE__; goto err_translate_failed; } } break; @@ -1648,6 +1691,8 @@ static void binder_transaction(struct binder_proc *proc, ret = binder_translate_handle(fp, t, thread); if (ret < 0) { return_error = BR_FAILED_REPLY; + return_error_param = ret; + return_error_line = __LINE__; goto err_translate_failed; } } break; @@ -1659,6 +1704,8 @@ static void binder_transaction(struct binder_proc *proc, if (target_fd < 0) { return_error = BR_FAILED_REPLY; + return_error_param = target_fd; + return_error_line = __LINE__; goto err_translate_failed; } fp->pad_binder = 0; @@ -1675,6 +1722,8 @@ static void binder_transaction(struct binder_proc *proc, binder_user_error("%d:%d got transaction with invalid parent offset or type\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; + return_error_param = -EINVAL; + return_error_line = __LINE__; goto err_bad_parent; } if (!binder_validate_fixup(t->buffer, off_start, @@ -1684,12 +1733,16 @@ static void binder_transaction(struct binder_proc *proc, binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; + return_error_param = -EINVAL; + return_error_line = __LINE__; goto err_bad_parent; } ret = binder_translate_fd_array(fda, parent, t, thread, in_reply_to); if (ret < 0) { return_error = BR_FAILED_REPLY; + return_error_param = ret; + return_error_line = __LINE__; goto err_translate_failed; } last_fixup_obj = parent; @@ -1705,6 +1758,8 @@ static void binder_transaction(struct binder_proc *proc, binder_user_error("%d:%d got transaction with too large buffer\n", proc->pid, thread->pid); 
return_error = BR_FAILED_REPLY; + return_error_param = -EINVAL; + return_error_line = __LINE__; goto err_bad_offset; } if (copy_from_user(sg_bufp, @@ -1712,7 +1767,9 @@ static void binder_transaction(struct binder_proc *proc, bp->buffer, bp->length)) { binder_user_error("%d:%d got transaction with invalid offsets ptr\n", proc->pid, thread->pid); + return_error_param = -EFAULT; return_error = BR_FAILED_REPLY; + return_error_line = __LINE__; goto err_copy_data_failed; } /* Fixup buffer pointer to target proc address space */ @@ -1727,6 +1784,8 @@ static void binder_transaction(struct binder_proc *proc, last_fixup_min_off); if (ret < 0) { return_error = BR_FAILED_REPLY; + return_error_param = ret; + return_error_line = __LINE__; goto err_translate_failed; } last_fixup_obj = bp; @@ -1736,6 +1795,8 @@ static void binder_transaction(struct binder_proc *proc, binder_user_error("%d:%d got transaction with invalid object type, %x\n", proc->pid, thread->pid, hdr->type); return_error = BR_FAILED_REPLY; + return_error_param = -EINVAL; + return_error_line = __LINE__; goto err_bad_object_type; } } @@ -1790,13 +1851,17 @@ err_dead_binder: err_invalid_target_handle: err_no_context_mgr_node: binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, - "%d:%d transaction failed %d, size %lld-%lld\n", - proc->pid, thread->pid, return_error, - (u64)tr->data_size, (u64)tr->offsets_size); + "%d:%d transaction failed %d/%d, size %lld-%lld line %d\n", + proc->pid, thread->pid, return_error, return_error_param, + (u64)tr->data_size, (u64)tr->offsets_size, + return_error_line); { struct binder_transaction_log_entry *fe; + e->return_error = return_error; + e->return_error_param = return_error_param; + e->return_error_line = return_error_line; fe = binder_transaction_log_add(&binder_transaction_log_failed); *fe = *e; } @@ -3623,11 +3688,13 @@ static void print_binder_transaction_log_entry(struct seq_file *m, struct binder_transaction_log_entry *e) { seq_printf(m, - "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d\n", + "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d ret %d/%d l=%d\n", e->debug_id, (e->call_type == 2) ? "reply" : ((e->call_type == 1) ? "async" : "call "), e->from_proc, e->from_thread, e->to_proc, e->to_thread, e->context_name, - e->to_node, e->target_handle, e->data_size, e->offsets_size); + e->to_node, e->target_handle, e->data_size, e->offsets_size, + e->return_error, e->return_error_param, + e->return_error_line); } static int binder_transaction_log_show(struct seq_file *m, void *unused) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index eaef8f8fb859..16bc8a3b53f5 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -262,7 +262,7 @@ err_no_vma: up_write(&mm->mmap_sem); mmput(mm); } - return -ENOMEM; + return vma ? 
-ENOMEM : -ESRCH; } struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, @@ -278,11 +278,12 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, void *has_page_addr; void *end_page_addr; size_t size, data_offsets_size; + int ret; if (alloc->vma == NULL) { pr_err("%d: binder_alloc_buf, no vma\n", alloc->pid); - return NULL; + return ERR_PTR(-ESRCH); } data_offsets_size = ALIGN(data_size, sizeof(void *)) + @@ -292,21 +293,21 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: got transaction with invalid size %zd-%zd\n", alloc->pid, data_size, offsets_size); - return NULL; + return ERR_PTR(-EINVAL); } size = data_offsets_size + ALIGN(extra_buffers_size, sizeof(void *)); if (size < data_offsets_size || size < extra_buffers_size) { binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: got transaction with invalid extra_buffers_size %zd\n", alloc->pid, extra_buffers_size); - return NULL; + return ERR_PTR(-EINVAL); } if (is_async && alloc->free_async_space < size + sizeof(struct binder_buffer)) { binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: binder_alloc_buf size %zd failed, no async space left\n", alloc->pid, size); - return NULL; + return ERR_PTR(-ENOSPC); } while (n) { @@ -327,7 +328,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, if (best_fit == NULL) { pr_err("%d: binder_alloc_buf size %zd failed, no address space\n", alloc->pid, size); - return NULL; + return ERR_PTR(-ENOSPC); } if (n == NULL) { buffer = rb_entry(best_fit, struct binder_buffer, rb_node); @@ -350,9 +351,10 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, (void *)PAGE_ALIGN((uintptr_t)buffer->data + buffer_size); if (end_page_addr > has_page_addr) end_page_addr = has_page_addr; - if (binder_update_page_range(alloc, 1, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL)) - return NULL; + ret = binder_update_page_range(alloc, 1, + (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL); + if (ret) + return ERR_PTR(ret); rb_erase(best_fit, &alloc->free_buffers); buffer->free = 0; From 9b9340c58afa2cf2a76ae06f3019ad631e6f7799 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 24 May 2017 10:51:01 -0700 Subject: [PATCH 0910/3220] FROMLIST: binder: refactor queue management in binder_thread_read (from https://patchwork.kernel.org/patch/9817757/) In binder_thread_read, the BINDER_WORK_NODE command is used to communicate the references on the node to userspace. It can take a couple of iterations in the loop to construct the list of commands for user space. When locking is added, the lock would need to be released on each iteration, which means the state could change. The work item is not dequeued during this process, which prevents a simpler queue management that can just dequeue up front and handle the work item. Fixed by changing the BINDER_WORK_NODE algorithm in binder_thread_read to determine which commands to send to userspace atomically in 1 pass so it stays consistent with the kernel view. The work item is now dequeued immediately since only 1 pass is needed.
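To make the single-pass scheme concrete, here is a minimal stand-alone C model of the decision logic (hypothetical types and names; the real code walks struct binder_node in binder_thread_read and emits the BR_* commands through binder_put_node_cmd()):

#include <stdbool.h>
#include <stdio.h>

struct node_state {
	int internal_strong_refs, local_strong_refs, local_weak_refs;
	bool has_refs;			/* stands in for !hlist_empty(&node->refs) */
	bool has_strong_ref, has_weak_ref;
};

/* Compute every command implied by the node's state in one pass, in the
 * order used after this patch: INCREFS, ACQUIRE, RELEASE, DECREFS. */
static void emit_node_cmds(struct node_state *n)
{
	int strong = n->internal_strong_refs || n->local_strong_refs;
	int weak = n->has_refs || n->local_weak_refs || strong;
	bool had_strong = n->has_strong_ref, had_weak = n->has_weak_ref;

	/* Update the bookkeeping first, from a single snapshot ... */
	if (weak && !had_weak)
		n->has_weak_ref = true;
	if (strong && !had_strong)
		n->has_strong_ref = true;
	if (!strong && had_strong)
		n->has_strong_ref = false;
	if (!weak && had_weak)
		n->has_weak_ref = false;

	/* ... then emit the user-space commands from the same snapshot. */
	if (weak && !had_weak)
		puts("BR_INCREFS");
	if (strong && !had_strong)
		puts("BR_ACQUIRE");
	if (!strong && had_strong)
		puts("BR_RELEASE");
	if (!weak && had_weak)
		puts("BR_DECREFS");
}

int main(void)
{
	struct node_state n = { .internal_strong_refs = 1 };

	emit_node_cmds(&n);	/* prints BR_INCREFS, then BR_ACQUIRE */
	return 0;
}

Because all four tests read the same strong/weak/had_* snapshot, no retry loop is needed and the work item can be dequeued before any command is written.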
Change-Id: I9b4109997b2d53ba661867b14d7336cd076be06d Signed-off-by: Todd Kjos --- drivers/android/binder.c | 147 ++++++++++++++++++++++++--------------- 1 file changed, 92 insertions(+), 55 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 5990f7fbcbd3..4e7439506924 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2286,6 +2286,37 @@ static int binder_has_thread_work(struct binder_thread *thread) (thread->looper & BINDER_LOOPER_STATE_NEED_RETURN); } +static int binder_put_node_cmd(struct binder_proc *proc, + struct binder_thread *thread, + void __user **ptrp, + binder_uintptr_t node_ptr, + binder_uintptr_t node_cookie, + int node_debug_id, + uint32_t cmd, const char *cmd_name) +{ + void __user *ptr = *ptrp; + + if (put_user(cmd, (uint32_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(uint32_t); + + if (put_user(node_ptr, (binder_uintptr_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(binder_uintptr_t); + + if (put_user(node_cookie, (binder_uintptr_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(binder_uintptr_t); + + binder_stat_br(proc, thread, cmd); + binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s %d u%016llx c%016llx\n", + proc->pid, thread->pid, cmd_name, node_debug_id, + (u64)node_ptr, (u64)node_cookie); + + *ptrp = ptr; + return 0; +} + static int binder_thread_read(struct binder_proc *proc, struct binder_thread *thread, binder_uintptr_t binder_buffer, size_t size, @@ -2411,72 +2442,78 @@ retry: } break; case BINDER_WORK_NODE: { struct binder_node *node = container_of(w, struct binder_node, work); - uint32_t cmd = BR_NOOP; - const char *cmd_name; - int strong = node->internal_strong_refs || node->local_strong_refs; - int weak = !hlist_empty(&node->refs) || node->local_weak_refs || strong; + int strong, weak; + binder_uintptr_t node_ptr = node->ptr; + binder_uintptr_t node_cookie = node->cookie; + int node_debug_id = node->debug_id; + int has_weak_ref; + int has_strong_ref; + void __user *orig_ptr = ptr; - if (weak && !node->has_weak_ref) { - cmd = BR_INCREFS; - cmd_name = "BR_INCREFS"; + BUG_ON(proc != node->proc); + strong = node->internal_strong_refs || + node->local_strong_refs; + weak = !hlist_empty(&node->refs) || + node->local_weak_refs || strong; + has_strong_ref = node->has_strong_ref; + has_weak_ref = node->has_weak_ref; + + if (weak && !has_weak_ref) { node->has_weak_ref = 1; node->pending_weak_ref = 1; node->local_weak_refs++; - } else if (strong && !node->has_strong_ref) { - cmd = BR_ACQUIRE; - cmd_name = "BR_ACQUIRE"; + } + if (strong && !has_strong_ref) { node->has_strong_ref = 1; node->pending_strong_ref = 1; node->local_strong_refs++; - } else if (!strong && node->has_strong_ref) { - cmd = BR_RELEASE; - cmd_name = "BR_RELEASE"; + } + if (!strong && has_strong_ref) node->has_strong_ref = 0; - } else if (!weak && node->has_weak_ref) { - cmd = BR_DECREFS; - cmd_name = "BR_DECREFS"; + if (!weak && has_weak_ref) node->has_weak_ref = 0; - } - if (cmd != BR_NOOP) { - if (put_user(cmd, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - if (put_user(node->ptr, - (binder_uintptr_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(binder_uintptr_t); - if (put_user(node->cookie, - (binder_uintptr_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(binder_uintptr_t); + list_del(&w->entry); - binder_stat_br(proc, thread, cmd); - binder_debug(BINDER_DEBUG_USER_REFS, - "%d:%d %s %d u%016llx c%016llx\n", - proc->pid, thread->pid, cmd_name, - node->debug_id, - (u64)node->ptr, (u64)node->cookie); - } 
else { - list_del_init(&w->entry); - if (!weak && !strong) { - binder_debug(BINDER_DEBUG_INTERNAL_REFS, - "%d:%d node %d u%016llx c%016llx deleted\n", - proc->pid, thread->pid, - node->debug_id, - (u64)node->ptr, - (u64)node->cookie); - rb_erase(&node->rb_node, &proc->nodes); - kfree(node); - binder_stats_deleted(BINDER_STAT_NODE); - } else { - binder_debug(BINDER_DEBUG_INTERNAL_REFS, - "%d:%d node %d u%016llx c%016llx state unchanged\n", - proc->pid, thread->pid, - node->debug_id, - (u64)node->ptr, - (u64)node->cookie); - } + if (!weak && !strong) { + binder_debug(BINDER_DEBUG_INTERNAL_REFS, + "%d:%d node %d u%016llx c%016llx deleted\n", + proc->pid, thread->pid, + node_debug_id, + (u64)node_ptr, + (u64)node_cookie); + rb_erase(&node->rb_node, &proc->nodes); + kfree(node); + binder_stats_deleted(BINDER_STAT_NODE); } + if (weak && !has_weak_ref) + ret = binder_put_node_cmd( + proc, thread, &ptr, node_ptr, + node_cookie, node_debug_id, + BR_INCREFS, "BR_INCREFS"); + if (!ret && strong && !has_strong_ref) + ret = binder_put_node_cmd( + proc, thread, &ptr, node_ptr, + node_cookie, node_debug_id, + BR_ACQUIRE, "BR_ACQUIRE"); + if (!ret && !strong && has_strong_ref) + ret = binder_put_node_cmd( + proc, thread, &ptr, node_ptr, + node_cookie, node_debug_id, + BR_RELEASE, "BR_RELEASE"); + if (!ret && !weak && has_weak_ref) + ret = binder_put_node_cmd( + proc, thread, &ptr, node_ptr, + node_cookie, node_debug_id, + BR_DECREFS, "BR_DECREFS"); + if (orig_ptr == ptr) + binder_debug(BINDER_DEBUG_INTERNAL_REFS, + "%d:%d node %d u%016llx c%016llx state unchanged\n", + proc->pid, thread->pid, + node_debug_id, + (u64)node_ptr, + (u64)node_cookie); + if (ret) + return ret; } break; case BINDER_WORK_DEAD_BINDER: case BINDER_WORK_DEAD_BINDER_AND_CLEAR: From 6ea6027115718c27e01e4459901c7938492cfa8f Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Tue, 9 May 2017 08:31:32 -0700 Subject: [PATCH 0911/3220] FROMLIST: binder: avoid race conditions when enqueuing txn (from https://patchwork.kernel.org/patch/9817813/) Currently, the transaction complete work item is queued after the transaction. This means that it is possible for the transaction to be handled and a reply to be enqueued in the current thread before the transaction complete is enqueued, which violates the protocol with userspace, which may not expect the transaction complete. Fixed by always enqueuing the transaction complete first. Also, once the transaction is enqueued, it is unsafe to access since it might be freed. Currently, t->flags is accessed to determine whether a sync wake is needed. Changed to access tr->flags instead.
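A stand-alone sketch of the reordering (hypothetical queue helper; the real code uses list_add_tail() on thread->todo and on target_list):

#include <stdbool.h>
#include <stdio.h>

struct txn { bool one_way; };

static void enqueue(const char *item, const char *queue)
{
	printf("enqueue %s on %s\n", item, queue);
}

static void send_txn(struct txn *t, bool reply)
{
	/* Read everything needed from *t up front; once the transaction
	 * is visible to the target it may be handled and freed. */
	bool one_way = t->one_way;

	/* Queue the completion first so this thread always reads
	 * BR_TRANSACTION_COMPLETE before any reply can be queued. */
	enqueue("tcomplete", "thread->todo");
	enqueue("transaction", "target_list");
	if (reply || !one_way)
		puts("wake_up_interruptible_sync(target_wait)");
	else
		puts("wake_up_interruptible(target_wait)");
}

int main(void)
{
	struct txn t = { .one_way = false };

	send_txn(&t, false);
	return 0;
}

The kernel patch gets the same effect by testing tr->flags (the caller-supplied data) rather than t->flags after t has been queued.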
Change-Id: I6c01566e167a39cf17c9027c3817618182e56975 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 4e7439506924..5b1e45cef89e 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1800,6 +1800,9 @@ static void binder_transaction(struct binder_proc *proc, goto err_bad_object_type; } } + tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; + list_add_tail(&tcomplete->entry, &thread->todo); + if (reply) { BUG_ON(t->buffer->async_transaction != 0); binder_pop_transaction(target_thread, in_reply_to); @@ -1819,10 +1822,8 @@ static void binder_transaction(struct binder_proc *proc, } t->work.type = BINDER_WORK_TRANSACTION; list_add_tail(&t->work.entry, target_list); - tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; - list_add_tail(&tcomplete->entry, &thread->todo); if (target_wait) { - if (reply || !(t->flags & TF_ONE_WAY)) + if (reply || !(tr->flags & TF_ONE_WAY)) wake_up_interruptible_sync(target_wait); else wake_up_interruptible(target_wait); From afda44d0aa52f4427a6867fe784997f307b7fe31 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 6 Jan 2017 14:19:25 -0800 Subject: [PATCH 0912/3220] FROMLIST: binder: don't modify thread->looper from other threads (from https://patchwork.kernel.org/patch/9817799/) The looper member of struct binder_thread is a bitmask of control bits. All of the existing bits are modified by the affected thread except for BINDER_LOOPER_STATE_NEED_RETURN which can be modified in binder_deferred_flush() by another thread. To avoid adding a spinlock around all read-mod-writes to modify a bit, the BINDER_LOOPER_STATE_NEED_RETURN flag is replaced by a separate field in struct binder_thread. 
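A stand-alone sketch of the hazard being avoided (hypothetical flag values; the real fields live in struct binder_thread):

#include <stdbool.h>
#include <stdio.h>

enum { STATE_REGISTERED = 0x01, STATE_WAITING = 0x10 };

struct thread_state {
	unsigned int looper;	 /* bit flags, now written only by the owning thread */
	bool looper_need_return; /* whole field, may be set by another thread */
};

int main(void)
{
	struct thread_state t = { .looper = STATE_REGISTERED };

	/* Before the patch, binder_deferred_flush() did the equivalent of
	 *	t.looper |= NEED_RETURN_BIT;
	 * a load/modify/store that races with the owner's own
	 * read-modify-writes of t.looper and can lose an update.
	 * After the patch it performs a plain store to a dedicated field: */
	t.looper_need_return = true;

	printf("looper=%#x need_return=%d\n", t.looper, t.looper_need_return);
	return 0;
}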
Bug: 33250092 32225111 Change-Id: Ia4cefbdbd683c6cb17c323ba7d278de5f2ca0745 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 5b1e45cef89e..6eb9fc064846 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -334,14 +334,14 @@ enum { BINDER_LOOPER_STATE_EXITED = 0x04, BINDER_LOOPER_STATE_INVALID = 0x08, BINDER_LOOPER_STATE_WAITING = 0x10, - BINDER_LOOPER_STATE_NEED_RETURN = 0x20 }; struct binder_thread { struct binder_proc *proc; struct rb_node rb_node; int pid; - int looper; + int looper; /* only modified by this thread */ + bool looper_need_return; /* can be written by other thread */ struct binder_transaction *transaction_stack; struct list_head todo; uint32_t return_error; /* Write failed, return error code in read buf */ @@ -2277,14 +2277,13 @@ static void binder_stat_br(struct binder_proc *proc, static int binder_has_proc_work(struct binder_proc *proc, struct binder_thread *thread) { - return !list_empty(&proc->todo) || - (thread->looper & BINDER_LOOPER_STATE_NEED_RETURN); + return !list_empty(&proc->todo) || thread->looper_need_return; } static int binder_has_thread_work(struct binder_thread *thread) { return !list_empty(&thread->todo) || thread->return_error != BR_OK || - (thread->looper & BINDER_LOOPER_STATE_NEED_RETURN); + thread->looper_need_return; } static int binder_put_node_cmd(struct binder_proc *proc, @@ -2413,8 +2412,7 @@ retry: entry); } else { /* no data added */ - if (ptr - buffer == 4 && - !(thread->looper & BINDER_LOOPER_STATE_NEED_RETURN)) + if (ptr - buffer == 4 && !thread->looper_need_return) goto retry; break; } @@ -2728,7 +2726,7 @@ static struct binder_thread *binder_get_thread(struct binder_proc *proc) INIT_LIST_HEAD(&thread->todo); rb_link_node(&thread->rb_node, parent, p); rb_insert_color(&thread->rb_node, &proc->threads); - thread->looper |= BINDER_LOOPER_STATE_NEED_RETURN; + thread->looper_need_return = true; thread->return_error = BR_OK; thread->return_error2 = BR_OK; } @@ -2984,7 +2982,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ret = 0; err: if (thread) - thread->looper &= ~BINDER_LOOPER_STATE_NEED_RETURN; + thread->looper_need_return = false; binder_unlock(__func__); wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); if (ret && ret != -ERESTARTSYS) @@ -3139,7 +3137,7 @@ static void binder_deferred_flush(struct binder_proc *proc) for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) { struct binder_thread *thread = rb_entry(n, struct binder_thread, rb_node); - thread->looper |= BINDER_LOOPER_STATE_NEED_RETURN; + thread->looper_need_return = true; if (thread->looper & BINDER_LOOPER_STATE_WAITING) { wake_up_interruptible(&thread->wait); wake_count++; @@ -3400,7 +3398,9 @@ static void print_binder_thread(struct seq_file *m, size_t start_pos = m->count; size_t header_pos; - seq_printf(m, " thread %d: l %02x\n", thread->pid, thread->looper); + seq_printf(m, " thread %d: l %02x need_return %d\n", + thread->pid, thread->looper, + thread->looper_need_return); header_pos = m->count; t = thread->transaction_stack; while (t) { From ce9b7747d6a1cb1b4b772b4e2c524ba1a56b551b Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 1 May 2017 17:21:51 -0700 Subject: [PATCH 0913/3220] FROMLIST: binder: remove dead code in binder_get_ref_for_node (from https://patchwork.kernel.org/patch/9817819/) node is always non-NULL in 
binder_get_ref_for_node so the conditional and else clause are not needed Change-Id: I23f011ba59e1869d9577e6bf28e1f1dd38f45713 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 6eb9fc064846..812c86bae4db 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -671,18 +671,12 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, } rb_link_node(&new_ref->rb_node_desc, parent, p); rb_insert_color(&new_ref->rb_node_desc, &proc->refs_by_desc); - if (node) { - hlist_add_head(&new_ref->node_entry, &node->refs); + hlist_add_head(&new_ref->node_entry, &node->refs); - binder_debug(BINDER_DEBUG_INTERNAL_REFS, - "%d new ref %d desc %d for node %d\n", - proc->pid, new_ref->debug_id, new_ref->desc, - node->debug_id); - } else { - binder_debug(BINDER_DEBUG_INTERNAL_REFS, - "%d new ref %d desc %d for dead node\n", - proc->pid, new_ref->debug_id, new_ref->desc); - } + binder_debug(BINDER_DEBUG_INTERNAL_REFS, + "%d new ref %d desc %d for node %d\n", + proc->pid, new_ref->debug_id, new_ref->desc, + node->debug_id); return new_ref; } From db51658467e03c756b7de23eb03ebeb63e607d4c Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 21 Apr 2017 14:32:11 -0700 Subject: [PATCH 0914/3220] FROMLIST: binder: protect against two threads freeing buffer (from https://patchwork.kernel.org/patch/9817815/) Adds protection against malicious user code freeing the same buffer at the same time which could cause a crash. Cannot happen under normal use. Bug: 36650912 Change-Id: I43e078cbf31c0789aaff5ceaf8f1a94c75f79d45 Test: tested manually Signed-off-by: Todd Kjos --- drivers/android/binder.c | 4 ++-- drivers/android/binder_alloc.c | 22 +++++++++++++++++----- drivers/android/binder_alloc.h | 7 ++++--- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 812c86bae4db..d484655133ee 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2025,8 +2025,8 @@ static int binder_thread_write(struct binder_proc *proc, return -EFAULT; ptr += sizeof(binder_uintptr_t); - buffer = binder_alloc_buffer_lookup(&proc->alloc, - data_ptr); + buffer = binder_alloc_prepare_to_free(&proc->alloc, + data_ptr); if (buffer == NULL) { binder_user_error("%d:%d BC_FREE_BUFFER u%016llx no match\n", proc->pid, thread->pid, (u64)data_ptr); diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 16bc8a3b53f5..8f79df3f3613 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -116,7 +116,7 @@ static void binder_insert_allocated_buffer_locked( rb_insert_color(&new_buffer->rb_node, &alloc->allocated_buffers); } -static struct binder_buffer *binder_alloc_buffer_lookup_locked( +static struct binder_buffer *binder_alloc_prepare_to_free_locked( struct binder_alloc *alloc, uintptr_t user_ptr) { @@ -135,8 +135,19 @@ static struct binder_buffer *binder_alloc_buffer_lookup_locked( n = n->rb_left; else if (kern_ptr > buffer) n = n->rb_right; - else + else { + /* + * Guard against user threads attempting to + * free the buffer twice + */ + if (buffer->free_in_progress) { + pr_err("%d:%d FREE_BUFFER u%016llx user freed buffer twice\n", + alloc->pid, current->pid, (u64)user_ptr); + return NULL; + } + buffer->free_in_progress = 1; return buffer; + } } return NULL; } @@ -152,13 +163,13 @@ static struct binder_buffer *binder_alloc_buffer_lookup_locked( * 
* Return: Pointer to buffer or NULL */ -struct binder_buffer *binder_alloc_buffer_lookup(struct binder_alloc *alloc, - uintptr_t user_ptr) +struct binder_buffer *binder_alloc_prepare_to_free(struct binder_alloc *alloc, + uintptr_t user_ptr) { struct binder_buffer *buffer; mutex_lock(&alloc->mutex); - buffer = binder_alloc_buffer_lookup_locked(alloc, user_ptr); + buffer = binder_alloc_prepare_to_free_locked(alloc, user_ptr); mutex_unlock(&alloc->mutex); return buffer; } @@ -358,6 +369,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, rb_erase(best_fit, &alloc->free_buffers); buffer->free = 0; + buffer->free_in_progress = 0; binder_insert_allocated_buffer_locked(alloc, buffer); if (buffer_size != size) { struct binder_buffer *new_buffer = (void *)buffer->data + size; diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 721c511431f9..088e4ffc6230 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -48,7 +48,8 @@ struct binder_buffer { unsigned free:1; unsigned allow_user_free:1; unsigned async_transaction:1; - unsigned debug_id:29; + unsigned free_in_progress:1; + unsigned debug_id:28; struct binder_transaction *transaction; @@ -109,8 +110,8 @@ extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, extern void binder_alloc_init(struct binder_alloc *alloc); extern void binder_alloc_vma_close(struct binder_alloc *alloc); extern struct binder_buffer * -binder_alloc_buffer_lookup(struct binder_alloc *alloc, - uintptr_t user_ptr); +binder_alloc_prepare_to_free(struct binder_alloc *alloc, + uintptr_t user_ptr); extern void binder_alloc_free_buf(struct binder_alloc *alloc, struct binder_buffer *buffer); extern int binder_alloc_mmap_handler(struct binder_alloc *alloc, From 42e1ca789418a06b5ed727f527bb0f329bc28c26 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Wed, 15 Mar 2017 18:22:52 +0100 Subject: [PATCH 0915/3220] FROMLIST: binder: add more debug info when allocation fails. 
(from https://patchwork.kernel.org/patch/9817797/) Bug: 36088202 Test: tested manually Change-Id: Ib526a9c375e6136669b72f341e0b54d896fd1cec Signed-off-by: Martijn Coenen Signed-off-by: Siqi Lin --- drivers/android/binder_alloc.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 8f79df3f3613..aabfebac6e57 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -337,8 +337,36 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, } } if (best_fit == NULL) { + size_t allocated_buffers = 0; + size_t largest_alloc_size = 0; + size_t total_alloc_size = 0; + size_t free_buffers = 0; + size_t largest_free_size = 0; + size_t total_free_size = 0; + + for (n = rb_first(&alloc->allocated_buffers); n != NULL; + n = rb_next(n)) { + buffer = rb_entry(n, struct binder_buffer, rb_node); + buffer_size = binder_alloc_buffer_size(alloc, buffer); + allocated_buffers++; + total_alloc_size += buffer_size; + if (buffer_size > largest_alloc_size) + largest_alloc_size = buffer_size; + } + for (n = rb_first(&alloc->free_buffers); n != NULL; + n = rb_next(n)) { + buffer = rb_entry(n, struct binder_buffer, rb_node); + buffer_size = binder_alloc_buffer_size(alloc, buffer); + free_buffers++; + total_free_size += buffer_size; + if (buffer_size > largest_free_size) + largest_free_size = buffer_size; + } pr_err("%d: binder_alloc_buf size %zd failed, no address space\n", alloc->pid, size); + pr_err("allocated: %zd (num: %zd largest: %zd), free: %zd (num: %zd largest: %zd)\n", + total_alloc_size, allocated_buffers, largest_alloc_size, + total_free_size, free_buffers, largest_free_size); return ERR_PTR(-ENOSPC); } if (n == NULL) { @@ -698,9 +726,10 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) static void print_binder_buffer(struct seq_file *m, const char *prefix, struct binder_buffer *buffer) { - seq_printf(m, "%s %d: %pK size %zd:%zd %s\n", + seq_printf(m, "%s %d: %pK size %zd:%zd:%zd %s\n", prefix, buffer->debug_id, buffer->data, buffer->data_size, buffer->offsets_size, + buffer->extra_buffers_size, buffer->transaction ? "active" : "delivered"); } From 0f32aeb35f8b8c3d07f1c03062cc195590145d35 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 24 May 2017 13:33:28 -0700 Subject: [PATCH 0916/3220] FROMLIST: binder: use atomic for transaction_log index (from https://patchwork.kernel.org/patch/9817807/) The log->next index for the transaction log was not protected when incremented. This led to a case where log->next++ resulted in an index larger than ARRAY_SIZE(log->entry) and eventually a bad access to memory. 
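For illustration, the old update amounted to the following unsynchronized read-modify-write on a shared index (a simplified stand-alone model, not the driver source):

#include <stdio.h>

#define N_ENTRIES 32

struct txn_log { int next; int full; int entry[N_ENTRIES]; };

int main(void)
{
	struct txn_log log = { 0 };

	/* With two concurrent writers, one thread can observe next == 32
	 * after another thread's increment but before its wrap-around
	 * reset below, and then index entry[32]: out of bounds. */
	int *e = &log.entry[log.next];

	log.next++;
	if (log.next == N_ENTRIES) {
		log.next = 0;
		log.full = 1;
	}
	*e = 42;	/* stand-in for filling in the entry */
	printf("next=%d full=%d\n", log.next, log.full);
	return 0;
}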
Fixed by making the log index an atomic64 and converting to an array by using "% ARRAY_SIZE(log->entry)" Also added "complete" field to the log entry which is written last to tell the print code whether the entry is complete Bug: 62038227 Test: tested manually Change-Id: I1bb1c1a332a6ac458a626f5bedd05022b56b91f2 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 74 +++++++++++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 16 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index d484655133ee..33d7e981324d 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -187,6 +187,7 @@ static inline void binder_stats_created(enum binder_stat_types type) struct binder_transaction_log_entry { int debug_id; + int debug_id_done; int call_type; int from_proc; int from_thread; @@ -202,8 +203,8 @@ struct binder_transaction_log_entry { const char *context_name; }; struct binder_transaction_log { - int next; - int full; + atomic_t cur; + bool full; struct binder_transaction_log_entry entry[32]; }; static struct binder_transaction_log binder_transaction_log; @@ -213,14 +214,19 @@ static struct binder_transaction_log_entry *binder_transaction_log_add( struct binder_transaction_log *log) { struct binder_transaction_log_entry *e; + unsigned int cur = atomic_inc_return(&log->cur); - e = &log->entry[log->next]; - memset(e, 0, sizeof(*e)); - log->next++; - if (log->next == ARRAY_SIZE(log->entry)) { - log->next = 0; + if (cur >= ARRAY_SIZE(log->entry)) log->full = 1; - } + e = &log->entry[cur % ARRAY_SIZE(log->entry)]; + WRITE_ONCE(e->debug_id_done, 0); + /* + * write-barrier to synchronize access to e->debug_id_done. + * We make sure the initialized 0 value is seen before + * memset() other fields are zeroed by memset. + */ + smp_wmb(); + memset(e, 0, sizeof(*e)); return e; } @@ -1401,8 +1407,10 @@ static void binder_transaction(struct binder_proc *proc, struct binder_buffer_object *last_fixup_obj = NULL; binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; + int t_debug_id = atomic_inc_return(&binder_last_id); e = binder_transaction_log_add(&binder_transaction_log); + e->debug_id = t_debug_id; e->call_type = reply ? 
2 : !!(tr->flags & TF_ONE_WAY); e->from_proc = proc->pid; e->from_thread = thread->pid; @@ -1546,8 +1554,7 @@ static void binder_transaction(struct binder_proc *proc, } binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE); - t->debug_id = atomic_inc_return(&binder_last_id); - e->debug_id = t->debug_id; + t->debug_id = t_debug_id; if (reply) binder_debug(BINDER_DEBUG_TRANSACTION, @@ -1822,6 +1829,12 @@ static void binder_transaction(struct binder_proc *proc, else wake_up_interruptible(target_wait); } + /* + * write barrier to synchronize with initialization + * of log entry + */ + smp_wmb(); + WRITE_ONCE(e->debug_id_done, t_debug_id); return; err_translate_failed: @@ -1859,6 +1872,13 @@ err_no_context_mgr_node: e->return_error_line = return_error_line; fe = binder_transaction_log_add(&binder_transaction_log_failed); *fe = *e; + /* + * write barrier to synchronize with initialization + * of log entry + */ + smp_wmb(); + WRITE_ONCE(e->debug_id_done, t_debug_id); + WRITE_ONCE(fe->debug_id_done, t_debug_id); } BUG_ON(thread->return_error != BR_OK); @@ -3719,27 +3739,47 @@ static int binder_proc_show(struct seq_file *m, void *unused) static void print_binder_transaction_log_entry(struct seq_file *m, struct binder_transaction_log_entry *e) { + int debug_id = READ_ONCE(e->debug_id_done); + /* + * read barrier to guarantee debug_id_done read before + * we print the log values + */ + smp_rmb(); seq_printf(m, - "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d ret %d/%d l=%d\n", + "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d ret %d/%d l=%d", e->debug_id, (e->call_type == 2) ? "reply" : ((e->call_type == 1) ? "async" : "call "), e->from_proc, e->from_thread, e->to_proc, e->to_thread, e->context_name, e->to_node, e->target_handle, e->data_size, e->offsets_size, e->return_error, e->return_error_param, e->return_error_line); + /* + * read-barrier to guarantee read of debug_id_done after + * done printing the fields of the entry + */ + smp_rmb(); + seq_printf(m, debug_id && debug_id == READ_ONCE(e->debug_id_done) ? + "\n" : " (incomplete)\n"); } static int binder_transaction_log_show(struct seq_file *m, void *unused) { struct binder_transaction_log *log = m->private; + unsigned int log_cur = atomic_read(&log->cur); + unsigned int count; + unsigned int cur; int i; - if (log->full) { - for (i = log->next; i < ARRAY_SIZE(log->entry); i++) - print_binder_transaction_log_entry(m, &log->entry[i]); + count = log_cur + 1; + cur = count < ARRAY_SIZE(log->entry) && !log->full ? 
+ 0 : count % ARRAY_SIZE(log->entry); + if (count > ARRAY_SIZE(log->entry) || log->full) + count = ARRAY_SIZE(log->entry); + for (i = 0; i < count; i++) { + unsigned int index = cur++ % ARRAY_SIZE(log->entry); + + print_binder_transaction_log_entry(m, &log->entry[index]); } - for (i = 0; i < log->next; i++) - print_binder_transaction_log_entry(m, &log->entry[i]); return 0; } @@ -3794,6 +3834,8 @@ static int __init binder_init(void) struct binder_device *device; struct hlist_node *tmp; + atomic_set(&binder_transaction_log.cur, ~0U); + atomic_set(&binder_transaction_log_failed.cur, ~0U); binder_deferred_workqueue = create_singlethread_workqueue("binder"); if (!binder_deferred_workqueue) return -ENOMEM; From 162735381739bb4f3e5ad6f35a6319d99ee7e697 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 30 Mar 2017 18:02:13 -0700 Subject: [PATCH 0917/3220] FROMLIST: binder: refactor binder_pop_transaction (from https://lkml.org/lkml/2017/6/29/754) binder_pop_transaction needs to be split into 2 pieces to allow the proc lock to be held on entry to dequeue the transaction stack, but no lock when kfree'ing the transaction. Split into binder_pop_transaction_locked and binder_free_transaction (the actual locks are still to be added). Change-Id: I848ae994cc27b3cd083cff2dbd1071762784f4a3 Test: tested manually Signed-off-by: Todd Kjos --- drivers/android/binder.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 33d7e981324d..2006fb7cd69a 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -769,14 +769,16 @@ static int binder_dec_ref(struct binder_ref *ref, int strong) static void binder_pop_transaction(struct binder_thread *target_thread, struct binder_transaction *t) { - if (target_thread) { - BUG_ON(target_thread->transaction_stack != t); - BUG_ON(target_thread->transaction_stack->from != target_thread); - target_thread->transaction_stack = - target_thread->transaction_stack->from_parent; - t->from = NULL; - } - t->need_reply = 0; + BUG_ON(!target_thread); + BUG_ON(target_thread->transaction_stack != t); + BUG_ON(target_thread->transaction_stack->from != target_thread); + target_thread->transaction_stack = + target_thread->transaction_stack->from_parent; + t->from = NULL; +} + +static void binder_free_transaction(struct binder_transaction *t) +{ if (t->buffer) t->buffer->transaction = NULL; kfree(t); @@ -809,6 +811,7 @@ static void binder_send_failed_reply(struct binder_transaction *t, binder_pop_transaction(target_thread, t); target_thread->return_error = error_code; wake_up_interruptible(&target_thread->wait); + binder_free_transaction(t); } else { pr_err("reply failed, target thread, %d:%d, has error code %d already\n", target_thread->proc->pid, @@ -823,7 +826,7 @@ static void binder_send_failed_reply(struct binder_transaction *t, "send failed reply for transaction %d, target dead\n", t->debug_id); - binder_pop_transaction(target_thread, t); + binder_free_transaction(t); if (next == NULL) { binder_debug(BINDER_DEBUG_DEAD_BINDER, "reply failed, no target thread at root\n"); @@ -1807,6 +1810,7 @@ static void binder_transaction(struct binder_proc *proc, if (reply) { BUG_ON(t->buffer->async_transaction != 0); binder_pop_transaction(target_thread, in_reply_to); + binder_free_transaction(in_reply_to); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); t->need_reply = 1; @@ -2636,9 +2640,7 @@ retry: t->to_thread = thread; thread->transaction_stack 
= t; } else { - t->buffer->transaction = NULL; - kfree(t); - binder_stats_deleted(BINDER_STAT_TRANSACTION); + binder_free_transaction(t); } break; } @@ -2681,9 +2683,7 @@ static void binder_release_work(struct list_head *list) binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered transaction %d\n", t->debug_id); - t->buffer->transaction = NULL; - kfree(t); - binder_stats_deleted(BINDER_STAT_TRANSACTION); + binder_free_transaction(t); } } break; case BINDER_WORK_TRANSACTION_COMPLETE: { From 3a822b33c84bf30d2081ede2c42d53ae7ab89d85 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 21 Apr 2017 17:35:12 -0700 Subject: [PATCH 0918/3220] FROMLIST: binder: guarantee txn complete / errors delivered in-order (from https://patchwork.kernel.org/patch/9817805/) Since errors are tracked in the return_error/return_error2 fields of the binder_thread object and BR_TRANSACTION_COMPLETEs can be tracked either in those fields or via the thread todo work list, it is possible for errors to be reported ahead of the associated txn complete. Use the thread todo work list for errors to guarantee order. Also changed binder_send_failed_reply to pop the transaction even if it failed to send a reply. Bug: 37218618 Test: tested manually Change-Id: I196cfaeed09fdcd697f8ab25eea4e04241fdb08f Signed-off-by: Todd Kjos --- drivers/android/binder.c | 125 ++++++++++++++++++++++----------------- 1 file changed, 72 insertions(+), 53 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 2006fb7cd69a..f62919855936 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -249,6 +249,7 @@ struct binder_work { enum { BINDER_WORK_TRANSACTION = 1, BINDER_WORK_TRANSACTION_COMPLETE, + BINDER_WORK_RETURN_ERROR, BINDER_WORK_NODE, BINDER_WORK_DEAD_BINDER, BINDER_WORK_DEAD_BINDER_AND_CLEAR, @@ -256,6 +257,11 @@ struct binder_work { } type; }; +struct binder_error { + struct binder_work work; + uint32_t cmd; +}; + struct binder_node { int debug_id; struct binder_work work; @@ -350,10 +356,8 @@ struct binder_thread { bool looper_need_return; /* can be written by other thread */ struct binder_transaction *transaction_stack; struct list_head todo; - uint32_t return_error; /* Write failed, return error code in read buf */ - uint32_t return_error2; /* Write failed, return error code in read */ - /* buffer. 
Used when sending a reply to a dead process that */ - /* we are also waiting on */ + struct binder_error return_error; + struct binder_error reply_error; wait_queue_head_t wait; struct binder_stats stats; }; @@ -795,29 +799,24 @@ static void binder_send_failed_reply(struct binder_transaction *t, while (1) { target_thread = t->from; if (target_thread) { - if (target_thread->return_error != BR_OK && - target_thread->return_error2 == BR_OK) { - target_thread->return_error2 = - target_thread->return_error; - target_thread->return_error = BR_OK; - } - if (target_thread->return_error == BR_OK) { - binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, - "send failed reply for transaction %d to %d:%d\n", - t->debug_id, - target_thread->proc->pid, - target_thread->pid); + binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, + "send failed reply for transaction %d to %d:%d\n", + t->debug_id, + target_thread->proc->pid, + target_thread->pid); - binder_pop_transaction(target_thread, t); - target_thread->return_error = error_code; + binder_pop_transaction(target_thread, t); + if (target_thread->reply_error.cmd == BR_OK) { + target_thread->reply_error.cmd = error_code; + list_add_tail( + &target_thread->reply_error.work.entry, + &target_thread->todo); wake_up_interruptible(&target_thread->wait); - binder_free_transaction(t); } else { - pr_err("reply failed, target thread, %d:%d, has error code %d already\n", - target_thread->proc->pid, - target_thread->pid, - target_thread->return_error); + WARN(1, "Unexpected reply error: %u\n", + target_thread->reply_error.cmd); } + binder_free_transaction(t); return; } next = t->from_parent; @@ -1885,12 +1884,17 @@ err_no_context_mgr_node: WRITE_ONCE(fe->debug_id_done, t_debug_id); } - BUG_ON(thread->return_error != BR_OK); + BUG_ON(thread->return_error.cmd != BR_OK); if (in_reply_to) { - thread->return_error = BR_TRANSACTION_COMPLETE; + thread->return_error.cmd = BR_TRANSACTION_COMPLETE; + list_add_tail(&thread->return_error.work.entry, + &thread->todo); binder_send_failed_reply(in_reply_to, return_error); - } else - thread->return_error = return_error; + } else { + thread->return_error.cmd = return_error; + list_add_tail(&thread->return_error.work.entry, + &thread->todo); + } } static int binder_thread_write(struct binder_proc *proc, @@ -1904,7 +1908,7 @@ static int binder_thread_write(struct binder_proc *proc, void __user *ptr = buffer + *consumed; void __user *end = buffer + size; - while (ptr < end && thread->return_error == BR_OK) { + while (ptr < end && thread->return_error.cmd == BR_OK) { if (get_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); @@ -2184,7 +2188,12 @@ static int binder_thread_write(struct binder_proc *proc, } death = kzalloc(sizeof(*death), GFP_KERNEL); if (death == NULL) { - thread->return_error = BR_ERROR; + WARN_ON(thread->return_error.cmd != + BR_OK); + thread->return_error.cmd = BR_ERROR; + list_add_tail( + &thread->return_error.work.entry, + &thread->todo); binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n", proc->pid, thread->pid); @@ -2300,8 +2309,7 @@ static int binder_has_proc_work(struct binder_proc *proc, static int binder_has_thread_work(struct binder_thread *thread) { - return !list_empty(&thread->todo) || thread->return_error != BR_OK || - thread->looper_need_return; + return !list_empty(&thread->todo) || thread->looper_need_return; } static int binder_put_node_cmd(struct binder_proc *proc, @@ -2357,25 +2365,6 @@ retry: wait_for_proc_work = thread->transaction_stack == NULL 
&& list_empty(&thread->todo); - if (thread->return_error != BR_OK && ptr < end) { - if (thread->return_error2 != BR_OK) { - if (put_user(thread->return_error2, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - binder_stat_br(proc, thread, thread->return_error2); - if (ptr == end) - goto done; - thread->return_error2 = BR_OK; - } - if (put_user(thread->return_error, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - binder_stat_br(proc, thread, thread->return_error); - thread->return_error = BR_OK; - goto done; - } - - thread->looper |= BINDER_LOOPER_STATE_WAITING; if (wait_for_proc_work) proc->ready_threads++; @@ -2442,6 +2431,19 @@ retry: case BINDER_WORK_TRANSACTION: { t = container_of(w, struct binder_transaction, work); } break; + case BINDER_WORK_RETURN_ERROR: { + struct binder_error *e = container_of( + w, struct binder_error, work); + + WARN_ON(e->cmd == BR_OK); + if (put_user(e->cmd, (uint32_t __user *)ptr)) + return -EFAULT; + e->cmd = BR_OK; + ptr += sizeof(uint32_t); + + binder_stat_br(proc, thread, cmd); + list_del(&w->entry); + } break; case BINDER_WORK_TRANSACTION_COMPLETE: { cmd = BR_TRANSACTION_COMPLETE; if (put_user(cmd, (uint32_t __user *)ptr)) @@ -2686,6 +2688,14 @@ static void binder_release_work(struct list_head *list) binder_free_transaction(t); } } break; + case BINDER_WORK_RETURN_ERROR: { + struct binder_error *e = container_of( + w, struct binder_error, work); + + binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, + "undelivered TRANSACTION_ERROR: %u\n", + e->cmd); + } break; case BINDER_WORK_TRANSACTION_COMPLETE: { binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered TRANSACTION_COMPLETE\n"); @@ -2741,8 +2751,10 @@ static struct binder_thread *binder_get_thread(struct binder_proc *proc) rb_link_node(&thread->rb_node, parent, p); rb_insert_color(&thread->rb_node, &proc->threads); thread->looper_need_return = true; - thread->return_error = BR_OK; - thread->return_error2 = BR_OK; + thread->return_error.work.type = BINDER_WORK_RETURN_ERROR; + thread->return_error.cmd = BR_OK; + thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR; + thread->reply_error.cmd = BR_OK; } return thread; } @@ -2800,7 +2812,7 @@ static unsigned int binder_poll(struct file *filp, thread = binder_get_thread(proc); wait_for_proc_work = thread->transaction_stack == NULL && - list_empty(&thread->todo) && thread->return_error == BR_OK; + list_empty(&thread->todo); binder_unlock(__func__); @@ -3379,6 +3391,13 @@ static void print_binder_work(struct seq_file *m, const char *prefix, t = container_of(w, struct binder_transaction, work); print_binder_transaction(m, transaction_prefix, t); break; + case BINDER_WORK_RETURN_ERROR: { + struct binder_error *e = container_of( + w, struct binder_error, work); + + seq_printf(m, "%stransaction error: %u\n", + prefix, e->cmd); + } break; case BINDER_WORK_TRANSACTION_COMPLETE: seq_printf(m, "%stransaction complete\n", prefix); break; From f80cbc72e1b892d9df17c84050891a6bb53cbeac Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 26 May 2017 11:56:29 -0700 Subject: [PATCH 0919/3220] FROMLIST: binder: make sure target_node has strong ref (from https://patchwork.kernel.org/patch/9817787/) When initiating a transaction, the target_node must have a strong ref on it. Then we take a second strong ref to make sure the node survives until the transaction is complete. 
Change-Id: If7429cb43eda520ab89d45df6c19327cee97c60c Signed-off-by: Todd Kjos --- drivers/android/binder.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index f62919855936..0dec6d6c66ca 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1470,8 +1470,19 @@ static void binder_transaction(struct binder_proc *proc, if (tr->target.handle) { struct binder_ref *ref; + /* + * There must already be a strong ref + * on this node. If so, do a strong + * increment on the node to ensure it + * stays alive until the transaction is + * done. + */ ref = binder_get_ref(proc, tr->target.handle, true); - if (ref == NULL) { + if (ref) { + binder_inc_node(ref->node, 1, 0, NULL); + target_node = ref->node; + } + if (target_node == NULL) { binder_user_error("%d:%d got transaction to invalid handle\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; @@ -1479,7 +1490,6 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_invalid_target_handle; } - target_node = ref->node; } else { mutex_lock(&context->context_mgr_node_lock); target_node = context->binder_context_mgr_node; @@ -1489,6 +1499,7 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_no_context_mgr_node; } + binder_inc_node(target_node, 1, 0, NULL); mutex_unlock(&context->context_mgr_node_lock); } e->to_node = target_node->debug_id; @@ -1609,9 +1620,6 @@ static void binder_transaction(struct binder_proc *proc, t->buffer->transaction = t; t->buffer->target_node = target_node; trace_binder_transaction_alloc_buf(t->buffer); - if (target_node) - binder_inc_node(target_node, 1, 0, NULL); - off_start = (binder_size_t *)(t->buffer->data + ALIGN(tr->data_size, sizeof(void *))); offp = off_start; @@ -1847,6 +1855,7 @@ err_bad_parent: err_copy_data_failed: trace_binder_transaction_failed_buffer_release(t->buffer); binder_transaction_buffer_release(target_proc, t->buffer, offp); + target_node = NULL; t->buffer->transaction = NULL; binder_alloc_free_buf(&target_proc->alloc, t->buffer); err_binder_alloc_buf_failed: @@ -1861,6 +1870,9 @@ err_empty_call_stack: err_dead_binder: err_invalid_target_handle: err_no_context_mgr_node: + if (target_node) + binder_dec_node(target_node, 1, 0); + binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d transaction failed %d/%d, size %lld-%lld line %d\n", proc->pid, thread->pid, return_error, return_error_param, From e482ec39d635cd1bcf764f2aefbc4ab3149d2299 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 12 May 2017 14:42:55 -0700 Subject: [PATCH 0920/3220] FROMLIST: binder: make sure accesses to proc/thread are safe (from https://patchwork.kernel.org/patch/9817787/) binder_thread and binder_proc may be accessed by other threads when processing a transaction. Therefore they must be prevented from being freed while a transaction is in progress that references them. This is done by introducing a temporary reference counter for threads and procs that indicates that the object is in use and must not be freed. binder_thread_dec_tmpref() and binder_proc_dec_tmpref() are used to decrement the temporary reference. It is safe to free a binder_thread if there is no reference and it has been released (indicated by thread->is_dead). It is safe to free a binder_proc if it has no remaining threads and no reference. 
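As a minimal sketch of the intended calling pattern (illustrative only, not part of the diff that follows; the dereference in the middle stands in for whatever work the caller does), the sender thread of a transaction is pinned with a temporary reference and released when done:

	struct binder_thread *from;

	from = binder_get_txn_from(t);	/* takes t->lock, bumps from->tmp_ref */
	if (from) {
		/* 'from' may be dereferenced safely here, e.g. from->proc->pid */
		binder_thread_dec_tmpref(from);	/* frees thread if dead and unreferenced */
	}
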
A spinlock is added to the binder_transaction to safely access and set references for t->from and for debug code to safely access t->to_thread and t->to_proc. Change-Id: I0a00a0294c3e93aea8b3f141c6f18e77ad244078 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 233 ++++++++++++++++++++++++++++++++++----- 1 file changed, 206 insertions(+), 27 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 0dec6d6c66ca..a16375e133cc 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -325,6 +325,7 @@ struct binder_proc { struct files_struct *files; struct hlist_node deferred_work_node; int deferred_work; + bool is_dead; struct list_head todo; wait_queue_head_t wait; @@ -334,6 +335,7 @@ struct binder_proc { int requested_threads; int requested_threads_started; int ready_threads; + int tmp_ref; long default_priority; struct dentry *debugfs_entry; struct binder_alloc alloc; @@ -360,6 +362,8 @@ struct binder_thread { struct binder_error reply_error; wait_queue_head_t wait; struct binder_stats stats; + atomic_t tmp_ref; + bool is_dead; }; struct binder_transaction { @@ -379,10 +383,19 @@ struct binder_transaction { long priority; long saved_priority; kuid_t sender_euid; + /** + * @lock: protects @from, @to_proc, and @to_thread + * + * @from, @to_proc, and @to_thread can be set to NULL + * during thread teardown + */ + spinlock_t lock; }; static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); +static void binder_free_thread(struct binder_thread *thread); +static void binder_free_proc(struct binder_proc *proc); static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) { @@ -781,6 +794,79 @@ static void binder_pop_transaction(struct binder_thread *target_thread, t->from = NULL; } +/** + * binder_thread_dec_tmpref() - decrement thread->tmp_ref + * @thread: thread to decrement + * + * A thread needs to be kept alive while being used to create or + * handle a transaction. binder_get_txn_from() is used to safely + * extract t->from from a binder_transaction and keep the thread + * indicated by t->from from being freed. When done with that + * binder_thread, this function is called to decrement the + * tmp_ref and free if appropriate (thread has been released + * and no transaction being processed by the driver) + */ +static void binder_thread_dec_tmpref(struct binder_thread *thread) +{ + /* + * atomic is used to protect the counter value while + * it cannot reach zero or thread->is_dead is false + * + * TODO: future patch adds locking to ensure that the + * check of tmp_ref and is_dead is done with a lock held + */ + atomic_dec(&thread->tmp_ref); + if (thread->is_dead && !atomic_read(&thread->tmp_ref)) { + binder_free_thread(thread); + return; + } +} + +/** + * binder_proc_dec_tmpref() - decrement proc->tmp_ref + * @proc: proc to decrement + * + * A binder_proc needs to be kept alive while being used to create or + * handle a transaction. proc->tmp_ref is incremented when + * creating a new transaction or the binder_proc is currently in-use + * by threads that are being released. When done with the binder_proc, + * this function is called to decrement the counter and free the + * proc if appropriate (proc has been released, all threads have + * been released and not currently in-use to process a transaction). 
+ */ +static void binder_proc_dec_tmpref(struct binder_proc *proc) +{ + proc->tmp_ref--; + if (proc->is_dead && RB_EMPTY_ROOT(&proc->threads) && + !proc->tmp_ref) { + binder_free_proc(proc); + return; + } +} + +/** + * binder_get_txn_from() - safely extract the "from" thread in transaction + * @t: binder transaction for t->from + * + * Atomically return the "from" thread and increment the tmp_ref + * count for the thread to ensure it stays alive until + * binder_thread_dec_tmpref() is called. + * + * Return: the value of t->from + */ +static struct binder_thread *binder_get_txn_from( + struct binder_transaction *t) +{ + struct binder_thread *from; + + spin_lock(&t->lock); + from = t->from; + if (from) + atomic_inc(&from->tmp_ref); + spin_unlock(&t->lock); + return from; +} + static void binder_free_transaction(struct binder_transaction *t) { if (t->buffer) @@ -797,7 +883,7 @@ static void binder_send_failed_reply(struct binder_transaction *t, BUG_ON(t->flags & TF_ONE_WAY); while (1) { - target_thread = t->from; + target_thread = binder_get_txn_from(t); if (target_thread) { binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "send failed reply for transaction %d to %d:%d\n", @@ -816,6 +902,7 @@ static void binder_send_failed_reply(struct binder_transaction *t, WARN(1, "Unexpected reply error: %u\n", target_thread->reply_error.cmd); } + binder_thread_dec_tmpref(target_thread); binder_free_transaction(t); return; } @@ -1396,7 +1483,7 @@ static void binder_transaction(struct binder_proc *proc, binder_size_t *offp, *off_end, *off_start; binder_size_t off_min; u8 *sg_bufp, *sg_buf_end; - struct binder_proc *target_proc; + struct binder_proc *target_proc = NULL; struct binder_thread *target_thread = NULL; struct binder_node *target_node = NULL; struct list_head *target_list; @@ -1433,12 +1520,14 @@ static void binder_transaction(struct binder_proc *proc, } binder_set_nice(in_reply_to->saved_priority); if (in_reply_to->to_thread != thread) { + spin_lock(&in_reply_to->lock); binder_user_error("%d:%d got reply transaction with bad transaction stack, transaction %d has target %d:%d\n", proc->pid, thread->pid, in_reply_to->debug_id, in_reply_to->to_proc ? in_reply_to->to_proc->pid : 0, in_reply_to->to_thread ? in_reply_to->to_thread->pid : 0); + spin_unlock(&in_reply_to->lock); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; @@ -1446,7 +1535,7 @@ static void binder_transaction(struct binder_proc *proc, goto err_bad_call_stack; } thread->transaction_stack = in_reply_to->to_parent; - target_thread = in_reply_to->from; + target_thread = binder_get_txn_from(in_reply_to); if (target_thread == NULL) { return_error = BR_DEAD_REPLY; return_error_line = __LINE__; @@ -1466,6 +1555,7 @@ static void binder_transaction(struct binder_proc *proc, goto err_dead_binder; } target_proc = target_thread->proc; + target_proc->tmp_ref++; } else { if (tr->target.handle) { struct binder_ref *ref; @@ -1509,6 +1599,7 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_dead_binder; } + target_proc->tmp_ref++; if (security_binder_transaction(proc->tsk, target_proc->tsk) < 0) { return_error = BR_FAILED_REPLY; @@ -1521,19 +1612,30 @@ static void binder_transaction(struct binder_proc *proc, tmp = thread->transaction_stack; if (tmp->to_thread != thread) { + spin_lock(&tmp->lock); binder_user_error("%d:%d got new transaction with bad transaction stack, transaction %d has target %d:%d\n", proc->pid, thread->pid, tmp->debug_id, tmp->to_proc ? 
tmp->to_proc->pid : 0, tmp->to_thread ? tmp->to_thread->pid : 0); + spin_unlock(&tmp->lock); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; goto err_bad_call_stack; } while (tmp) { - if (tmp->from && tmp->from->proc == target_proc) - target_thread = tmp->from; + struct binder_thread *from; + + spin_lock(&tmp->lock); + from = tmp->from; + if (from && from->proc == target_proc) { + atomic_inc(&from->tmp_ref); + target_thread = from; + spin_unlock(&tmp->lock); + break; + } + spin_unlock(&tmp->lock); tmp = tmp->from_parent; } } @@ -1557,6 +1659,7 @@ static void binder_transaction(struct binder_proc *proc, goto err_alloc_t_failed; } binder_stats_created(BINDER_STAT_TRANSACTION); + spin_lock_init(&t->lock); tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL); if (tcomplete == NULL) { @@ -1815,6 +1918,8 @@ static void binder_transaction(struct binder_proc *proc, list_add_tail(&tcomplete->entry, &thread->todo); if (reply) { + if (target_thread->is_dead) + goto err_dead_proc_or_thread; BUG_ON(t->buffer->async_transaction != 0); binder_pop_transaction(target_thread, in_reply_to); binder_free_transaction(in_reply_to); @@ -1823,6 +1928,11 @@ static void binder_transaction(struct binder_proc *proc, t->need_reply = 1; t->from_parent = thread->transaction_stack; thread->transaction_stack = t; + if (target_proc->is_dead || + (target_thread && target_thread->is_dead)) { + binder_pop_transaction(thread, t); + goto err_dead_proc_or_thread; + } } else { BUG_ON(target_node == NULL); BUG_ON(t->buffer->async_transaction != 1); @@ -1831,6 +1941,9 @@ static void binder_transaction(struct binder_proc *proc, target_wait = NULL; } else target_node->has_async_transaction = 1; + if (target_proc->is_dead || + (target_thread && target_thread->is_dead)) + goto err_dead_proc_or_thread; } t->work.type = BINDER_WORK_TRANSACTION; list_add_tail(&t->work.entry, target_list); @@ -1840,6 +1953,9 @@ static void binder_transaction(struct binder_proc *proc, else wake_up_interruptible(target_wait); } + if (target_thread) + binder_thread_dec_tmpref(target_thread); + binder_proc_dec_tmpref(target_proc); /* * write barrier to synchronize with initialization * of log entry @@ -1848,6 +1964,9 @@ static void binder_transaction(struct binder_proc *proc, WRITE_ONCE(e->debug_id_done, t_debug_id); return; +err_dead_proc_or_thread: + return_error = BR_DEAD_REPLY; + return_error_line = __LINE__; err_translate_failed: err_bad_object_type: err_bad_offset: @@ -1870,6 +1989,10 @@ err_empty_call_stack: err_dead_binder: err_invalid_target_handle: err_no_context_mgr_node: + if (target_thread) + binder_thread_dec_tmpref(target_thread); + if (target_proc) + binder_proc_dec_tmpref(target_proc); if (target_node) binder_dec_node(target_node, 1, 0); @@ -2422,6 +2545,7 @@ retry: struct binder_transaction_data tr; struct binder_work *w; struct binder_transaction *t = NULL; + struct binder_thread *t_from; if (!list_empty(&thread->todo)) { w = list_first_entry(&thread->todo, struct binder_work, @@ -2610,8 +2734,9 @@ retry: tr.flags = t->flags; tr.sender_euid = from_kuid(current_user_ns(), t->sender_euid); - if (t->from) { - struct task_struct *sender = t->from->proc->tsk; + t_from = binder_get_txn_from(t); + if (t_from) { + struct task_struct *sender = t_from->proc->tsk; tr.sender_pid = task_tgid_nr_ns(sender, task_active_pid_ns(current)); @@ -2628,11 +2753,17 @@ retry: ALIGN(t->buffer->data_size, sizeof(void *)); - if (put_user(cmd, (uint32_t __user *)ptr)) + if (put_user(cmd, (uint32_t __user *)ptr)) { + if (t_from) 
+ binder_thread_dec_tmpref(t_from); return -EFAULT; + } ptr += sizeof(uint32_t); - if (copy_to_user(ptr, &tr, sizeof(tr))) + if (copy_to_user(ptr, &tr, sizeof(tr))) { + if (t_from) + binder_thread_dec_tmpref(t_from); return -EFAULT; + } ptr += sizeof(tr); trace_binder_transaction_received(t); @@ -2642,11 +2773,13 @@ retry: proc->pid, thread->pid, (cmd == BR_TRANSACTION) ? "BR_TRANSACTION" : "BR_REPLY", - t->debug_id, t->from ? t->from->proc->pid : 0, - t->from ? t->from->pid : 0, cmd, + t->debug_id, t_from ? t_from->proc->pid : 0, + t_from ? t_from->pid : 0, cmd, t->buffer->data_size, t->buffer->offsets_size, (u64)tr.data.ptr.buffer, (u64)tr.data.ptr.offsets); + if (t_from) + binder_thread_dec_tmpref(t_from); list_del(&t->work.entry); t->buffer->allow_user_free = 1; if (cmd == BR_TRANSACTION && !(t->flags & TF_ONE_WAY)) { @@ -2758,6 +2891,7 @@ static struct binder_thread *binder_get_thread(struct binder_proc *proc) binder_stats_created(BINDER_STAT_THREAD); thread->proc = proc; thread->pid = current->pid; + atomic_set(&thread->tmp_ref, 0); init_waitqueue_head(&thread->wait); INIT_LIST_HEAD(&thread->todo); rb_link_node(&thread->rb_node, parent, p); @@ -2771,18 +2905,55 @@ static struct binder_thread *binder_get_thread(struct binder_proc *proc) return thread; } -static int binder_free_thread(struct binder_proc *proc, - struct binder_thread *thread) +static void binder_free_proc(struct binder_proc *proc) +{ + BUG_ON(!list_empty(&proc->todo)); + BUG_ON(!list_empty(&proc->delivered_death)); + binder_alloc_deferred_release(&proc->alloc); + put_task_struct(proc->tsk); + binder_stats_deleted(BINDER_STAT_PROC); + kfree(proc); +} + +static void binder_free_thread(struct binder_thread *thread) +{ + BUG_ON(!list_empty(&thread->todo)); + binder_stats_deleted(BINDER_STAT_THREAD); + binder_proc_dec_tmpref(thread->proc); + kfree(thread); +} + +static int binder_thread_release(struct binder_proc *proc, + struct binder_thread *thread) { struct binder_transaction *t; struct binder_transaction *send_reply = NULL; int active_transactions = 0; + struct binder_transaction *last_t = NULL; + /* + * take a ref on the proc so it survives + * after we remove this thread from proc->threads. 
+ * The corresponding dec is when we actually + * free the thread in binder_free_thread() + */ + proc->tmp_ref++; + /* + * take a ref on this thread to ensure it + * survives while we are releasing it + */ + atomic_inc(&thread->tmp_ref); rb_erase(&thread->rb_node, &proc->threads); t = thread->transaction_stack; - if (t && t->to_thread == thread) - send_reply = t; + if (t) { + spin_lock(&t->lock); + if (t->to_thread == thread) + send_reply = t; + } + thread->is_dead = true; + while (t) { + last_t = t; active_transactions++; binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "release %d:%d transaction %d %s, still active\n", @@ -2803,12 +2974,15 @@ static int binder_free_thread(struct binder_proc *proc, t = t->from_parent; } else BUG(); + spin_unlock(&last_t->lock); + if (t) + spin_lock(&t->lock); } + if (send_reply) binder_send_failed_reply(send_reply, BR_DEAD_REPLY); binder_release_work(&thread->todo); - kfree(thread); - binder_stats_deleted(BINDER_STAT_THREAD); + binder_thread_dec_tmpref(thread); return active_transactions; } @@ -2996,7 +3170,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case BINDER_THREAD_EXIT: binder_debug(BINDER_DEBUG_THREADS, "%d:%d exit\n", proc->pid, thread->pid); - binder_free_thread(proc, thread); + binder_thread_release(proc, thread); thread = NULL; break; case BINDER_VERSION: { @@ -3266,7 +3440,13 @@ static void binder_deferred_release(struct binder_proc *proc) context->binder_context_mgr_node = NULL; } mutex_unlock(&context->context_mgr_node_lock); + /* + * Make sure proc stays alive after we + * remove all the threads + */ + proc->tmp_ref++; + proc->is_dead = true; threads = 0; active_transactions = 0; while ((n = rb_first(&proc->threads))) { @@ -3274,7 +3454,7 @@ static void binder_deferred_release(struct binder_proc *proc) thread = rb_entry(n, struct binder_thread, rb_node); threads++; - active_transactions += binder_free_thread(proc, thread); + active_transactions += binder_thread_release(proc, thread); } nodes = 0; @@ -3300,17 +3480,12 @@ static void binder_deferred_release(struct binder_proc *proc) binder_release_work(&proc->todo); binder_release_work(&proc->delivered_death); - binder_alloc_deferred_release(&proc->alloc); - binder_stats_deleted(BINDER_STAT_PROC); - - put_task_struct(proc->tsk); - binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d\n", __func__, proc->pid, threads, nodes, incoming_refs, outgoing_refs, active_transactions); - kfree(proc); + binder_proc_dec_tmpref(proc); } static void binder_deferred_func(struct work_struct *work) @@ -3371,6 +3546,7 @@ binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer) static void print_binder_transaction(struct seq_file *m, const char *prefix, struct binder_transaction *t) { + spin_lock(&t->lock); seq_printf(m, "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %ld r%d", prefix, t->debug_id, t, @@ -3379,6 +3555,8 @@ static void print_binder_transaction(struct seq_file *m, const char *prefix, t->to_proc ? t->to_proc->pid : 0, t->to_thread ? 
t->to_thread->pid : 0, t->code, t->flags, t->priority, t->need_reply); + spin_unlock(&t->lock); + if (t->buffer == NULL) { seq_puts(m, " buffer free\n"); return; @@ -3443,9 +3621,10 @@ static void print_binder_thread(struct seq_file *m, size_t start_pos = m->count; size_t header_pos; - seq_printf(m, " thread %d: l %02x need_return %d\n", + seq_printf(m, " thread %d: l %02x need_return %d tr %d\n", thread->pid, thread->looper, - thread->looper_need_return); + thread->looper_need_return, + atomic_read(&thread->tmp_ref)); header_pos = m->count; t = thread->transaction_stack; while (t) { From f7d874123e4de442dcd6aca8afe0fd0b9d2854b4 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 8 May 2017 09:16:27 -0700 Subject: [PATCH 0921/3220] FROMLIST: binder: refactor binder ref inc/dec for thread safety (from https://patchwork.kernel.org/patch/9817781/) Once locks are added, binder_ref's will only be accessed safely with the proc lock held. Refactor the inc/dec paths to make them atomic with the binder_get_ref* paths and node inc/dec. For example, instead of: ref = binder_get_ref(proc, handle, strong); ... binder_dec_ref(ref, strong); we now have: ret = binder_dec_ref_for_handle(proc, handle, strong, &rdata); Since the actual ref is no longer exposed to callers, a new struct binder_ref_data is introduced which can be used to return a copy of ref state. Change-Id: I7de22107f8ebc967cee63251d584fceb4ea56250 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 484 ++++++++++++++++++++++++--------- drivers/android/binder_trace.h | 32 ++- 2 files changed, 379 insertions(+), 137 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index a16375e133cc..b7109a356c46 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -291,20 +291,51 @@ struct binder_ref_death { binder_uintptr_t cookie; }; +/** + * struct binder_ref_data - binder_ref counts and id + * @debug_id: unique ID for the ref + * @desc: unique userspace handle for ref + * @strong: strong ref count (debugging only if not locked) + * @weak: weak ref count (debugging only if not locked) + * + * Structure to hold ref count and ref id information. Since + * the actual ref can only be accessed with a lock, this structure + * is used to return information about the ref to callers of + * ref inc/dec functions. + */ +struct binder_ref_data { + int debug_id; + uint32_t desc; + int strong; + int weak; +}; + +/** + * struct binder_ref - struct to track references on nodes + * @data: binder_ref_data containing id, handle, and current refcounts + * @rb_node_desc: node for lookup by @data.desc in proc's rb_tree + * @rb_node_node: node for lookup by @node in proc's rb_tree + * @node_entry: list entry for node->refs list in target node + * @proc: binder_proc containing ref + * @node: binder_node of target node. When cleaning up a + * ref for deletion in binder_cleanup_ref, a non-NULL + * @node indicates the node must be freed + * @death: pointer to death notification (ref_death) if requested + * + * Structure to track references from procA to target node (on procB). This + * structure is unsafe to access without holding @proc->outer_lock. 
+ */ struct binder_ref { /* Lookups needed: */ /* node + proc => ref (transaction) */ /* desc + proc => ref (transaction, inc/dec ref) */ /* node => refs + procs (proc exit) */ - int debug_id; + struct binder_ref_data data; struct rb_node rb_node_desc; struct rb_node rb_node_node; struct hlist_node node_entry; struct binder_proc *proc; struct binder_node *node; - uint32_t desc; - int strong; - int weak; struct binder_ref_death *death; }; @@ -628,11 +659,11 @@ static struct binder_ref *binder_get_ref(struct binder_proc *proc, while (n) { ref = rb_entry(n, struct binder_ref, rb_node_desc); - if (desc < ref->desc) { + if (desc < ref->data.desc) { n = n->rb_left; - } else if (desc > ref->desc) { + } else if (desc > ref->data.desc) { n = n->rb_right; - } else if (need_strong_ref && !ref->strong) { + } else if (need_strong_ref && !ref->data.strong) { binder_user_error("tried to use weak ref as strong ref\n"); return NULL; } else { @@ -642,14 +673,33 @@ static struct binder_ref *binder_get_ref(struct binder_proc *proc, return NULL; } +/** + * binder_get_ref_for_node() - get the ref associated with given node + * @proc: binder_proc that owns the ref + * @node: binder_node of target + * @new_ref: newly allocated binder_ref to be initialized or %NULL + * + * Look up the ref for the given node and return it if it exists + * + * If it doesn't exist and the caller provides a newly allocated + * ref, initialize the fields of the newly allocated ref and insert + * into the given proc rb_trees and node refs list. + * + * Return: the ref for node. It is possible that another thread + * allocated/initialized the ref first in which case the + * returned ref would be different than the passed-in + * new_ref. new_ref must be kfree'd by the caller in + * this case. + */ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, - struct binder_node *node) + struct binder_node *node, + struct binder_ref *new_ref) { - struct rb_node *n; + struct binder_context *context = proc->context; struct rb_node **p = &proc->refs_by_node.rb_node; struct rb_node *parent = NULL; - struct binder_ref *ref, *new_ref; - struct binder_context *context = proc->context; + struct binder_ref *ref; + struct rb_node *n; while (*p) { parent = *p; @@ -662,22 +712,22 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, else return ref; } - new_ref = kzalloc(sizeof(*ref), GFP_KERNEL); - if (new_ref == NULL) + if (!new_ref) return NULL; + binder_stats_created(BINDER_STAT_REF); - new_ref->debug_id = atomic_inc_return(&binder_last_id); + new_ref->data.debug_id = atomic_inc_return(&binder_last_id); new_ref->proc = proc; new_ref->node = node; rb_link_node(&new_ref->rb_node_node, parent, p); rb_insert_color(&new_ref->rb_node_node, &proc->refs_by_node); - new_ref->desc = (node == context->binder_context_mgr_node) ? 0 : 1; + new_ref->data.desc = (node == context->binder_context_mgr_node) ? 
0 : 1; for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) { ref = rb_entry(n, struct binder_ref, rb_node_desc); - if (ref->desc > new_ref->desc) + if (ref->data.desc > new_ref->data.desc) break; - new_ref->desc = ref->desc + 1; + new_ref->data.desc = ref->data.desc + 1; } p = &proc->refs_by_desc.rb_node; @@ -685,9 +735,9 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, parent = *p; ref = rb_entry(parent, struct binder_ref, rb_node_desc); - if (new_ref->desc < ref->desc) + if (new_ref->data.desc < ref->data.desc) p = &(*p)->rb_left; - else if (new_ref->desc > ref->desc) + else if (new_ref->data.desc > ref->data.desc) p = &(*p)->rb_right; else BUG(); @@ -698,89 +748,267 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d new ref %d desc %d for node %d\n", - proc->pid, new_ref->debug_id, new_ref->desc, + proc->pid, new_ref->data.debug_id, new_ref->data.desc, node->debug_id); return new_ref; } -static void binder_delete_ref(struct binder_ref *ref) +static void binder_cleanup_ref(struct binder_ref *ref) { binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d delete ref %d desc %d for node %d\n", - ref->proc->pid, ref->debug_id, ref->desc, + ref->proc->pid, ref->data.debug_id, ref->data.desc, ref->node->debug_id); rb_erase(&ref->rb_node_desc, &ref->proc->refs_by_desc); rb_erase(&ref->rb_node_node, &ref->proc->refs_by_node); - if (ref->strong) + + if (ref->data.strong) binder_dec_node(ref->node, 1, 1); + hlist_del(&ref->node_entry); binder_dec_node(ref->node, 0, 1); + if (ref->death) { binder_debug(BINDER_DEBUG_DEAD_BINDER, "%d delete ref %d desc %d has death notification\n", - ref->proc->pid, ref->debug_id, ref->desc); + ref->proc->pid, ref->data.debug_id, + ref->data.desc); list_del(&ref->death->work.entry); - kfree(ref->death); binder_stats_deleted(BINDER_STAT_DEATH); } - kfree(ref); binder_stats_deleted(BINDER_STAT_REF); } +/** + * binder_inc_ref() - increment the ref for given handle + * @ref: ref to be incremented + * @strong: if true, strong increment, else weak + * @target_list: list to queue node work on + * + * Increment the ref. + * + * Return: 0, if successful, else errno + */ static int binder_inc_ref(struct binder_ref *ref, int strong, struct list_head *target_list) { int ret; if (strong) { - if (ref->strong == 0) { + if (ref->data.strong == 0) { ret = binder_inc_node(ref->node, 1, 1, target_list); if (ret) return ret; } - ref->strong++; + ref->data.strong++; } else { - if (ref->weak == 0) { + if (ref->data.weak == 0) { ret = binder_inc_node(ref->node, 0, 1, target_list); if (ret) return ret; } - ref->weak++; + ref->data.weak++; } return 0; } - -static int binder_dec_ref(struct binder_ref *ref, int strong) +/** + * binder_dec_ref() - dec the ref for given handle + * @ref: ref to be decremented + * @strong: if true, strong decrement, else weak + * + * Decrement the ref. + * + * TODO: kfree is avoided here since an upcoming patch + * will put this under a lock. 
+ * + * Return: true if ref is cleaned up and ready to be freed + */ +static bool binder_dec_ref(struct binder_ref *ref, int strong) { if (strong) { - if (ref->strong == 0) { + if (ref->data.strong == 0) { binder_user_error("%d invalid dec strong, ref %d desc %d s %d w %d\n", - ref->proc->pid, ref->debug_id, - ref->desc, ref->strong, ref->weak); - return -EINVAL; + ref->proc->pid, ref->data.debug_id, + ref->data.desc, ref->data.strong, + ref->data.weak); + return false; } - ref->strong--; - if (ref->strong == 0) { + ref->data.strong--; + if (ref->data.strong == 0) { int ret; ret = binder_dec_node(ref->node, strong, 1); if (ret) - return ret; + return false; } } else { - if (ref->weak == 0) { + if (ref->data.weak == 0) { binder_user_error("%d invalid dec weak, ref %d desc %d s %d w %d\n", - ref->proc->pid, ref->debug_id, - ref->desc, ref->strong, ref->weak); - return -EINVAL; + ref->proc->pid, ref->data.debug_id, + ref->data.desc, ref->data.strong, + ref->data.weak); + return false; } - ref->weak--; + ref->data.weak--; } - if (ref->strong == 0 && ref->weak == 0) - binder_delete_ref(ref); - return 0; + if (ref->data.strong == 0 && ref->data.weak == 0) { + binder_cleanup_ref(ref); + /* + * TODO: we could kfree(ref) here, but an upcoming + * patch will call this with a lock held, so we + * return an indication that the ref should be + * freed. + */ + return true; + } + return false; +} + +/** + * binder_get_node_from_ref() - get the node from the given proc/desc + * @proc: proc containing the ref + * @desc: the handle associated with the ref + * @need_strong_ref: if true, only return node if ref is strong + * @rdata: the id/refcount data for the ref + * + * Given a proc and ref handle, return the associated binder_node + * + * Return: a binder_node or NULL if not found or not strong when strong required + */ +static struct binder_node *binder_get_node_from_ref( + struct binder_proc *proc, + u32 desc, bool need_strong_ref, + struct binder_ref_data *rdata) +{ + struct binder_node *node; + struct binder_ref *ref; + + ref = binder_get_ref(proc, desc, need_strong_ref); + if (!ref) + goto err_no_ref; + node = ref->node; + if (rdata) + *rdata = ref->data; + + return node; + +err_no_ref: + return NULL; +} + +/** + * binder_free_ref() - free the binder_ref + * @ref: ref to free + * + * Free the binder_ref and the binder_ref_death indicated by ref->death. + */ +static void binder_free_ref(struct binder_ref *ref) +{ + kfree(ref->death); + kfree(ref); +} + +/** + * binder_update_ref_for_handle() - inc/dec the ref for given handle + * @proc: proc containing the ref + * @desc: the handle associated with the ref + * @increment: true=inc reference, false=dec reference + * @strong: true=strong reference, false=weak reference + * @rdata: the id/refcount data for the ref + * + * Given a proc and ref handle, increment or decrement the ref + * according to "increment" arg. 
+ * + * Return: 0 if successful, else errno + */ +static int binder_update_ref_for_handle(struct binder_proc *proc, + uint32_t desc, bool increment, bool strong, + struct binder_ref_data *rdata) +{ + int ret = 0; + struct binder_ref *ref; + bool delete_ref = false; + + ref = binder_get_ref(proc, desc, strong); + if (!ref) { + ret = -EINVAL; + goto err_no_ref; + } + if (increment) + ret = binder_inc_ref(ref, strong, NULL); + else + delete_ref = binder_dec_ref(ref, strong); + + if (rdata) + *rdata = ref->data; + + if (delete_ref) + binder_free_ref(ref); + return ret; + +err_no_ref: + return ret; +} + +/** + * binder_dec_ref_for_handle() - dec the ref for given handle + * @proc: proc containing the ref + * @desc: the handle associated with the ref + * @strong: true=strong reference, false=weak reference + * @rdata: the id/refcount data for the ref + * + * Just calls binder_update_ref_for_handle() to decrement the ref. + * + * Return: 0 if successful, else errno + */ +static int binder_dec_ref_for_handle(struct binder_proc *proc, + uint32_t desc, bool strong, struct binder_ref_data *rdata) +{ + return binder_update_ref_for_handle(proc, desc, false, strong, rdata); +} + + +/** + * binder_inc_ref_for_node() - increment the ref for given proc/node + * @proc: proc containing the ref + * @node: target node + * @strong: true=strong reference, false=weak reference + * @target_list: worklist to use if node is incremented + * @rdata: the id/refcount data for the ref + * + * Given a proc and node, increment the ref. Create the ref if it + * doesn't already exist + * + * Return: 0 if successful, else errno + */ +static int binder_inc_ref_for_node(struct binder_proc *proc, + struct binder_node *node, + bool strong, + struct list_head *target_list, + struct binder_ref_data *rdata) +{ + struct binder_ref *ref; + struct binder_ref *new_ref = NULL; + int ret = 0; + + ref = binder_get_ref_for_node(proc, node, NULL); + if (!ref) { + new_ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!new_ref) + return -ENOMEM; + ref = binder_get_ref_for_node(proc, node, new_ref); + } + ret = binder_inc_ref(ref, strong, target_list); + *rdata = ref->data; + if (new_ref && ref != new_ref) + /* + * Another thread created the ref first so + * free the one we allocated + */ + kfree(new_ref); + return ret; } static void binder_pop_transaction(struct binder_thread *target_thread, @@ -1125,20 +1353,21 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { struct flat_binder_object *fp; - struct binder_ref *ref; + struct binder_ref_data rdata; + int ret; fp = to_flat_binder_object(hdr); - ref = binder_get_ref(proc, fp->handle, - hdr->type == BINDER_TYPE_HANDLE); - if (ref == NULL) { - pr_err("transaction release %d bad handle %d\n", - debug_id, fp->handle); + ret = binder_dec_ref_for_handle(proc, fp->handle, + hdr->type == BINDER_TYPE_HANDLE, &rdata); + + if (ret) { + pr_err("transaction release %d bad handle %d, ret = %d\n", + debug_id, fp->handle, ret); break; } binder_debug(BINDER_DEBUG_TRANSACTION, - " ref %d desc %d (node %d)\n", - ref->debug_id, ref->desc, ref->node->debug_id); - binder_dec_ref(ref, hdr->type == BINDER_TYPE_HANDLE); + " ref %d desc %d\n", + rdata.debug_id, rdata.desc); } break; case BINDER_TYPE_FD: { @@ -1210,9 +1439,10 @@ static int binder_translate_binder(struct flat_binder_object *fp, struct binder_thread *thread) { struct binder_node *node; - struct binder_ref *ref; struct binder_proc *proc = thread->proc; struct binder_proc 
*target_proc = t->to_proc; + struct binder_ref_data rdata; + int ret; node = binder_get_node(proc, fp->binder); if (!node) { @@ -1233,25 +1463,25 @@ static int binder_translate_binder(struct flat_binder_object *fp, if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) return -EPERM; - ref = binder_get_ref_for_node(target_proc, node); - if (!ref) - return -ENOMEM; + ret = binder_inc_ref_for_node(target_proc, node, + fp->hdr.type == BINDER_TYPE_BINDER, + &thread->todo, &rdata); + if (ret) + return ret; if (fp->hdr.type == BINDER_TYPE_BINDER) fp->hdr.type = BINDER_TYPE_HANDLE; else fp->hdr.type = BINDER_TYPE_WEAK_HANDLE; fp->binder = 0; - fp->handle = ref->desc; + fp->handle = rdata.desc; fp->cookie = 0; - binder_inc_ref(ref, fp->hdr.type == BINDER_TYPE_HANDLE, &thread->todo); - trace_binder_transaction_node_to_ref(t, node, ref); + trace_binder_transaction_node_to_ref(t, node, &rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " node %d u%016llx -> ref %d desc %d\n", node->debug_id, (u64)node->ptr, - ref->debug_id, ref->desc); - + rdata.debug_id, rdata.desc); return 0; } @@ -1259,13 +1489,14 @@ static int binder_translate_handle(struct flat_binder_object *fp, struct binder_transaction *t, struct binder_thread *thread) { - struct binder_ref *ref; struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; + struct binder_node *node; + struct binder_ref_data src_rdata; - ref = binder_get_ref(proc, fp->handle, - fp->hdr.type == BINDER_TYPE_HANDLE); - if (!ref) { + node = binder_get_node_from_ref(proc, fp->handle, + fp->hdr.type == BINDER_TYPE_HANDLE, &src_rdata); + if (!node) { binder_user_error("%d:%d got transaction with invalid handle, %d\n", proc->pid, thread->pid, fp->handle); return -EINVAL; @@ -1273,37 +1504,41 @@ static int binder_translate_handle(struct flat_binder_object *fp, if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) return -EPERM; - if (ref->node->proc == target_proc) { + if (node->proc == target_proc) { if (fp->hdr.type == BINDER_TYPE_HANDLE) fp->hdr.type = BINDER_TYPE_BINDER; else fp->hdr.type = BINDER_TYPE_WEAK_BINDER; - fp->binder = ref->node->ptr; - fp->cookie = ref->node->cookie; - binder_inc_node(ref->node, fp->hdr.type == BINDER_TYPE_BINDER, + fp->binder = node->ptr; + fp->cookie = node->cookie; + binder_inc_node(node, + fp->hdr.type == BINDER_TYPE_BINDER, 0, NULL); - trace_binder_transaction_ref_to_node(t, ref); + trace_binder_transaction_ref_to_node(t, node, &src_rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> node %d u%016llx\n", - ref->debug_id, ref->desc, ref->node->debug_id, - (u64)ref->node->ptr); + src_rdata.debug_id, src_rdata.desc, node->debug_id, + (u64)node->ptr); } else { - struct binder_ref *new_ref; + int ret; + struct binder_ref_data dest_rdata; - new_ref = binder_get_ref_for_node(target_proc, ref->node); - if (!new_ref) - return -ENOMEM; + ret = binder_inc_ref_for_node(target_proc, node, + fp->hdr.type == BINDER_TYPE_HANDLE, + NULL, &dest_rdata); + if (ret) + return ret; fp->binder = 0; - fp->handle = new_ref->desc; + fp->handle = dest_rdata.desc; fp->cookie = 0; - binder_inc_ref(new_ref, fp->hdr.type == BINDER_TYPE_HANDLE, - NULL); - trace_binder_transaction_ref_to_ref(t, ref, new_ref); + trace_binder_transaction_ref_to_ref(t, node, &src_rdata, + &dest_rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> ref %d desc %d (node %d)\n", - ref->debug_id, ref->desc, new_ref->debug_id, - new_ref->desc, ref->node->debug_id); + src_rdata.debug_id, src_rdata.desc, + 
dest_rdata.debug_id, dest_rdata.desc, + node->debug_id); } return 0; } @@ -2044,6 +2279,8 @@ static int binder_thread_write(struct binder_proc *proc, void __user *end = buffer + size; while (ptr < end && thread->return_error.cmd == BR_OK) { + int ret; + if (get_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); @@ -2059,62 +2296,61 @@ static int binder_thread_write(struct binder_proc *proc, case BC_RELEASE: case BC_DECREFS: { uint32_t target; - struct binder_ref *ref = NULL; const char *debug_string; + bool strong = cmd == BC_ACQUIRE || cmd == BC_RELEASE; + bool increment = cmd == BC_INCREFS || cmd == BC_ACQUIRE; + struct binder_ref_data rdata; if (get_user(target, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); - if (target == 0 && - (cmd == BC_INCREFS || cmd == BC_ACQUIRE)) { + ret = -1; + if (increment && !target) { struct binder_node *ctx_mgr_node; - mutex_lock(&context->context_mgr_node_lock); ctx_mgr_node = context->binder_context_mgr_node; - if (ctx_mgr_node) { - ref = binder_get_ref_for_node(proc, - ctx_mgr_node); - if (ref && ref->desc != target) { - binder_user_error("%d:%d tried to acquire reference to desc 0, got %d instead\n", - proc->pid, thread->pid, - ref->desc); - } - } + if (ctx_mgr_node) + ret = binder_inc_ref_for_node( + proc, ctx_mgr_node, + strong, NULL, &rdata); mutex_unlock(&context->context_mgr_node_lock); } - if (ref == NULL) - ref = binder_get_ref(proc, target, - cmd == BC_ACQUIRE || - cmd == BC_RELEASE); - if (ref == NULL) { - binder_user_error("%d:%d refcount change on invalid ref %d\n", - proc->pid, thread->pid, target); - break; + if (ret) + ret = binder_update_ref_for_handle( + proc, target, increment, strong, + &rdata); + if (!ret && rdata.desc != target) { + binder_user_error("%d:%d tried to acquire reference to desc %d, got %d instead\n", + proc->pid, thread->pid, + target, rdata.desc); } switch (cmd) { case BC_INCREFS: debug_string = "IncRefs"; - binder_inc_ref(ref, 0, NULL); break; case BC_ACQUIRE: debug_string = "Acquire"; - binder_inc_ref(ref, 1, NULL); break; case BC_RELEASE: debug_string = "Release"; - binder_dec_ref(ref, 1); break; case BC_DECREFS: default: debug_string = "DecRefs"; - binder_dec_ref(ref, 0); + break; + } + if (ret) { + binder_user_error("%d:%d %s %d refcount change on invalid ref %d ret %d\n", + proc->pid, thread->pid, debug_string, + strong, target, ret); break; } binder_debug(BINDER_DEBUG_USER_REFS, - "%d:%d %s ref %d desc %d s %d w %d for node %d\n", - proc->pid, thread->pid, debug_string, ref->debug_id, - ref->desc, ref->strong, ref->weak, ref->node->debug_id); + "%d:%d %s ref %d desc %d s %d w %d\n", + proc->pid, thread->pid, debug_string, + rdata.debug_id, rdata.desc, rdata.strong, + rdata.weak); break; } case BC_INCREFS_DONE: @@ -2312,8 +2548,9 @@ static int binder_thread_write(struct binder_proc *proc, cmd == BC_REQUEST_DEATH_NOTIFICATION ? 
"BC_REQUEST_DEATH_NOTIFICATION" : "BC_CLEAR_DEATH_NOTIFICATION", - (u64)cookie, ref->debug_id, ref->desc, - ref->strong, ref->weak, ref->node->debug_id); + (u64)cookie, ref->data.debug_id, + ref->data.desc, ref->data.strong, + ref->data.weak, ref->node->debug_id); if (cmd == BC_REQUEST_DEATH_NOTIFICATION) { if (ref->death) { @@ -3474,7 +3711,8 @@ static void binder_deferred_release(struct binder_proc *proc) ref = rb_entry(n, struct binder_ref, rb_node_desc); outgoing_refs++; - binder_delete_ref(ref); + binder_cleanup_ref(ref); + binder_free_ref(ref); } binder_release_work(&proc->todo); @@ -3676,9 +3914,11 @@ static void print_binder_node(struct seq_file *m, struct binder_node *node) static void print_binder_ref(struct seq_file *m, struct binder_ref *ref) { - seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %p\n", - ref->debug_id, ref->desc, ref->node->proc ? "" : "dead ", - ref->node->debug_id, ref->strong, ref->weak, ref->death); + seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %pK\n", + ref->data.debug_id, ref->data.desc, + ref->node->proc ? "" : "dead ", + ref->node->debug_id, ref->data.strong, + ref->data.weak, ref->death); } static void print_binder_proc(struct seq_file *m, @@ -3845,8 +4085,8 @@ static void print_binder_proc_stats(struct seq_file *m, struct binder_ref *ref = rb_entry(n, struct binder_ref, rb_node_desc); count++; - strong += ref->strong; - weak += ref->weak; + strong += ref->data.strong; + weak += ref->data.weak; } seq_printf(m, " refs: %d s %d w %d\n", count, strong, weak); diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h index 50b0d21f42cf..7967db16ba5a 100644 --- a/drivers/android/binder_trace.h +++ b/drivers/android/binder_trace.h @@ -24,7 +24,7 @@ struct binder_buffer; struct binder_node; struct binder_proc; struct binder_alloc; -struct binder_ref; +struct binder_ref_data; struct binder_thread; struct binder_transaction; @@ -147,8 +147,8 @@ TRACE_EVENT(binder_transaction_received, TRACE_EVENT(binder_transaction_node_to_ref, TP_PROTO(struct binder_transaction *t, struct binder_node *node, - struct binder_ref *ref), - TP_ARGS(t, node, ref), + struct binder_ref_data *rdata), + TP_ARGS(t, node, rdata), TP_STRUCT__entry( __field(int, debug_id) @@ -161,8 +161,8 @@ TRACE_EVENT(binder_transaction_node_to_ref, __entry->debug_id = t->debug_id; __entry->node_debug_id = node->debug_id; __entry->node_ptr = node->ptr; - __entry->ref_debug_id = ref->debug_id; - __entry->ref_desc = ref->desc; + __entry->ref_debug_id = rdata->debug_id; + __entry->ref_desc = rdata->desc; ), TP_printk("transaction=%d node=%d src_ptr=0x%016llx ==> dest_ref=%d dest_desc=%d", __entry->debug_id, __entry->node_debug_id, @@ -171,8 +171,9 @@ TRACE_EVENT(binder_transaction_node_to_ref, ); TRACE_EVENT(binder_transaction_ref_to_node, - TP_PROTO(struct binder_transaction *t, struct binder_ref *ref), - TP_ARGS(t, ref), + TP_PROTO(struct binder_transaction *t, struct binder_node *node, + struct binder_ref_data *rdata), + TP_ARGS(t, node, rdata), TP_STRUCT__entry( __field(int, debug_id) @@ -183,10 +184,10 @@ TRACE_EVENT(binder_transaction_ref_to_node, ), TP_fast_assign( __entry->debug_id = t->debug_id; - __entry->ref_debug_id = ref->debug_id; - __entry->ref_desc = ref->desc; - __entry->node_debug_id = ref->node->debug_id; - __entry->node_ptr = ref->node->ptr; + __entry->ref_debug_id = rdata->debug_id; + __entry->ref_desc = rdata->desc; + __entry->node_debug_id = node->debug_id; + __entry->node_ptr = node->ptr; ), TP_printk("transaction=%d node=%d src_ref=%d src_desc=%d ==> 
dest_ptr=0x%016llx", __entry->debug_id, __entry->node_debug_id, @@ -195,9 +196,10 @@ TRACE_EVENT(binder_transaction_ref_to_node, ); TRACE_EVENT(binder_transaction_ref_to_ref, - TP_PROTO(struct binder_transaction *t, struct binder_ref *src_ref, - struct binder_ref *dest_ref), - TP_ARGS(t, src_ref, dest_ref), + TP_PROTO(struct binder_transaction *t, struct binder_node *node, + struct binder_ref_data *src_ref, + struct binder_ref_data *dest_ref), + TP_ARGS(t, node, src_ref, dest_ref), TP_STRUCT__entry( __field(int, debug_id) @@ -209,7 +211,7 @@ TRACE_EVENT(binder_transaction_ref_to_ref, ), TP_fast_assign( __entry->debug_id = t->debug_id; - __entry->node_debug_id = src_ref->node->debug_id; + __entry->node_debug_id = node->debug_id; __entry->src_ref_debug_id = src_ref->debug_id; __entry->src_ref_desc = src_ref->desc; __entry->dest_ref_debug_id = dest_ref->debug_id; From 96dd75d9917402f0e3a8243c6152b2e5d662c37f Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Tue, 9 May 2017 11:08:05 -0700 Subject: [PATCH 0922/3220] FROMLIST: binder: use node->tmp_refs to ensure node safety (from https://patchwork.kernel.org/patch/9817795/) When obtaining a node via binder_get_node(), binder_get_node_from_ref() or binder_new_node(), increment node->tmp_refs to take a temporary reference on the node to ensure the node persists while being used. binder_put_node() must be called to remove the temporary reference. Change-Id: I962b39d5cd80b2d7e4786bb87236ede7914e2db7 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 124 ++++++++++++++++++++++++++++++++------- 1 file changed, 104 insertions(+), 20 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index b7109a356c46..4c636da38b2b 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -274,6 +274,7 @@ struct binder_node { int internal_strong_refs; int local_weak_refs; int local_strong_refs; + int tmp_refs; binder_uintptr_t ptr; binder_uintptr_t cookie; unsigned has_strong_ref:1; @@ -427,6 +428,7 @@ static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); static void binder_free_thread(struct binder_thread *thread); static void binder_free_proc(struct binder_proc *proc); +static void binder_inc_node_tmpref(struct binder_node *node); static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) { @@ -521,8 +523,15 @@ static struct binder_node *binder_get_node(struct binder_proc *proc, n = n->rb_left; else if (ptr > node->ptr) n = n->rb_right; - else + else { + /* + * take an implicit weak reference + * to ensure node stays alive until + * call to binder_put_node() + */ + binder_inc_node_tmpref(node); return node; + } } return NULL; } @@ -551,6 +560,7 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, if (node == NULL) return NULL; binder_stats_created(BINDER_STAT_NODE); + node->tmp_refs++; rb_link_node(&node->rb_node, parent, p); rb_insert_color(&node->rb_node, &proc->nodes); node->debug_id = atomic_inc_return(&binder_last_id); @@ -616,7 +626,8 @@ static int binder_dec_node(struct binder_node *node, int strong, int internal) } else { if (!internal) node->local_weak_refs--; - if (node->local_weak_refs || !hlist_empty(&node->refs)) + if (node->local_weak_refs || node->tmp_refs || + !hlist_empty(&node->refs)) return 0; } if (node->proc && (node->has_strong_ref || node->has_weak_ref)) { @@ -626,7 +637,7 @@ static int binder_dec_node(struct binder_node *node, int strong, int internal) } } else { if (hlist_empty(&node->refs) && !node->local_strong_refs && - 
!node->local_weak_refs) { + !node->local_weak_refs && !node->tmp_refs) { list_del_init(&node->work.entry); if (node->proc) { rb_erase(&node->rb_node, &node->proc->nodes); @@ -649,6 +660,46 @@ static int binder_dec_node(struct binder_node *node, int strong, int internal) return 0; } +/** + * binder_inc_node_tmpref() - take a temporary reference on node + * @node: node to reference + * + * Take reference on node to prevent the node from being freed + * while referenced only by a local variable + */ +static void binder_inc_node_tmpref(struct binder_node *node) +{ + /* + * No call to binder_inc_node() is needed since we + * don't need to inform userspace of any changes to + * tmp_refs + */ + node->tmp_refs++; +} + +/** + * binder_dec_node_tmpref() - remove a temporary reference on node + * @node: node to reference + * + * Release temporary reference on node taken via binder_inc_node_tmpref() + */ +static void binder_dec_node_tmpref(struct binder_node *node) +{ + node->tmp_refs--; + BUG_ON(node->tmp_refs < 0); + /* + * Call binder_dec_node() to check if all refcounts are 0 + * and cleanup is needed. Calling with strong=0 and internal=1 + * causes no actual reference to be released in binder_dec_node(). + * If that changes, a change is needed here too. + */ + binder_dec_node(node, 0, 1); +} + +static void binder_put_node(struct binder_node *node) +{ + binder_dec_node_tmpref(node); +} static struct binder_ref *binder_get_ref(struct binder_proc *proc, u32 desc, bool need_strong_ref) @@ -889,6 +940,11 @@ static struct binder_node *binder_get_node_from_ref( if (!ref) goto err_no_ref; node = ref->node; + /* + * Take an implicit reference on the node to ensure + * it stays alive until the call to binder_put_node() + */ + binder_inc_node_tmpref(node); if (rdata) *rdata = ref->data; @@ -1349,6 +1405,7 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, node->debug_id, (u64)node->ptr); binder_dec_node(node, hdr->type == BINDER_TYPE_BINDER, 0); + binder_put_node(node); } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { @@ -1442,7 +1499,7 @@ static int binder_translate_binder(struct flat_binder_object *fp, struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; struct binder_ref_data rdata; - int ret; + int ret = 0; node = binder_get_node(proc, fp->binder); if (!node) { @@ -1458,16 +1515,19 @@ static int binder_translate_binder(struct flat_binder_object *fp, proc->pid, thread->pid, (u64)fp->binder, node->debug_id, (u64)fp->cookie, (u64)node->cookie); - return -EINVAL; + ret = -EINVAL; + goto done; + } + if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) { + ret = -EPERM; + goto done; } - if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) - return -EPERM; ret = binder_inc_ref_for_node(target_proc, node, fp->hdr.type == BINDER_TYPE_BINDER, &thread->todo, &rdata); if (ret) - return ret; + goto done; if (fp->hdr.type == BINDER_TYPE_BINDER) fp->hdr.type = BINDER_TYPE_HANDLE; @@ -1482,7 +1542,9 @@ static int binder_translate_binder(struct flat_binder_object *fp, " node %d u%016llx -> ref %d desc %d\n", node->debug_id, (u64)node->ptr, rdata.debug_id, rdata.desc); - return 0; +done: + binder_put_node(node); + return ret; } static int binder_translate_handle(struct flat_binder_object *fp, @@ -1493,6 +1555,7 @@ static int binder_translate_handle(struct flat_binder_object *fp, struct binder_proc *target_proc = t->to_proc; struct binder_node *node; struct binder_ref_data src_rdata; + int ret = 0; node = 
binder_get_node_from_ref(proc, fp->handle, fp->hdr.type == BINDER_TYPE_HANDLE, &src_rdata); @@ -1501,8 +1564,10 @@ static int binder_translate_handle(struct flat_binder_object *fp, proc->pid, thread->pid, fp->handle); return -EINVAL; } - if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) - return -EPERM; + if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) { + ret = -EPERM; + goto done; + } if (node->proc == target_proc) { if (fp->hdr.type == BINDER_TYPE_HANDLE) @@ -1527,7 +1592,7 @@ static int binder_translate_handle(struct flat_binder_object *fp, fp->hdr.type == BINDER_TYPE_HANDLE, NULL, &dest_rdata); if (ret) - return ret; + goto done; fp->binder = 0; fp->handle = dest_rdata.desc; @@ -1540,7 +1605,9 @@ static int binder_translate_handle(struct flat_binder_object *fp, dest_rdata.debug_id, dest_rdata.desc, node->debug_id); } - return 0; +done: + binder_put_node(node); + return ret; } static int binder_translate_fd(int fd, @@ -2382,6 +2449,7 @@ static int binder_thread_write(struct binder_proc *proc, "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", (u64)node_ptr, node->debug_id, (u64)cookie, (u64)node->cookie); + binder_put_node(node); break; } if (cmd == BC_ACQUIRE_DONE) { @@ -2389,6 +2457,7 @@ static int binder_thread_write(struct binder_proc *proc, binder_user_error("%d:%d BC_ACQUIRE_DONE node %d has no pending acquire request\n", proc->pid, thread->pid, node->debug_id); + binder_put_node(node); break; } node->pending_strong_ref = 0; @@ -2397,16 +2466,19 @@ static int binder_thread_write(struct binder_proc *proc, binder_user_error("%d:%d BC_INCREFS_DONE node %d has no pending increfs request\n", proc->pid, thread->pid, node->debug_id); + binder_put_node(node); break; } node->pending_weak_ref = 0; } binder_dec_node(node, cmd == BC_ACQUIRE_DONE, 0); binder_debug(BINDER_DEBUG_USER_REFS, - "%d:%d %s node %d ls %d lw %d\n", + "%d:%d %s node %d ls %d lw %d tr %d\n", proc->pid, thread->pid, cmd == BC_INCREFS_DONE ? 
"BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", - node->debug_id, node->local_strong_refs, node->local_weak_refs); + node->debug_id, node->local_strong_refs, + node->local_weak_refs, node->tmp_refs); + binder_put_node(node); break; } case BC_ATTEMPT_ACQUIRE: @@ -2846,7 +2918,8 @@ retry: strong = node->internal_strong_refs || node->local_strong_refs; weak = !hlist_empty(&node->refs) || - node->local_weak_refs || strong; + node->local_weak_refs || + node->tmp_refs || strong; has_strong_ref = node->has_strong_ref; has_weak_ref = node->has_weak_ref; @@ -3358,6 +3431,7 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) new_node->has_strong_ref = 1; new_node->has_weak_ref = 1; context->binder_context_mgr_node = new_node; + binder_put_node(new_node); out: mutex_unlock(&context->context_mgr_node_lock); return ret; @@ -3616,8 +3690,11 @@ static int binder_node_release(struct binder_node *node, int refs) list_del_init(&node->work.entry); binder_release_work(&node->async_todo); - - if (hlist_empty(&node->refs)) { + /* + * The caller must have taken a temporary ref on the node, + */ + BUG_ON(!node->tmp_refs); + if (hlist_empty(&node->refs) && node->tmp_refs == 1) { kfree(node); binder_stats_deleted(BINDER_STAT_NODE); @@ -3652,6 +3729,7 @@ static int binder_node_release(struct binder_node *node, int refs) binder_debug(BINDER_DEBUG_DEAD_BINDER, "node %d now dead, refs %d, death %d\n", node->debug_id, refs, death); + binder_put_node(node); return refs; } @@ -3701,6 +3779,12 @@ static void binder_deferred_release(struct binder_proc *proc) node = rb_entry(n, struct binder_node, rb_node); nodes++; + /* + * take a temporary ref on the node before + * calling binder_node_release() which will either + * kfree() the node or call binder_put_node() + */ + binder_inc_node_tmpref(node); rb_erase(&node->rb_node, &proc->nodes); incoming_refs = binder_node_release(node, incoming_refs); } @@ -3896,11 +3980,11 @@ static void print_binder_node(struct seq_file *m, struct binder_node *node) hlist_for_each_entry(ref, &node->refs, node_entry) count++; - seq_printf(m, " node %d: u%016llx c%016llx hs %d hw %d ls %d lw %d is %d iw %d", + seq_printf(m, " node %d: u%016llx c%016llx hs %d hw %d ls %d lw %d is %d iw %d tr %d", node->debug_id, (u64)node->ptr, (u64)node->cookie, node->has_strong_ref, node->has_weak_ref, node->local_strong_refs, node->local_weak_refs, - node->internal_strong_refs, count); + node->internal_strong_refs, count, node->tmp_refs); if (count) { seq_puts(m, " proc"); hlist_for_each_entry(ref, &node->refs, node_entry) From b0f59d6d045c31a10cc0175fdb34ba5ae067b5b2 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 29 May 2017 16:44:24 -0700 Subject: [PATCH 0923/3220] FROMLIST: binder: introduce locking helper functions (from https://patchwork.kernel.org/patch/9817791/) There are 3 main spinlocks which must be acquired in this order: 1) proc->outer_lock : protects most fields of binder_proc, binder_thread, and binder_ref structures. binder_proc_lock() and binder_proc_unlock() are used to acq/rel. 2) node->lock : protects most fields of binder_node. binder_node_lock() and binder_node_unlock() are used to acq/rel 3) proc->inner_lock : protects the thread and node lists (proc->threads, proc->nodes) and all todo lists associated with the binder_proc (proc->todo, thread->todo, proc->delivered_death and node->async_todo). binder_inner_proc_lock() and binder_inner_proc_unlock() are used to acq/rel Any lock under procA must never be nested under any lock at the same level or below on procB. 
Functions that require a lock held on entry indicate which lock in the suffix of the function name: foo_olocked() : requires node->outer_lock foo_nlocked() : requires node->lock foo_ilocked() : requires proc->inner_lock foo_iolocked(): requires proc->outer_lock and proc->inner_lock foo_nilocked(): requires node->lock and proc->inner_lock Change-Id: Ied42674486092a0e3bdde64356e45b2494844558 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 238 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 4c636da38b2b..86eec8db3b4f 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -15,6 +15,39 @@ * */ +/* + * Locking overview + * + * There are 3 main spinlocks which must be acquired in the + * order shown: + * + * 1) proc->outer_lock : protects binder_ref + * binder_proc_lock() and binder_proc_unlock() are + * used to acq/rel. + * 2) node->lock : protects most fields of binder_node. + * binder_node_lock() and binder_node_unlock() are + * used to acq/rel + * 3) proc->inner_lock : protects the thread and node lists + * (proc->threads, proc->nodes) and all todo lists associated + * with the binder_proc (proc->todo, thread->todo, + * proc->delivered_death and node->async_todo). + * binder_inner_proc_lock() and binder_inner_proc_unlock() + * are used to acq/rel + * + * Any lock under procA must never be nested under any lock at the same + * level or below on procB. + * + * Functions that require a lock held on entry indicate which lock + * in the suffix of the function name: + * + * foo_olocked() : requires node->outer_lock + * foo_nlocked() : requires node->lock + * foo_ilocked() : requires proc->inner_lock + * foo_oilocked(): requires proc->outer_lock and proc->inner_lock + * foo_nilocked(): requires node->lock and proc->inner_lock + * ... 
+ */ + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include @@ -35,6 +68,7 @@ #include #include #include +#include #ifdef CONFIG_ANDROID_BINDER_IPC_32BIT #define BINDER_IPC_32BIT 1 @@ -106,6 +140,7 @@ enum { BINDER_DEBUG_FREE_BUFFER = 1U << 11, BINDER_DEBUG_INTERNAL_REFS = 1U << 12, BINDER_DEBUG_PRIORITY_CAP = 1U << 13, + BINDER_DEBUG_SPINLOCKS = 1U << 14, }; static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR | BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION; @@ -262,8 +297,43 @@ struct binder_error { uint32_t cmd; }; +/** + * struct binder_node - binder node bookkeeping + * @debug_id: unique ID for debugging + * (invariant after initialized) + * @lock: lock for node fields + * @work: worklist element for node work + * @rb_node: element for proc->nodes tree + * @dead_node: element for binder_dead_nodes list + * (protected by binder_dead_nodes_lock) + * @proc: binder_proc that owns this node + * (invariant after initialized) + * @refs: list of references on this node + * @internal_strong_refs: used to take strong references when + * initiating a transaction + * @local_weak_refs: weak user refs from local process + * @local_strong_refs: strong user refs from local process + * @tmp_refs: temporary kernel refs + * @ptr: userspace pointer for node + * (invariant, no lock needed) + * @cookie: userspace cookie for node + * (invariant, no lock needed) + * @has_strong_ref: userspace notified of strong ref + * @pending_strong_ref: userspace has acked notification of strong ref + * @has_weak_ref: userspace notified of weak ref + * @pending_weak_ref: userspace has acked notification of weak ref + * @has_async_transaction: async transaction to node in progress + * @accept_fds: file descriptor operations supported for node + * (invariant after initialized) + * @min_priority: minimum scheduling priority + * (invariant after initialized) + * @async_todo: list of async work items + * + * Bookkeeping structure for binder nodes. + */ struct binder_node { int debug_id; + spinlock_t lock; struct binder_work work; union { struct rb_node rb_node; @@ -346,6 +416,51 @@ enum binder_deferred_state { BINDER_DEFERRED_RELEASE = 0x04, }; +/** + * struct binder_proc - binder process bookkeeping + * @proc_node: element for binder_procs list + * @threads: rbtree of binder_threads in this proc + * @nodes: rbtree of binder nodes associated with + * this proc ordered by node->ptr + * @refs_by_desc: rbtree of refs ordered by ref->desc + * @refs_by_node: rbtree of refs ordered by ref->node + * @pid PID of group_leader of process + * (invariant after initialized) + * @tsk task_struct for group_leader of process + * (invariant after initialized) + * @files files_struct for process + * (invariant after initialized) + * @deferred_work_node: element for binder_deferred_list + * (protected by binder_deferred_lock) + * @deferred_work: bitmap of deferred work to perform + * (protected by binder_deferred_lock) + * @is_dead: process is dead and awaiting free + * when outstanding transactions are cleaned up + * @todo: list of work for this process + * @wait: wait queue head to wait for proc work + * (invariant after initialized) + * @stats: per-process binder statistics + * (atomics, no lock needed) + * @delivered_death: list of delivered death notification + * @max_threads: cap on number of binder threads + * @requested_threads: number of binder threads requested but not + * yet started. In current implementation, can + * only be 0 or 1. 
+ * @requested_threads_started: number of binder threads started + * @ready_threads: number of threads waiting for proc work + * @tmp_ref: temporary reference to indicate proc is in use + * @default_priority: default scheduler priority + * (invariant after initialized) + * @debugfs_entry: debugfs node + * @alloc: binder allocator bookkeeping + * @context: binder_context for this proc + * (invariant after initialized) + * @inner_lock: can nest under outer_lock and/or node lock + * @outer_lock: no nesting under inner or node lock + * Lock order: 1) outer, 2) node, 3) inner + * + * Bookkeeping structure for binder processes + */ struct binder_proc { struct hlist_node proc_node; struct rb_root threads; @@ -372,6 +487,8 @@ struct binder_proc { struct dentry *debugfs_entry; struct binder_alloc alloc; struct binder_context *context; + spinlock_t inner_lock; + spinlock_t outer_lock; }; enum { @@ -382,6 +499,33 @@ enum { BINDER_LOOPER_STATE_WAITING = 0x10, }; +/** + * struct binder_thread - binder thread bookkeeping + * @proc: binder process for this thread + * (invariant after initialization) + * @rb_node: element for proc->threads rbtree + * @pid: PID for this thread + * (invariant after initialization) + * @looper: bitmap of looping state + * (only accessed by this thread) + * @looper_need_return: looping thread needs to exit driver + * (no lock needed) + * @transaction_stack: stack of in-progress transactions for this thread + * @todo: list of work to do for this thread + * @return_error: transaction errors reported by this thread + * (only accessed by this thread) + * @reply_error: transaction errors reported by target thread + * @wait: wait queue for thread work + * @stats: per-thread statistics + * (atomics, no lock needed) + * @tmp_ref: temporary reference to indicate thread is in use + * (atomic since @proc->inner_lock cannot + * always be acquired) + * @is_dead: thread is dead and awaiting free + * when outstanding transactions are cleaned up + * + * Bookkeeping structure for binder threads. + */ struct binder_thread { struct binder_proc *proc; struct rb_node rb_node; @@ -424,6 +568,97 @@ struct binder_transaction { spinlock_t lock; }; +/** + * binder_proc_lock() - Acquire outer lock for given binder_proc + * @proc: struct binder_proc to acquire + * + * Acquires proc->outer_lock. Used to protect binder_ref + * structures associated with the given proc. + */ +#define binder_proc_lock(proc) _binder_proc_lock(proc, __LINE__) +static void +_binder_proc_lock(struct binder_proc *proc, int line) +{ + binder_debug(BINDER_DEBUG_SPINLOCKS, + "%s: line=%d\n", __func__, line); + spin_lock(&proc->outer_lock); +} + +/** + * binder_proc_unlock() - Release spinlock for given binder_proc + * @proc: struct binder_proc to acquire + * + * Release lock acquired via binder_proc_lock() + */ +#define binder_proc_unlock(_proc) _binder_proc_unlock(_proc, __LINE__) +static void +_binder_proc_unlock(struct binder_proc *proc, int line) +{ + binder_debug(BINDER_DEBUG_SPINLOCKS, + "%s: line=%d\n", __func__, line); + spin_unlock(&proc->outer_lock); +} + +/** + * binder_inner_proc_lock() - Acquire inner lock for given binder_proc + * @proc: struct binder_proc to acquire + * + * Acquires proc->inner_lock.
Used to protect todo lists + */ +#define binder_inner_proc_lock(proc) _binder_inner_proc_lock(proc, __LINE__) +static void +_binder_inner_proc_lock(struct binder_proc *proc, int line) +{ + binder_debug(BINDER_DEBUG_SPINLOCKS, + "%s: line=%d\n", __func__, line); + spin_lock(&proc->inner_lock); +} + +/** + * binder_inner_proc_unlock() - Release inner lock for given binder_proc + * @proc: struct binder_proc to acquire + * + * Release lock acquired via binder_inner_proc_lock() + */ +#define binder_inner_proc_unlock(proc) _binder_inner_proc_unlock(proc, __LINE__) +static void +_binder_inner_proc_unlock(struct binder_proc *proc, int line) +{ + binder_debug(BINDER_DEBUG_SPINLOCKS, + "%s: line=%d\n", __func__, line); + spin_unlock(&proc->inner_lock); +} + +/** + * binder_node_lock() - Acquire spinlock for given binder_node + * @node: struct binder_node to acquire + * + * Acquires node->lock. Used to protect binder_node fields + */ +#define binder_node_lock(node) _binder_node_lock(node, __LINE__) +static void +_binder_node_lock(struct binder_node *node, int line) +{ + binder_debug(BINDER_DEBUG_SPINLOCKS, + "%s: line=%d\n", __func__, line); + spin_lock(&node->lock); +} + +/** + * binder_node_unlock() - Release spinlock for given binder_node + * @node: struct binder_node to acquire + * + * Release lock acquired via binder_node_lock() + */ +#define binder_node_unlock(node) _binder_node_unlock(node, __LINE__) +static void +_binder_node_unlock(struct binder_node *node, int line) +{ + binder_debug(BINDER_DEBUG_SPINLOCKS, + "%s: line=%d\n", __func__, line); + spin_unlock(&node->lock); +} + static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); static void binder_free_thread(struct binder_thread *thread); @@ -568,6 +803,7 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, node->ptr = ptr; node->cookie = cookie; node->work.type = BINDER_WORK_NODE; + spin_lock_init(&node->lock); INIT_LIST_HEAD(&node->work.entry); INIT_LIST_HEAD(&node->async_todo); binder_debug(BINDER_DEBUG_INTERNAL_REFS, @@ -3600,6 +3836,8 @@ static int binder_open(struct inode *nodp, struct file *filp) proc = kzalloc(sizeof(*proc), GFP_KERNEL); if (proc == NULL) return -ENOMEM; + spin_lock_init(&proc->inner_lock); + spin_lock_init(&proc->outer_lock); get_task_struct(current->group_leader); proc->tsk = current->group_leader; INIT_LIST_HEAD(&proc->todo); From f73f378b52e81acdb9654a5bf81661d9ae461e62 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Tue, 21 Mar 2017 13:06:01 -0700 Subject: [PATCH 0924/3220] FROMLIST: binder: use inner lock to sync work dq and node counts (from https://patchwork.kernel.org/patch/9817789/) For correct behavior we need to hold the inner lock when dequeuing and processing node work in binder_thread_read. We now hold the inner lock when we enter the switch statement and release it after processing anything that might be affected by other threads. We also need to hold the inner lock to protect the node weak/strong ref tracking fields as long as node->proc is non-NULL (if it is NULL then we are guaranteed that we don't have any node work queued). This means that other functions that manipulate these fields must hold the inner lock. Refactored these functions to use the inner lock.
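To make the pattern concrete, this is the shape of the refactored
binder_inc_node() wrapper (condensed from the diff below and reformatted
here for readability): the inner lock is only taken while node->proc is
non-NULL, and the _ilocked variant does the actual ref accounting:

	static int binder_inc_node(struct binder_node *node, int strong,
				   int internal, struct list_head *target_list)
	{
		int ret;

		if (node->proc)
			binder_inner_proc_lock(node->proc);
		ret = binder_inc_node_ilocked(node, strong, internal,
					      target_list);
		if (node->proc)
			binder_inner_proc_unlock(node->proc);

		return ret;
	}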
Change-Id: I02c5cfdd3ab6dadea7f07f2a275faf3e27be77ad Test: tested manually Signed-off-by: Todd Kjos --- drivers/android/binder.c | 253 +++++++++++++++++++++++++++++++-------- 1 file changed, 200 insertions(+), 53 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 86eec8db3b4f..0bc8fe034234 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -311,17 +311,36 @@ struct binder_error { * @refs: list of references on this node * @internal_strong_refs: used to take strong references when * initiating a transaction + * (protected by @proc->inner_lock if @proc + * and by @lock) * @local_weak_refs: weak user refs from local process + * (protected by @proc->inner_lock if @proc + * and by @lock) * @local_strong_refs: strong user refs from local process + * (protected by @proc->inner_lock if @proc + * and by @lock) * @tmp_refs: temporary kernel refs + * (protected by @proc->inner_lock while @proc + * is valid, and by binder_dead_nodes_lock + * if @proc is NULL. During inc/dec and node release + * it is also protected by @lock to provide safety + * as the node dies and @proc becomes NULL) * @ptr: userspace pointer for node * (invariant, no lock needed) * @cookie: userspace cookie for node * (invariant, no lock needed) * @has_strong_ref: userspace notified of strong ref + * (protected by @proc->inner_lock if @proc + * and by @lock) * @pending_strong_ref: userspace has acked notification of strong ref + * (protected by @proc->inner_lock if @proc + * and by @lock) * @has_weak_ref: userspace notified of weak ref + * (protected by @proc->inner_lock if @proc + * and by @lock) * @pending_weak_ref: userspace has acked notification of weak ref + * (protected by @proc->inner_lock if @proc + * and by @lock) * @has_async_transaction: async transaction to node in progress * @accept_fds: file descriptor operations supported for node * (invariant after initialized) @@ -347,13 +366,24 @@ struct binder_node { int tmp_refs; binder_uintptr_t ptr; binder_uintptr_t cookie; - unsigned has_strong_ref:1; - unsigned pending_strong_ref:1; - unsigned has_weak_ref:1; - unsigned pending_weak_ref:1; - unsigned has_async_transaction:1; - unsigned accept_fds:1; - unsigned min_priority:8; + struct { + /* + * bitfield elements protected by + * proc inner_lock + */ + u8 has_strong_ref:1; + u8 pending_strong_ref:1; + u8 has_weak_ref:1; + u8 pending_weak_ref:1; + }; + struct { + /* + * invariant after initialization + */ + u8 accept_fds:1; + u8 min_priority; + }; + bool has_async_transaction; struct list_head async_todo; }; @@ -813,9 +843,18 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, return node; } -static int binder_inc_node(struct binder_node *node, int strong, int internal, - struct list_head *target_list) +static void binder_free_node(struct binder_node *node) { + kfree(node); + binder_stats_deleted(BINDER_STAT_NODE); +} + +static int binder_inc_node_ilocked(struct binder_node *node, int strong, + int internal, + struct list_head *target_list) +{ + if (node->proc) + BUG_ON(!spin_is_locked(&node->proc->inner_lock)); if (strong) { if (internal) { if (target_list == NULL && @@ -850,23 +889,43 @@ static int binder_inc_node(struct binder_node *node, int strong, int internal, return 0; } -static int binder_dec_node(struct binder_node *node, int strong, int internal) +static int binder_inc_node(struct binder_node *node, int strong, int internal, + struct list_head *target_list) { + int ret; + + if (node->proc) + binder_inner_proc_lock(node->proc); + ret = 
binder_inc_node_ilocked(node, strong, internal, target_list); + if (node->proc) + binder_inner_proc_unlock(node->proc); + + return ret; +} + +static bool binder_dec_node_ilocked(struct binder_node *node, + int strong, int internal) +{ + struct binder_proc *proc = node->proc; + + if (proc) + BUG_ON(!spin_is_locked(&proc->inner_lock)); if (strong) { if (internal) node->internal_strong_refs--; else node->local_strong_refs--; if (node->local_strong_refs || node->internal_strong_refs) - return 0; + return false; } else { if (!internal) node->local_weak_refs--; if (node->local_weak_refs || node->tmp_refs || !hlist_empty(&node->refs)) - return 0; + return false; } - if (node->proc && (node->has_strong_ref || node->has_weak_ref)) { + + if (proc && (node->has_strong_ref || node->has_weak_ref)) { if (list_empty(&node->work.entry)) { list_add_tail(&node->work.entry, &node->proc->todo); wake_up_interruptible(&node->proc->wait); @@ -875,35 +934,48 @@ static int binder_dec_node(struct binder_node *node, int strong, int internal) if (hlist_empty(&node->refs) && !node->local_strong_refs && !node->local_weak_refs && !node->tmp_refs) { list_del_init(&node->work.entry); - if (node->proc) { + if (proc) { rb_erase(&node->rb_node, &node->proc->nodes); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "refless node %d deleted\n", node->debug_id); } else { spin_lock(&binder_dead_nodes_lock); + /* + * tmp_refs could have changed so + * check it again + */ + if (node->tmp_refs) { + spin_unlock(&binder_dead_nodes_lock); + return false; + } hlist_del(&node->dead_node); spin_unlock(&binder_dead_nodes_lock); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "dead node %d deleted\n", node->debug_id); } - kfree(node); - binder_stats_deleted(BINDER_STAT_NODE); + return true; } } - - return 0; + return false; } -/** - * binder_inc_node_tmpref() - take a temporary reference on node - * @node: node to reference - * - * Take reference on node to prevent the node from being freed - * while referenced only by a local variable - */ -static void binder_inc_node_tmpref(struct binder_node *node) +static void binder_dec_node(struct binder_node *node, int strong, int internal) +{ + bool free_node; + + if (node->proc) + binder_inner_proc_lock(node->proc); + free_node = binder_dec_node_ilocked(node, strong, internal); + if (node->proc) + binder_inner_proc_unlock(node->proc); + + if (free_node) + binder_free_node(node); +} + +static void binder_inc_node_tmpref_ilocked(struct binder_node *node) { /* * No call to binder_inc_node() is needed since we @@ -913,6 +985,32 @@ static void binder_inc_node_tmpref(struct binder_node *node) node->tmp_refs++; } +/** + * binder_inc_node_tmpref() - take a temporary reference on node + * @node: node to reference + * + * Take reference on node to prevent the node from being freed + * while referenced only by a local variable. The inner lock is + * needed to serialize with the node work on the queue (which + * isn't needed after the node is dead). 
If the node is dead + * (node->proc is NULL), use binder_dead_nodes_lock to protect + * node->tmp_refs against dead-node-only cases where the node + * lock cannot be acquired (eg traversing the dead node list to + * print nodes) + */ +static void binder_inc_node_tmpref(struct binder_node *node) +{ + if (node->proc) + binder_inner_proc_lock(node->proc); + else + spin_lock(&binder_dead_nodes_lock); + binder_inc_node_tmpref_ilocked(node); + if (node->proc) + binder_inner_proc_unlock(node->proc); + else + spin_unlock(&binder_dead_nodes_lock); +} + /** * binder_dec_node_tmpref() - remove a temporary reference on node * @node: node to reference @@ -921,15 +1019,27 @@ static void binder_inc_node_tmpref(struct binder_node *node) */ static void binder_dec_node_tmpref(struct binder_node *node) { + bool free_node; + + if (node->proc) + binder_inner_proc_lock(node->proc); + else + spin_lock(&binder_dead_nodes_lock); node->tmp_refs--; BUG_ON(node->tmp_refs < 0); + if (!node->proc) + spin_unlock(&binder_dead_nodes_lock); /* * Call binder_dec_node() to check if all refcounts are 0 * and cleanup is needed. Calling with strong=0 and internal=1 * causes no actual reference to be released in binder_dec_node(). * If that changes, a change is needed here too. */ - binder_dec_node(node, 0, 1); + free_node = binder_dec_node_ilocked(node, 0, 1); + if (node->proc) + binder_inner_proc_unlock(node->proc); + if (free_node) + binder_free_node(node); } static void binder_put_node(struct binder_node *node) @@ -1042,6 +1152,9 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, static void binder_cleanup_ref(struct binder_ref *ref) { + bool delete_node = false; + struct binder_proc *node_proc = ref->node->proc; + binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d delete ref %d desc %d for node %d\n", ref->proc->pid, ref->data.debug_id, ref->data.desc, @@ -1050,11 +1163,26 @@ static void binder_cleanup_ref(struct binder_ref *ref) rb_erase(&ref->rb_node_desc, &ref->proc->refs_by_desc); rb_erase(&ref->rb_node_node, &ref->proc->refs_by_node); + if (node_proc) + binder_inner_proc_lock(node_proc); if (ref->data.strong) - binder_dec_node(ref->node, 1, 1); + binder_dec_node_ilocked(ref->node, 1, 1); hlist_del(&ref->node_entry); - binder_dec_node(ref->node, 0, 1); + delete_node = binder_dec_node_ilocked(ref->node, 0, 1); + if (node_proc) + binder_inner_proc_unlock(node_proc); + /* + * Clear ref->node unless we want the caller to free the node + */ + if (!delete_node) { + /* + * The caller uses ref->node to determine + * whether the node needs to be freed. Clear + * it since the node is still alive. + */ + ref->node = NULL; + } if (ref->death) { binder_debug(BINDER_DEBUG_DEAD_BINDER, @@ -1123,13 +1251,8 @@ static bool binder_dec_ref(struct binder_ref *ref, int strong) return false; } ref->data.strong--; - if (ref->data.strong == 0) { - int ret; - - ret = binder_dec_node(ref->node, strong, 1); - if (ret) - return false; - } + if (ref->data.strong == 0) + binder_dec_node(ref->node, strong, 1); } else { if (ref->data.weak == 0) { binder_user_error("%d invalid dec weak, ref %d desc %d s %d w %d\n", @@ -1194,10 +1317,13 @@ err_no_ref: * binder_free_ref() - free the binder_ref * @ref: ref to free * - * Free the binder_ref and the binder_ref_death indicated by ref->death. + * Free the binder_ref. Free the binder_node indicated by ref->node + * (if non-NULL) and the binder_ref_death indicated by ref->death. 
*/ static void binder_free_ref(struct binder_ref *ref) { + if (ref->node) + binder_free_node(ref->node); kfree(ref->death); kfree(ref); } @@ -2688,11 +2814,13 @@ static int binder_thread_write(struct binder_proc *proc, binder_put_node(node); break; } + binder_inner_proc_lock(proc); if (cmd == BC_ACQUIRE_DONE) { if (node->pending_strong_ref == 0) { binder_user_error("%d:%d BC_ACQUIRE_DONE node %d has no pending acquire request\n", proc->pid, thread->pid, node->debug_id); + binder_inner_proc_unlock(proc); binder_put_node(node); break; } @@ -2702,11 +2830,13 @@ static int binder_thread_write(struct binder_proc *proc, binder_user_error("%d:%d BC_INCREFS_DONE node %d has no pending increfs request\n", proc->pid, thread->pid, node->debug_id); + binder_inner_proc_unlock(proc); binder_put_node(node); break; } node->pending_weak_ref = 0; } + binder_inner_proc_unlock(proc); binder_dec_node(node, cmd == BC_ACQUIRE_DONE, 0); binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s node %d ls %d lw %d tr %d\n", @@ -3092,6 +3222,7 @@ retry: struct binder_transaction *t = NULL; struct binder_thread *t_from; + binder_inner_proc_lock(proc); if (!list_empty(&thread->todo)) { w = list_first_entry(&thread->todo, struct binder_work, entry); @@ -3105,11 +3236,15 @@ retry: break; } - if (end - ptr < sizeof(tr) + 4) + if (end - ptr < sizeof(tr) + 4) { + binder_inner_proc_unlock(proc); break; + } + list_del_init(&w->entry); switch (w->type) { case BINDER_WORK_TRANSACTION: { + binder_inner_proc_unlock(proc); t = container_of(w, struct binder_transaction, work); } break; case BINDER_WORK_RETURN_ERROR: { @@ -3117,15 +3252,16 @@ retry: w, struct binder_error, work); WARN_ON(e->cmd == BR_OK); + binder_inner_proc_unlock(proc); if (put_user(e->cmd, (uint32_t __user *)ptr)) return -EFAULT; e->cmd = BR_OK; ptr += sizeof(uint32_t); binder_stat_br(proc, thread, cmd); - list_del(&w->entry); } break; case BINDER_WORK_TRANSACTION_COMPLETE: { + binder_inner_proc_unlock(proc); cmd = BR_TRANSACTION_COMPLETE; if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; @@ -3135,8 +3271,6 @@ retry: binder_debug(BINDER_DEBUG_TRANSACTION_COMPLETE, "%d:%d BR_TRANSACTION_COMPLETE\n", proc->pid, thread->pid); - - list_del(&w->entry); kfree(w); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); } break; @@ -3173,8 +3307,6 @@ retry: node->has_strong_ref = 0; if (!weak && has_weak_ref) node->has_weak_ref = 0; - list_del(&w->entry); - if (!weak && !strong) { binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d:%d node %d u%016llx c%016llx deleted\n", @@ -3183,9 +3315,11 @@ retry: (u64)node_ptr, (u64)node_cookie); rb_erase(&node->rb_node, &proc->nodes); - kfree(node); - binder_stats_deleted(BINDER_STAT_NODE); - } + binder_inner_proc_unlock(proc); + binder_free_node(node); + } else + binder_inner_proc_unlock(proc); + if (weak && !has_weak_ref) ret = binder_put_node_cmd( proc, thread, &ptr, node_ptr, @@ -3227,6 +3361,13 @@ retry: cmd = BR_CLEAR_DEATH_NOTIFICATION_DONE; else cmd = BR_DEAD_BINDER; + /* + * TODO: there is a race condition between + * death notification requests and delivery + * of the notifications. This will be handled + * in a later patch. 
+ */ + binder_inner_proc_unlock(proc); if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); @@ -3244,11 +3385,14 @@ retry: (u64)death->cookie); if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) { - list_del(&w->entry); kfree(death); binder_stats_deleted(BINDER_STAT_DEATH); - } else - list_move(&w->entry, &proc->delivered_death); + } else { + binder_inner_proc_lock(proc); + list_add_tail(&w->entry, + &proc->delivered_death); + binder_inner_proc_unlock(proc); + } if (cmd == BR_DEAD_BINDER) goto done; /* DEAD_BINDER notifications can cause transactions */ } break; @@ -3326,7 +3470,6 @@ retry: if (t_from) binder_thread_dec_tmpref(t_from); - list_del(&t->work.entry); t->buffer->allow_user_free = 1; if (cmd == BR_TRANSACTION && !(t->flags & TF_ONE_WAY)) { t->to_parent = thread->transaction_stack; @@ -3925,16 +4068,19 @@ static int binder_node_release(struct binder_node *node, int refs) { struct binder_ref *ref; int death = 0; + struct binder_proc *proc = node->proc; - list_del_init(&node->work.entry); binder_release_work(&node->async_todo); + + binder_inner_proc_lock(proc); + list_del_init(&node->work.entry); /* * The caller must have taken a temporary ref on the node, */ BUG_ON(!node->tmp_refs); if (hlist_empty(&node->refs) && node->tmp_refs == 1) { - kfree(node); - binder_stats_deleted(BINDER_STAT_NODE); + binder_inner_proc_unlock(proc); + binder_free_node(node); return refs; } @@ -3942,6 +4088,7 @@ static int binder_node_release(struct binder_node *node, int refs) node->proc = NULL; node->local_strong_refs = 0; node->local_weak_refs = 0; + binder_inner_proc_unlock(proc); spin_lock(&binder_dead_nodes_lock); hlist_add_head(&node->dead_node, &binder_dead_nodes); From 57628830c30714d7231f7d5757b968184c8a7e3c Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 20 Oct 2016 10:33:00 -0700 Subject: [PATCH 0925/3220] FROMLIST: binder: add spinlocks to protect todo lists (from https://patchwork.kernel.org/patch/9817769/) The todo lists in the proc, thread, and node structures are accessed by other procs/threads to place work items on the queue. The todo lists are protected by the new proc->inner_lock. No locks should ever be nested under these locks. As the name suggests, an outer lock will be introduced in a later patch. Change-Id: I7720bacf5ebae4af177e22fcab0900d54c94c11a Signed-off-by: Todd Kjos --- drivers/android/binder.c | 355 +++++++++++++++++++++++++++++---------- 1 file changed, 269 insertions(+), 86 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 0bc8fe034234..36074ce8d1b1 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -279,8 +279,16 @@ struct binder_device { struct binder_context context; }; +/** + * struct binder_work - work enqueued on a worklist + * @entry: node enqueued on list + * @type: type of work to be performed + * + * There are separate work lists for proc, thread, and node (async). 
+ */ struct binder_work { struct list_head entry; + enum { BINDER_WORK_TRANSACTION = 1, BINDER_WORK_TRANSACTION_COMPLETE, @@ -303,6 +311,7 @@ struct binder_error { * (invariant after initialized) * @lock: lock for node fields * @work: worklist element for node work + * (protected by @proc->inner_lock) * @rb_node: element for proc->nodes tree * @dead_node: element for binder_dead_nodes list * (protected by binder_dead_nodes_lock) @@ -347,6 +356,7 @@ struct binder_error { * @min_priority: minimum scheduling priority * (invariant after initialized) * @async_todo: list of async work items + * (protected by @proc->inner_lock) * * Bookkeeping structure for binder nodes. */ @@ -388,6 +398,11 @@ struct binder_node { }; struct binder_ref_death { + /** + * @work: worklist element for death notifications + * (protected by inner_lock of the proc that + * this ref belongs to) + */ struct binder_work work; binder_uintptr_t cookie; }; @@ -467,11 +482,13 @@ enum binder_deferred_state { * @is_dead: process is dead and awaiting free * when outstanding transactions are cleaned up * @todo: list of work for this process + * (protected by @inner_lock) * @wait: wait queue head to wait for proc work * (invariant after initialized) * @stats: per-process binder statistics * (atomics, no lock needed) * @delivered_death: list of delivered death notification + * (protected by @inner_lock) * @max_threads: cap on number of binder threads * @requested_threads: number of binder threads requested but not * yet started. In current implementation, can @@ -542,6 +559,7 @@ enum { * (no lock needed) * @transaction_stack: stack of in-progress transactions for this thread * @todo: list of work to do for this thread + * (protected by @proc->inner_lock) * @return_error: transaction errors reported by this thread * (only accessed by this thread) * @reply_error: transaction errors reported by target thread @@ -689,6 +707,111 @@ _binder_node_unlock(struct binder_node *node, int line) spin_unlock(&node->lock); } +static bool binder_worklist_empty_ilocked(struct list_head *list) +{ + return list_empty(list); +} + +/** + * binder_worklist_empty() - Check if no items on the work list + * @proc: binder_proc associated with list + * @list: list to check + * + * Return: true if there are no items on list, else false + */ +static bool binder_worklist_empty(struct binder_proc *proc, + struct list_head *list) +{ + bool ret; + + binder_inner_proc_lock(proc); + ret = binder_worklist_empty_ilocked(list); + binder_inner_proc_unlock(proc); + return ret; +} + +static void +binder_enqueue_work_ilocked(struct binder_work *work, + struct list_head *target_list) +{ + BUG_ON(target_list == NULL); + BUG_ON(work->entry.next && !list_empty(&work->entry)); + list_add_tail(&work->entry, target_list); +} + +/** + * binder_enqueue_work() - Add an item to the work list + * @proc: binder_proc associated with list + * @work: struct binder_work to add to list + * @target_list: list to add work to + * + * Adds the work to the specified list. Asserts that work + * is not already on a list. 
+ */ +static void +binder_enqueue_work(struct binder_proc *proc, + struct binder_work *work, + struct list_head *target_list) +{ + binder_inner_proc_lock(proc); + binder_enqueue_work_ilocked(work, target_list); + binder_inner_proc_unlock(proc); +} + +static void +binder_dequeue_work_ilocked(struct binder_work *work) +{ + list_del_init(&work->entry); +} + +/** + * binder_dequeue_work() - Removes an item from the work list + * @proc: binder_proc associated with list + * @work: struct binder_work to remove from list + * + * Removes the specified work item from whatever list it is on. + * Can safely be called if work is not on any list. + */ +static void +binder_dequeue_work(struct binder_proc *proc, struct binder_work *work) +{ + binder_inner_proc_lock(proc); + binder_dequeue_work_ilocked(work); + binder_inner_proc_unlock(proc); +} + +static struct binder_work *binder_dequeue_work_head_ilocked( + struct list_head *list) +{ + struct binder_work *w; + + w = list_first_entry_or_null(list, struct binder_work, entry); + if (w) + list_del_init(&w->entry); + return w; +} + +/** + * binder_dequeue_work_head() - Dequeues the item at head of list + * @proc: binder_proc associated with list + * @list: list to dequeue head + * + * Removes the head of the list if there are items on the list + * + * Return: pointer dequeued binder_work, NULL if list was empty + */ +static struct binder_work *binder_dequeue_work_head( + struct binder_proc *proc, + struct list_head *list) +{ + struct binder_work *w; + + binder_inner_proc_lock(proc); + w = binder_dequeue_work_head_ilocked(list); + binder_inner_proc_unlock(proc); + return w; +} + static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); static void binder_free_thread(struct binder_thread *thread); @@ -871,8 +994,8 @@ static int binder_inc_node_ilocked(struct binder_node *node, int strong, } else node->local_strong_refs++; if (!node->has_strong_ref && target_list) { - list_del_init(&node->work.entry); - list_add_tail(&node->work.entry, target_list); + binder_dequeue_work_ilocked(&node->work); + binder_enqueue_work_ilocked(&node->work, target_list); } } else { if (!internal) @@ -883,7 +1006,7 @@ static int binder_inc_node_ilocked(struct binder_node *node, int strong, node->debug_id); return -EINVAL; } - list_add_tail(&node->work.entry, target_list); + binder_enqueue_work_ilocked(&node->work, target_list); } } return 0; @@ -927,19 +1050,20 @@ static bool binder_dec_node_ilocked(struct binder_node *node, if (proc && (node->has_strong_ref || node->has_weak_ref)) { if (list_empty(&node->work.entry)) { - list_add_tail(&node->work.entry, &node->proc->todo); + binder_enqueue_work_ilocked(&node->work, &proc->todo); wake_up_interruptible(&node->proc->wait); } } else { if (hlist_empty(&node->refs) && !node->local_strong_refs && !node->local_weak_refs && !node->tmp_refs) { - list_del_init(&node->work.entry); if (proc) { - rb_erase(&node->rb_node, &node->proc->nodes); + binder_dequeue_work_ilocked(&node->work); + rb_erase(&node->rb_node, &proc->nodes); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "refless node %d deleted\n", node->debug_id); } else { + BUG_ON(!list_empty(&node->work.entry)); spin_lock(&binder_dead_nodes_lock); /* * tmp_refs could have changed so @@ -1189,7 +1313,7 @@ static void binder_cleanup_ref(struct binder_ref *ref) "%d delete ref %d desc %d has death notification\n", ref->proc->pid, ref->data.debug_id, ref->data.desc); - list_del(&ref->death->work.entry); + binder_dequeue_work(ref->proc, &ref->death->work); 
binder_stats_deleted(BINDER_STAT_DEATH); } binder_stats_deleted(BINDER_STAT_REF); @@ -1540,8 +1664,9 @@ static void binder_send_failed_reply(struct binder_transaction *t, binder_pop_transaction(target_thread, t); if (target_thread->reply_error.cmd == BR_OK) { target_thread->reply_error.cmd = error_code; - list_add_tail( - &target_thread->reply_error.work.entry, + binder_enqueue_work( + target_thread->proc, + &target_thread->reply_error.work, &target_thread->todo); wake_up_interruptible(&target_thread->wait); } else { @@ -2579,7 +2704,7 @@ static void binder_transaction(struct binder_proc *proc, } } tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; - list_add_tail(&tcomplete->entry, &thread->todo); + binder_enqueue_work(proc, tcomplete, &thread->todo); if (reply) { if (target_thread->is_dead) @@ -2610,7 +2735,7 @@ static void binder_transaction(struct binder_proc *proc, goto err_dead_proc_or_thread; } t->work.type = BINDER_WORK_TRANSACTION; - list_add_tail(&t->work.entry, target_list); + binder_enqueue_work(target_proc, &t->work, target_list); if (target_wait) { if (reply || !(tr->flags & TF_ONE_WAY)) wake_up_interruptible_sync(target_wait); @@ -2686,13 +2811,15 @@ err_no_context_mgr_node: BUG_ON(thread->return_error.cmd != BR_OK); if (in_reply_to) { thread->return_error.cmd = BR_TRANSACTION_COMPLETE; - list_add_tail(&thread->return_error.work.entry, - &thread->todo); + binder_enqueue_work(thread->proc, + &thread->return_error.work, + &thread->todo); binder_send_failed_reply(in_reply_to, return_error); } else { thread->return_error.cmd = return_error; - list_add_tail(&thread->return_error.work.entry, - &thread->todo); + binder_enqueue_work(thread->proc, + &thread->return_error.work, + &thread->todo); } } @@ -2885,11 +3012,21 @@ static int binder_thread_write(struct binder_proc *proc, buffer->transaction = NULL; } if (buffer->async_transaction && buffer->target_node) { - BUG_ON(!buffer->target_node->has_async_transaction); - if (list_empty(&buffer->target_node->async_todo)) - buffer->target_node->has_async_transaction = 0; + struct binder_node *buf_node; + struct binder_work *w; + + buf_node = buffer->target_node; + BUG_ON(!buf_node->has_async_transaction); + BUG_ON(buf_node->proc != proc); + binder_inner_proc_lock(proc); + w = binder_dequeue_work_head_ilocked( + &buf_node->async_todo); + if (!w) + buf_node->has_async_transaction = 0; else - list_move_tail(buffer->target_node->async_todo.next, &thread->todo); + binder_enqueue_work_ilocked( + w, &thread->todo); + binder_inner_proc_unlock(proc); } trace_binder_transaction_buffer_release(buffer); binder_transaction_buffer_release(proc, buffer, NULL); @@ -3001,9 +3138,10 @@ static int binder_thread_write(struct binder_proc *proc, WARN_ON(thread->return_error.cmd != BR_OK); thread->return_error.cmd = BR_ERROR; - list_add_tail( - &thread->return_error.work.entry, - &thread->todo); + binder_enqueue_work( + thread->proc, + &thread->return_error.work, + &thread->todo); binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n", proc->pid, thread->pid); @@ -3015,11 +3153,20 @@ static int binder_thread_write(struct binder_proc *proc, ref->death = death; if (ref->node->proc == NULL) { ref->death->work.type = BINDER_WORK_DEAD_BINDER; - if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) { - list_add_tail(&ref->death->work.entry, &thread->todo); - } else { - list_add_tail(&ref->death->work.entry, &proc->todo); - wake_up_interruptible(&proc->wait); + if (thread->looper & + 
(BINDER_LOOPER_STATE_REGISTERED | + BINDER_LOOPER_STATE_ENTERED)) + binder_enqueue_work( + proc, + &ref->death->work, + &thread->todo); + else { + binder_enqueue_work( + proc, + &ref->death->work, + &proc->todo); + wake_up_interruptible( + &proc->wait); } } } else { @@ -3037,18 +3184,27 @@ static int binder_thread_write(struct binder_proc *proc, break; } ref->death = NULL; + binder_inner_proc_lock(proc); if (list_empty(&death->work.entry)) { death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION; - if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) { - list_add_tail(&death->work.entry, &thread->todo); - } else { - list_add_tail(&death->work.entry, &proc->todo); - wake_up_interruptible(&proc->wait); + if (thread->looper & + (BINDER_LOOPER_STATE_REGISTERED | + BINDER_LOOPER_STATE_ENTERED)) + binder_enqueue_work_ilocked( + &death->work, + &thread->todo); + else { + binder_enqueue_work_ilocked( + &death->work, + &proc->todo); + wake_up_interruptible( + &proc->wait); } } else { BUG_ON(death->work.type != BINDER_WORK_DEAD_BINDER); death->work.type = BINDER_WORK_DEAD_BINDER_AND_CLEAR; } + binder_inner_proc_unlock(proc); } } break; case BC_DEAD_BINDER_DONE: { @@ -3060,8 +3216,13 @@ static int binder_thread_write(struct binder_proc *proc, return -EFAULT; ptr += sizeof(cookie); - list_for_each_entry(w, &proc->delivered_death, entry) { - struct binder_ref_death *tmp_death = container_of(w, struct binder_ref_death, work); + binder_inner_proc_lock(proc); + list_for_each_entry(w, &proc->delivered_death, + entry) { + struct binder_ref_death *tmp_death = + container_of(w, + struct binder_ref_death, + work); if (tmp_death->cookie == cookie) { death = tmp_death; @@ -3075,19 +3236,25 @@ static int binder_thread_write(struct binder_proc *proc, if (death == NULL) { binder_user_error("%d:%d BC_DEAD_BINDER_DONE %016llx not found\n", proc->pid, thread->pid, (u64)cookie); + binder_inner_proc_unlock(proc); break; } - - list_del_init(&death->work.entry); + binder_dequeue_work_ilocked(&death->work); if (death->work.type == BINDER_WORK_DEAD_BINDER_AND_CLEAR) { death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION; - if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) { - list_add_tail(&death->work.entry, &thread->todo); - } else { - list_add_tail(&death->work.entry, &proc->todo); + if (thread->looper & + (BINDER_LOOPER_STATE_REGISTERED | + BINDER_LOOPER_STATE_ENTERED)) + binder_enqueue_work_ilocked( + &death->work, &thread->todo); + else { + binder_enqueue_work_ilocked( + &death->work, + &proc->todo); wake_up_interruptible(&proc->wait); } } + binder_inner_proc_unlock(proc); } break; default: @@ -3114,12 +3281,14 @@ static void binder_stat_br(struct binder_proc *proc, static int binder_has_proc_work(struct binder_proc *proc, struct binder_thread *thread) { - return !list_empty(&proc->todo) || thread->looper_need_return; + return !binder_worklist_empty(proc, &proc->todo) || + thread->looper_need_return; } static int binder_has_thread_work(struct binder_thread *thread) { - return !list_empty(&thread->todo) || thread->looper_need_return; + return !binder_worklist_empty(thread->proc, &thread->todo) || + thread->looper_need_return; } static int binder_put_node_cmd(struct binder_proc *proc, @@ -3173,7 +3342,7 @@ static int binder_thread_read(struct binder_proc *proc, retry: wait_for_proc_work = thread->transaction_stack == NULL && - list_empty(&thread->todo); + binder_worklist_empty(proc, &thread->todo); thread->looper |= BINDER_LOOPER_STATE_WAITING; if 
(wait_for_proc_work) @@ -3183,7 +3352,7 @@ retry: trace_binder_wait_for_work(wait_for_proc_work, !!thread->transaction_stack, - !list_empty(&thread->todo)); + !binder_worklist_empty(proc, &thread->todo)); if (wait_for_proc_work) { if (!(thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED))) { @@ -3218,18 +3387,20 @@ retry: while (1) { uint32_t cmd; struct binder_transaction_data tr; - struct binder_work *w; + struct binder_work *w = NULL; + struct list_head *list = NULL; struct binder_transaction *t = NULL; struct binder_thread *t_from; binder_inner_proc_lock(proc); - if (!list_empty(&thread->todo)) { - w = list_first_entry(&thread->todo, struct binder_work, - entry); - } else if (!list_empty(&proc->todo) && wait_for_proc_work) { - w = list_first_entry(&proc->todo, struct binder_work, - entry); - } else { + if (!binder_worklist_empty_ilocked(&thread->todo)) + list = &thread->todo; + else if (!binder_worklist_empty_ilocked(&proc->todo) && + wait_for_proc_work) + list = &proc->todo; + else { + binder_inner_proc_unlock(proc); + /* no data added */ if (ptr - buffer == 4 && !thread->looper_need_return) goto retry; @@ -3240,7 +3411,7 @@ retry: binder_inner_proc_unlock(proc); break; } - list_del_init(&w->entry); + w = binder_dequeue_work_head_ilocked(list); switch (w->type) { case BINDER_WORK_TRANSACTION: { @@ -3389,8 +3560,8 @@ retry: binder_stats_deleted(BINDER_STAT_DEATH); } else { binder_inner_proc_lock(proc); - list_add_tail(&w->entry, - &proc->delivered_death); + binder_enqueue_work_ilocked( + w, &proc->delivered_death); binder_inner_proc_unlock(proc); } if (cmd == BR_DEAD_BINDER) @@ -3500,13 +3671,16 @@ done: return 0; } -static void binder_release_work(struct list_head *list) +static void binder_release_work(struct binder_proc *proc, + struct list_head *list) { struct binder_work *w; - while (!list_empty(list)) { - w = list_first_entry(list, struct binder_work, entry); - list_del_init(&w->entry); + while (1) { + w = binder_dequeue_work_head(proc, list); + if (!w) + return; + switch (w->type) { case BINDER_WORK_TRANSACTION: { struct binder_transaction *t; @@ -3670,7 +3844,7 @@ static int binder_thread_release(struct binder_proc *proc, if (send_reply) binder_send_failed_reply(send_reply, BR_DEAD_REPLY); - binder_release_work(&thread->todo); + binder_release_work(proc, &thread->todo); binder_thread_dec_tmpref(thread); return active_transactions; } @@ -3687,7 +3861,7 @@ static unsigned int binder_poll(struct file *filp, thread = binder_get_thread(proc); wait_for_proc_work = thread->transaction_stack == NULL && - list_empty(&thread->todo); + binder_worklist_empty(proc, &thread->todo); binder_unlock(__func__); @@ -3750,7 +3924,7 @@ static int binder_ioctl_write_read(struct file *filp, &bwr.read_consumed, filp->f_flags & O_NONBLOCK); trace_binder_read_done(ret); - if (!list_empty(&proc->todo)) + if (!binder_worklist_empty(proc, &proc->todo)) wake_up_interruptible(&proc->wait); if (ret < 0) { if (copy_to_user(ubuf, &bwr, sizeof(bwr))) @@ -4070,10 +4244,10 @@ static int binder_node_release(struct binder_node *node, int refs) int death = 0; struct binder_proc *proc = node->proc; - binder_release_work(&node->async_todo); + binder_release_work(proc, &node->async_todo); binder_inner_proc_lock(proc); - list_del_init(&node->work.entry); + binder_dequeue_work_ilocked(&node->work); /* * The caller must have taken a temporary ref on the node, */ @@ -4102,13 +4276,15 @@ static int binder_node_release(struct binder_node *node, int refs) death++; + binder_inner_proc_lock(ref->proc); 
if (list_empty(&ref->death->work.entry)) { ref->death->work.type = BINDER_WORK_DEAD_BINDER; - list_add_tail(&ref->death->work.entry, - &ref->proc->todo); + binder_enqueue_work_ilocked(&ref->death->work, + &ref->proc->todo); wake_up_interruptible(&ref->proc->wait); } else BUG(); + binder_inner_proc_unlock(ref->proc); } binder_debug(BINDER_DEBUG_DEAD_BINDER, @@ -4184,8 +4360,8 @@ static void binder_deferred_release(struct binder_proc *proc) binder_free_ref(ref); } - binder_release_work(&proc->todo); - binder_release_work(&proc->delivered_death); + binder_release_work(proc, &proc->todo); + binder_release_work(proc, &proc->delivered_death); binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d\n", @@ -4276,9 +4452,9 @@ static void print_binder_transaction(struct seq_file *m, const char *prefix, t->buffer->data); } -static void print_binder_work(struct seq_file *m, const char *prefix, - const char *transaction_prefix, - struct binder_work *w) +static void print_binder_work_ilocked(struct seq_file *m, const char *prefix, + const char *transaction_prefix, + struct binder_work *w) { struct binder_node *node; struct binder_transaction *t; @@ -4319,15 +4495,16 @@ static void print_binder_work(struct seq_file *m, const char *prefix, } } -static void print_binder_thread(struct seq_file *m, - struct binder_thread *thread, - int print_always) +static void print_binder_thread_ilocked(struct seq_file *m, + struct binder_thread *thread, + int print_always) { struct binder_transaction *t; struct binder_work *w; size_t start_pos = m->count; size_t header_pos; + WARN_ON(!spin_is_locked(&thread->proc->inner_lock)); seq_printf(m, " thread %d: l %02x need_return %d tr %d\n", thread->pid, thread->looper, thread->looper_need_return, @@ -4349,7 +4526,8 @@ static void print_binder_thread(struct seq_file *m, } } list_for_each_entry(w, &thread->todo, entry) { - print_binder_work(m, " ", " pending transaction", w); + print_binder_work_ilocked(m, " ", + " pending transaction", w); } if (!print_always && m->count == header_pos) m->count = start_pos; @@ -4376,9 +4554,13 @@ static void print_binder_node(struct seq_file *m, struct binder_node *node) seq_printf(m, " %d", ref->proc->pid); } seq_puts(m, "\n"); - list_for_each_entry(w, &node->async_todo, entry) - print_binder_work(m, " ", - " pending async transaction", w); + if (node->proc) { + binder_inner_proc_lock(node->proc); + list_for_each_entry(w, &node->async_todo, entry) + print_binder_work_ilocked(m, " ", + " pending async transaction", w); + binder_inner_proc_unlock(node->proc); + } } static void print_binder_ref(struct seq_file *m, struct binder_ref *ref) @@ -4402,9 +4584,11 @@ static void print_binder_proc(struct seq_file *m, seq_printf(m, "context %s\n", proc->context->name); header_pos = m->count; + binder_inner_proc_lock(proc); for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) - print_binder_thread(m, rb_entry(n, struct binder_thread, + print_binder_thread_ilocked(m, rb_entry(n, struct binder_thread, rb_node), print_all); + binder_inner_proc_unlock(proc); for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) { struct binder_node *node = rb_entry(n, struct binder_node, rb_node); @@ -4419,12 +4603,14 @@ static void print_binder_proc(struct seq_file *m, rb_node_desc)); } binder_alloc_print_allocated(m, &proc->alloc); + binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->todo, entry) - print_binder_work(m, " ", " pending transaction", w); + print_binder_work_ilocked(m, " ", " pending 
transaction", w); list_for_each_entry(w, &proc->delivered_death, entry) { seq_puts(m, " has delivered dead binder\n"); break; } + binder_inner_proc_unlock(proc); if (!print_all && m->count == header_pos) m->count = start_pos; } @@ -4563,15 +4749,12 @@ static void print_binder_proc_stats(struct seq_file *m, seq_printf(m, " buffers: %d\n", count); count = 0; + binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->todo, entry) { - switch (w->type) { - case BINDER_WORK_TRANSACTION: + if (w->type == BINDER_WORK_TRANSACTION) count++; - break; - default: - break; - } } + binder_inner_proc_unlock(proc); seq_printf(m, " pending transactions: %d\n", count); print_binder_stats(m, " ", &proc->stats); From 14c312e9262393410ed351b04a44057220e31f6a Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 8 Jun 2017 13:45:59 -0700 Subject: [PATCH 0926/3220] FROMLIST: binder: add spinlock to protect binder_node (from https://patchwork.kernel.org/patch/9817769/) node->lock is used to protect elements of node. No need to acquire for fields that are invariant: debug_id, ptr, cookie. Change-Id: Ib7738e52fa7689767f17136e18cc05ff548b5717 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 220 +++++++++++++++++++++++++++++---------- 1 file changed, 165 insertions(+), 55 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 36074ce8d1b1..2ca40f4d0bb5 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -318,6 +318,7 @@ struct binder_error { * @proc: binder_proc that owns this node * (invariant after initialized) * @refs: list of references on this node + * (protected by @lock) * @internal_strong_refs: used to take strong references when * initiating a transaction * (protected by @proc->inner_lock if @proc @@ -351,6 +352,7 @@ struct binder_error { * (protected by @proc->inner_lock if @proc * and by @lock) * @has_async_transaction: async transaction to node in progress + * (protected by @lock) * @accept_fds: file descriptor operations supported for node * (invariant after initialized) * @min_priority: minimum scheduling priority @@ -432,6 +434,7 @@ struct binder_ref_data { * @rb_node_desc: node for lookup by @data.desc in proc's rb_tree * @rb_node_node: node for lookup by @node in proc's rb_tree * @node_entry: list entry for node->refs list in target node + * (protected by @node->lock) * @proc: binder_proc containing ref * @node: binder_node of target node. When cleaning up a * ref for deletion in binder_cleanup_ref, a non-NULL @@ -707,6 +710,43 @@ _binder_node_unlock(struct binder_node *node, int line) spin_unlock(&node->lock); } +/** + * binder_node_inner_lock() - Acquire node and inner locks + * @node: struct binder_node to acquire + * + * Acquires node->lock. If node->proc is non-NULL, also acquires + proc->inner_lock.
Used to protect binder_node fields + */ +#define binder_node_inner_lock(node) _binder_node_inner_lock(node, __LINE__) +static void +_binder_node_inner_lock(struct binder_node *node, int line) +{ + binder_debug(BINDER_DEBUG_SPINLOCKS, + "%s: line=%d\n", __func__, line); + spin_lock(&node->lock); + if (node->proc) + binder_inner_proc_lock(node->proc); +} + +/** + * binder_node_inner_unlock() - Release node and inner locks + * @node: struct binder_node to acquire + * + * Release lock acquired via binder_node_inner_lock() + */ +#define binder_node_inner_unlock(node) _binder_node_inner_unlock(node, __LINE__) +static void +_binder_node_inner_unlock(struct binder_node *node, int line) +{ + struct binder_proc *proc = node->proc; + + binder_debug(BINDER_DEBUG_SPINLOCKS, + "%s: line=%d\n", __func__, line); + if (proc) + binder_inner_proc_unlock(proc); + spin_unlock(&node->lock); +} + static bool binder_worklist_empty_ilocked(struct list_head *list) { return list_empty(list); @@ -925,12 +965,14 @@ } static struct binder_node *binder_new_node(struct binder_proc *proc, - binder_uintptr_t ptr, - binder_uintptr_t cookie) + struct flat_binder_object *fp) { struct rb_node **p = &proc->nodes.rb_node; struct rb_node *parent = NULL; struct binder_node *node; + binder_uintptr_t ptr = fp ? fp->binder : 0; + binder_uintptr_t cookie = fp ? fp->cookie : 0; + __u32 flags = fp ? fp->flags : 0; while (*p) { parent = *p; @@ -956,6 +998,8 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, node->ptr = ptr; node->cookie = cookie; node->work.type = BINDER_WORK_NODE; + node->min_priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK; + node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); spin_lock_init(&node->lock); INIT_LIST_HEAD(&node->work.entry); INIT_LIST_HEAD(&node->async_todo); @@ -972,12 +1016,15 @@ static void binder_free_node(struct binder_node *node) binder_stats_deleted(BINDER_STAT_NODE); } -static int binder_inc_node_ilocked(struct binder_node *node, int strong, - int internal, - struct list_head *target_list) +static int binder_inc_node_nilocked(struct binder_node *node, int strong, + int internal, + struct list_head *target_list) { - if (node->proc) - BUG_ON(!spin_is_locked(&node->proc->inner_lock)); + struct binder_proc *proc = node->proc; + + BUG_ON(!spin_is_locked(&node->lock)); + if (proc) + BUG_ON(!spin_is_locked(&proc->inner_lock)); if (strong) { if (internal) { if (target_list == NULL && @@ -1017,20 +1064,19 @@ static int binder_inc_node(struct binder_node *node, int strong, int internal, { int ret; - if (node->proc) - binder_inner_proc_lock(node->proc); - ret = binder_inc_node_ilocked(node, strong, internal, target_list); - if (node->proc) - binder_inner_proc_unlock(node->proc); + binder_node_inner_lock(node); + ret = binder_inc_node_nilocked(node, strong, internal, target_list); + binder_node_inner_unlock(node); return ret; } -static bool binder_dec_node_ilocked(struct binder_node *node, - int strong, int internal) +static bool binder_dec_node_nilocked(struct binder_node *node, + int strong, int internal) { struct binder_proc *proc = node->proc; + BUG_ON(!spin_is_locked(&node->lock)); if (proc) BUG_ON(!spin_is_locked(&proc->inner_lock)); if (strong) { @@ -1089,12 +1135,9 @@ static void binder_dec_node(struct binder_node *node, int strong, int internal) { bool free_node; - if (node->proc) - binder_inner_proc_lock(node->proc); - free_node = binder_dec_node_ilocked(node, strong, internal); - if (node->proc) -
binder_inner_proc_unlock(node->proc); - + binder_node_inner_lock(node); + free_node = binder_dec_node_nilocked(node, strong, internal); + binder_node_inner_unlock(node); if (free_node) binder_free_node(node); } @@ -1124,6 +1167,7 @@ static void binder_inc_node_tmpref_ilocked(struct binder_node *node) */ static void binder_inc_node_tmpref(struct binder_node *node) { + binder_node_lock(node); if (node->proc) binder_inner_proc_lock(node->proc); else @@ -1133,6 +1177,7 @@ static void binder_inc_node_tmpref(struct binder_node *node) binder_inner_proc_unlock(node->proc); else spin_unlock(&binder_dead_nodes_lock); + binder_node_unlock(node); } /** @@ -1145,9 +1190,8 @@ static void binder_dec_node_tmpref(struct binder_node *node) { bool free_node; - if (node->proc) - binder_inner_proc_lock(node->proc); - else + binder_node_inner_lock(node); + if (!node->proc) spin_lock(&binder_dead_nodes_lock); node->tmp_refs--; BUG_ON(node->tmp_refs < 0); @@ -1159,9 +1203,8 @@ static void binder_dec_node_tmpref(struct binder_node *node) * causes no actual reference to be released in binder_dec_node(). * If that changes, a change is needed here too. */ - free_node = binder_dec_node_ilocked(node, 0, 1); - if (node->proc) - binder_inner_proc_unlock(node->proc); + free_node = binder_dec_node_nilocked(node, 0, 1); + binder_node_inner_unlock(node); if (free_node) binder_free_node(node); } @@ -1265,19 +1308,21 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, } rb_link_node(&new_ref->rb_node_desc, parent, p); rb_insert_color(&new_ref->rb_node_desc, &proc->refs_by_desc); + + binder_node_lock(node); hlist_add_head(&new_ref->node_entry, &node->refs); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d new ref %d desc %d for node %d\n", proc->pid, new_ref->data.debug_id, new_ref->data.desc, node->debug_id); + binder_node_unlock(node); return new_ref; } static void binder_cleanup_ref(struct binder_ref *ref) { bool delete_node = false; - struct binder_proc *node_proc = ref->node->proc; binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d delete ref %d desc %d for node %d\n", @@ -1287,15 +1332,13 @@ static void binder_cleanup_ref(struct binder_ref *ref) rb_erase(&ref->rb_node_desc, &ref->proc->refs_by_desc); rb_erase(&ref->rb_node_node, &ref->proc->refs_by_node); - if (node_proc) - binder_inner_proc_lock(node_proc); + binder_node_inner_lock(ref->node); if (ref->data.strong) - binder_dec_node_ilocked(ref->node, 1, 1); + binder_dec_node_nilocked(ref->node, 1, 1); hlist_del(&ref->node_entry); - delete_node = binder_dec_node_ilocked(ref->node, 0, 1); - if (node_proc) - binder_inner_proc_unlock(node_proc); + delete_node = binder_dec_node_nilocked(ref->node, 0, 1); + binder_node_inner_unlock(ref->node); /* * Clear ref->node unless we want the caller to free the node */ @@ -1990,12 +2033,9 @@ static int binder_translate_binder(struct flat_binder_object *fp, node = binder_get_node(proc, fp->binder); if (!node) { - node = binder_new_node(proc, fp->binder, fp->cookie); + node = binder_new_node(proc, fp); if (!node) return -ENOMEM; - - node->min_priority = fp->flags & FLAT_BINDER_FLAG_PRIORITY_MASK; - node->accept_fds = !!(fp->flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); } if (fp->cookie != node->cookie) { binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n", @@ -2056,6 +2096,7 @@ static int binder_translate_handle(struct flat_binder_object *fp, goto done; } + binder_node_lock(node); if (node->proc == target_proc) { if (fp->hdr.type == BINDER_TYPE_HANDLE) fp->hdr.type = BINDER_TYPE_BINDER; @@ 
-2063,18 +2104,24 @@ static int binder_translate_handle(struct flat_binder_object *fp, fp->hdr.type = BINDER_TYPE_WEAK_BINDER; fp->binder = node->ptr; fp->cookie = node->cookie; - binder_inc_node(node, - fp->hdr.type == BINDER_TYPE_BINDER, - 0, NULL); + if (node->proc) + binder_inner_proc_lock(node->proc); + binder_inc_node_nilocked(node, + fp->hdr.type == BINDER_TYPE_BINDER, + 0, NULL); + if (node->proc) + binder_inner_proc_unlock(node->proc); trace_binder_transaction_ref_to_node(t, node, &src_rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> node %d u%016llx\n", src_rdata.debug_id, src_rdata.desc, node->debug_id, (u64)node->ptr); + binder_node_unlock(node); } else { int ret; struct binder_ref_data dest_rdata; + binder_node_unlock(node); ret = binder_inc_ref_for_node(target_proc, node, fp->hdr.type == BINDER_TYPE_HANDLE, NULL, &dest_rdata); @@ -2382,13 +2429,16 @@ static void binder_transaction(struct binder_proc *proc, mutex_unlock(&context->context_mgr_node_lock); } e->to_node = target_node->debug_id; + binder_node_lock(target_node); target_proc = target_node->proc; if (target_proc == NULL) { + binder_node_unlock(target_node); return_error = BR_DEAD_REPLY; return_error_line = __LINE__; goto err_dead_binder; } target_proc->tmp_ref++; + binder_node_unlock(target_node); if (security_binder_transaction(proc->tsk, target_proc->tsk) < 0) { return_error = BR_FAILED_REPLY; @@ -2705,6 +2755,7 @@ static void binder_transaction(struct binder_proc *proc, } tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; binder_enqueue_work(proc, tcomplete, &thread->todo); + t->work.type = BINDER_WORK_TRANSACTION; if (reply) { if (target_thread->is_dead) @@ -2712,6 +2763,7 @@ static void binder_transaction(struct binder_proc *proc, BUG_ON(t->buffer->async_transaction != 0); binder_pop_transaction(target_thread, in_reply_to); binder_free_transaction(in_reply_to); + binder_enqueue_work(target_proc, &t->work, target_list); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); t->need_reply = 1; @@ -2722,20 +2774,29 @@ static void binder_transaction(struct binder_proc *proc, binder_pop_transaction(thread, t); goto err_dead_proc_or_thread; } + binder_enqueue_work(target_proc, &t->work, target_list); } else { BUG_ON(target_node == NULL); BUG_ON(t->buffer->async_transaction != 1); + binder_node_lock(target_node); if (target_node->has_async_transaction) { target_list = &target_node->async_todo; target_wait = NULL; } else target_node->has_async_transaction = 1; + /* + * Test/set of has_async_transaction + * must be atomic with enqueue on + * async_todo + */ if (target_proc->is_dead || - (target_thread && target_thread->is_dead)) + (target_thread && target_thread->is_dead)) { + binder_node_unlock(target_node); goto err_dead_proc_or_thread; + } + binder_enqueue_work(target_proc, &t->work, target_list); + binder_node_unlock(target_node); } - t->work.type = BINDER_WORK_TRANSACTION; - binder_enqueue_work(target_proc, &t->work, target_list); if (target_wait) { if (reply || !(tr->flags & TF_ONE_WAY)) wake_up_interruptible_sync(target_wait); @@ -2914,6 +2975,7 @@ static int binder_thread_write(struct binder_proc *proc, binder_uintptr_t node_ptr; binder_uintptr_t cookie; struct binder_node *node; + bool free_node; if (get_user(node_ptr, (binder_uintptr_t __user *)ptr)) return -EFAULT; @@ -2941,13 +3003,13 @@ static int binder_thread_write(struct binder_proc *proc, binder_put_node(node); break; } - binder_inner_proc_lock(proc); + binder_node_inner_lock(node); if (cmd == BC_ACQUIRE_DONE) 
{ if (node->pending_strong_ref == 0) { binder_user_error("%d:%d BC_ACQUIRE_DONE node %d has no pending acquire request\n", proc->pid, thread->pid, node->debug_id); - binder_inner_proc_unlock(proc); + binder_node_inner_unlock(node); binder_put_node(node); break; } @@ -2957,20 +3019,22 @@ static int binder_thread_write(struct binder_proc *proc, binder_user_error("%d:%d BC_INCREFS_DONE node %d has no pending increfs request\n", proc->pid, thread->pid, node->debug_id); - binder_inner_proc_unlock(proc); + binder_node_inner_unlock(node); binder_put_node(node); break; } node->pending_weak_ref = 0; } - binder_inner_proc_unlock(proc); - binder_dec_node(node, cmd == BC_ACQUIRE_DONE, 0); + free_node = binder_dec_node_nilocked(node, + cmd == BC_ACQUIRE_DONE, 0); + WARN_ON(free_node); binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s node %d ls %d lw %d tr %d\n", proc->pid, thread->pid, cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", node->debug_id, node->local_strong_refs, node->local_weak_refs, node->tmp_refs); + binder_node_inner_unlock(node); binder_put_node(node); break; } @@ -3016,9 +3080,9 @@ static int binder_thread_write(struct binder_proc *proc, struct binder_work *w; buf_node = buffer->target_node; + binder_node_inner_lock(buf_node); BUG_ON(!buf_node->has_async_transaction); BUG_ON(buf_node->proc != proc); - binder_inner_proc_lock(proc); w = binder_dequeue_work_head_ilocked( &buf_node->async_todo); if (!w) @@ -3026,7 +3090,7 @@ static int binder_thread_write(struct binder_proc *proc, else binder_enqueue_work_ilocked( w, &thread->todo); - binder_inner_proc_unlock(proc); + binder_node_inner_unlock(buf_node); } trace_binder_transaction_buffer_release(buffer); binder_transaction_buffer_release(proc, buffer, NULL); @@ -3151,6 +3215,7 @@ static int binder_thread_write(struct binder_proc *proc, INIT_LIST_HEAD(&death->work.entry); death->cookie = cookie; ref->death = death; + binder_node_lock(ref->node); if (ref->node->proc == NULL) { ref->death->work.type = BINDER_WORK_DEAD_BINDER; if (thread->looper & @@ -3169,10 +3234,13 @@ static int binder_thread_write(struct binder_proc *proc, &proc->wait); } } + binder_node_unlock(ref->node); } else { + binder_node_lock(ref->node); if (ref->death == NULL) { binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification not active\n", proc->pid, thread->pid); + binder_node_unlock(ref->node); break; } death = ref->death; @@ -3181,6 +3249,7 @@ static int binder_thread_write(struct binder_proc *proc, proc->pid, thread->pid, (u64)death->cookie, (u64)cookie); + binder_node_unlock(ref->node); break; } ref->death = NULL; @@ -3205,6 +3274,7 @@ static int binder_thread_write(struct binder_proc *proc, death->work.type = BINDER_WORK_DEAD_BINDER_AND_CLEAR; } binder_inner_proc_unlock(proc); + binder_node_unlock(ref->node); } } break; case BC_DEAD_BINDER_DONE: { @@ -3487,6 +3557,17 @@ retry: (u64)node_cookie); rb_erase(&node->rb_node, &proc->nodes); binder_inner_proc_unlock(proc); + binder_node_lock(node); + /* + * Acquire the node lock before freeing the + * node to serialize with other threads that + * may have been holding the node lock while + * decrementing this node (avoids race where + * this thread frees while the other thread + * is unlocking the node after the final + * decrement) + */ + binder_node_unlock(node); binder_free_node(node); } else binder_inner_proc_unlock(proc); @@ -3974,16 +4055,18 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) } else { context->binder_context_mgr_uid = curr_euid; } - new_node = binder_new_node(proc, 
0, 0); + new_node = binder_new_node(proc, NULL); if (!new_node) { ret = -ENOMEM; goto out; } + binder_node_lock(new_node); new_node->local_weak_refs++; new_node->local_strong_refs++; new_node->has_strong_ref = 1; new_node->has_weak_ref = 1; context->binder_context_mgr_node = new_node; + binder_node_unlock(new_node); binder_put_node(new_node); out: mutex_unlock(&context->context_mgr_node_lock); @@ -4246,6 +4329,7 @@ static int binder_node_release(struct binder_node *node, int refs) binder_release_work(proc, &node->async_todo); + binder_node_lock(node); binder_inner_proc_lock(proc); binder_dequeue_work_ilocked(&node->work); /* @@ -4254,6 +4338,7 @@ static int binder_node_release(struct binder_node *node, int refs) BUG_ON(!node->tmp_refs); if (hlist_empty(&node->refs) && node->tmp_refs == 1) { binder_inner_proc_unlock(proc); + binder_node_unlock(node); binder_free_node(node); return refs; @@ -4290,6 +4375,7 @@ static int binder_node_release(struct binder_node *node, int refs) binder_debug(BINDER_DEBUG_DEAD_BINDER, "node %d now dead, refs %d, death %d\n", node->debug_id, refs, death); + binder_node_unlock(node); binder_put_node(node); return refs; @@ -4533,12 +4619,15 @@ static void print_binder_thread_ilocked(struct seq_file *m, m->count = start_pos; } -static void print_binder_node(struct seq_file *m, struct binder_node *node) +static void print_binder_node_nlocked(struct seq_file *m, + struct binder_node *node) { struct binder_ref *ref; struct binder_work *w; int count; + WARN_ON(!spin_is_locked(&node->lock)); + count = 0; hlist_for_each_entry(ref, &node->refs, node_entry) count++; @@ -4565,11 +4654,13 @@ static void print_binder_node(struct seq_file *m, struct binder_node *node) static void print_binder_ref(struct seq_file *m, struct binder_ref *ref) { + binder_node_lock(ref->node); seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %pK\n", ref->data.debug_id, ref->data.desc, ref->node->proc ? "" : "dead ", ref->node->debug_id, ref->data.strong, ref->data.weak, ref->death); + binder_node_unlock(ref->node); } static void print_binder_proc(struct seq_file *m, @@ -4592,8 +4683,10 @@ static void print_binder_proc(struct seq_file *m, for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) { struct binder_node *node = rb_entry(n, struct binder_node, rb_node); + binder_node_lock(node); if (print_all || node->has_async_transaction) - print_binder_node(m, node); + print_binder_node_nlocked(m, node); + binder_node_unlock(node); } if (print_all) { for (n = rb_first(&proc->refs_by_desc); @@ -4765,6 +4858,7 @@ static int binder_state_show(struct seq_file *m, void *unused) { struct binder_proc *proc; struct binder_node *node; + struct binder_node *last_node = NULL; binder_lock(__func__); @@ -4773,9 +4867,25 @@ static int binder_state_show(struct seq_file *m, void *unused) spin_lock(&binder_dead_nodes_lock); if (!hlist_empty(&binder_dead_nodes)) seq_puts(m, "dead nodes:\n"); - hlist_for_each_entry(node, &binder_dead_nodes, dead_node) - print_binder_node(m, node); + hlist_for_each_entry(node, &binder_dead_nodes, dead_node) { + /* + * take a temporary reference on the node so it + * survives and isn't removed from the list + * while we print it. 
+ */ + node->tmp_refs++; + spin_unlock(&binder_dead_nodes_lock); + if (last_node) + binder_put_node(last_node); + binder_node_lock(node); + print_binder_node_nlocked(m, node); + binder_node_unlock(node); + last_node = node; + spin_lock(&binder_dead_nodes_lock); + } spin_unlock(&binder_dead_nodes_lock); + if (last_node) + binder_put_node(last_node); mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) From 46655970b9673b2b60eb67bd902743455dfdebe3 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 12 Jun 2017 12:07:26 -0700 Subject: [PATCH 0927/3220] FROMLIST: binder: protect proc->nodes with inner lock (from https://patchwork.kernel.org/patch/9817783/) When locks for binder_ref handling are added, proc->nodes will need to be modified while holding the outer lock Change-Id: I17b39e981c55130c14a62fe49900eceff6e3642b Signed-off-by: Todd Kjos --- drivers/android/binder.c | 112 +++++++++++++++++++++++++++++++-------- 1 file changed, 89 insertions(+), 23 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 2ca40f4d0bb5..a0c0bd05a885 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -313,6 +313,7 @@ struct binder_error { * @work: worklist element for node work * (protected by @proc->inner_lock) * @rb_node: element for proc->nodes tree + * (protected by @proc->inner_lock) * @dead_node: element for binder_dead_nodes list * (protected by binder_dead_nodes_lock) * @proc: binder_proc that owns this node @@ -470,6 +471,7 @@ enum binder_deferred_state { * @threads: rbtree of binder_threads in this proc * @nodes: rbtree of binder nodes associated with * this proc ordered by node->ptr + * (protected by @inner_lock) * @refs_by_desc: rbtree of refs ordered by ref->desc * @refs_by_node: rbtree of refs ordered by ref->node * @pid PID of group_leader of process @@ -856,7 +858,7 @@ static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); static void binder_free_thread(struct binder_thread *thread); static void binder_free_proc(struct binder_proc *proc); -static void binder_inc_node_tmpref(struct binder_node *node); +static void binder_inc_node_tmpref_ilocked(struct binder_node *node); static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) { @@ -938,12 +940,14 @@ static void binder_set_nice(long nice) binder_user_error("%d RLIMIT_NICE not set\n", current->pid); } -static struct binder_node *binder_get_node(struct binder_proc *proc, - binder_uintptr_t ptr) +static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc, + binder_uintptr_t ptr) { struct rb_node *n = proc->nodes.rb_node; struct binder_node *node; + BUG_ON(!spin_is_locked(&proc->inner_lock)); + while (n) { node = rb_entry(n, struct binder_node, rb_node); @@ -957,15 +961,28 @@ static struct binder_node *binder_get_node(struct binder_proc *proc, * to ensure node stays alive until * call to binder_put_node() */ - binder_inc_node_tmpref(node); + binder_inc_node_tmpref_ilocked(node); return node; } } return NULL; } -static struct binder_node *binder_new_node(struct binder_proc *proc, - struct flat_binder_object *fp) +static struct binder_node *binder_get_node(struct binder_proc *proc, + binder_uintptr_t ptr) +{ + struct binder_node *node; + + binder_inner_proc_lock(proc); + node = binder_get_node_ilocked(proc, ptr); + binder_inner_proc_unlock(proc); + return node; +} + +static struct binder_node *binder_init_node_ilocked( + struct binder_proc *proc, + struct binder_node *new_node, + struct 
flat_binder_object *fp) { struct rb_node **p = &proc->nodes.rb_node; struct rb_node *parent = NULL; @@ -974,7 +991,9 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, binder_uintptr_t cookie = fp ? fp->cookie : 0; __u32 flags = fp ? fp->flags : 0; + BUG_ON(!spin_is_locked(&proc->inner_lock)); while (*p) { + parent = *p; node = rb_entry(parent, struct binder_node, rb_node); @@ -982,13 +1001,17 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, p = &(*p)->rb_left; else if (ptr > node->ptr) p = &(*p)->rb_right; - else - return NULL; + else { + /* + * A matching node is already in + * the rb tree. Abandon the init + * and return it. + */ + binder_inc_node_tmpref_ilocked(node); + return node; + } } - - node = kzalloc(sizeof(*node), GFP_KERNEL); - if (node == NULL) - return NULL; + node = new_node; binder_stats_created(BINDER_STAT_NODE); node->tmp_refs++; rb_link_node(&node->rb_node, parent, p); @@ -1007,6 +1030,27 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, "%d:%d node %d u%016llx c%016llx created\n", proc->pid, current->pid, node->debug_id, (u64)node->ptr, (u64)node->cookie); + + return node; +} + +static struct binder_node *binder_new_node(struct binder_proc *proc, + struct flat_binder_object *fp) +{ + struct binder_node *node; + struct binder_node *new_node = kzalloc(sizeof(*node), GFP_KERNEL); + + if (!new_node) + return NULL; + binder_inner_proc_lock(proc); + node = binder_init_node_ilocked(proc, new_node, fp); + binder_inner_proc_unlock(proc); + if (node != new_node) + /* + * The node was already added by another thread + */ + kfree(new_node); + return node; } @@ -4421,6 +4465,7 @@ static void binder_deferred_release(struct binder_proc *proc) nodes = 0; incoming_refs = 0; + binder_inner_proc_lock(proc); while ((n = rb_first(&proc->nodes))) { struct binder_node *node; @@ -4431,10 +4476,13 @@ static void binder_deferred_release(struct binder_proc *proc) * calling binder_node_release() which will either * kfree() the node or call binder_put_node() */ - binder_inc_node_tmpref(node); + binder_inc_node_tmpref_ilocked(node); rb_erase(&node->rb_node, &proc->nodes); + binder_inner_proc_unlock(proc); incoming_refs = binder_node_release(node, incoming_refs); + binder_inner_proc_lock(proc); } + binder_inner_proc_unlock(proc); outgoing_refs = 0; while ((n = rb_first(&proc->refs_by_desc))) { @@ -4619,14 +4667,16 @@ static void print_binder_thread_ilocked(struct seq_file *m, m->count = start_pos; } -static void print_binder_node_nlocked(struct seq_file *m, - struct binder_node *node) +static void print_binder_node_nilocked(struct seq_file *m, + struct binder_node *node) { struct binder_ref *ref; struct binder_work *w; int count; WARN_ON(!spin_is_locked(&node->lock)); + if (node->proc) + WARN_ON(!spin_is_locked(&node->proc->inner_lock)); count = 0; hlist_for_each_entry(ref, &node->refs, node_entry) @@ -4644,11 +4694,9 @@ static void print_binder_node_nlocked(struct seq_file *m, } seq_puts(m, "\n"); if (node->proc) { - binder_inner_proc_lock(node->proc); list_for_each_entry(w, &node->async_todo, entry) print_binder_work_ilocked(m, " ", " pending async transaction", w); - binder_inner_proc_unlock(node->proc); } } @@ -4670,6 +4718,7 @@ static void print_binder_proc(struct seq_file *m, struct rb_node *n; size_t start_pos = m->count; size_t header_pos; + struct binder_node *last_node = NULL; seq_printf(m, "proc %d\n", proc->pid); seq_printf(m, "context %s\n", proc->context->name); @@ -4679,15 +4728,30 @@ static void 
print_binder_proc(struct seq_file *m, for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) print_binder_thread_ilocked(m, rb_entry(n, struct binder_thread, rb_node), print_all); - binder_inner_proc_unlock(proc); + for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) { struct binder_node *node = rb_entry(n, struct binder_node, rb_node); - binder_node_lock(node); - if (print_all || node->has_async_transaction) - print_binder_node_nlocked(m, node); - binder_node_unlock(node); + /* + * take a temporary reference on the node so it + * survives and isn't removed from the tree + * while we print it. + */ + binder_inc_node_tmpref_ilocked(node); + /* Need to drop inner lock to take node lock */ + binder_inner_proc_unlock(proc); + if (last_node) + binder_put_node(last_node); + binder_node_inner_lock(node); + print_binder_node_nilocked(m, node); + binder_node_inner_unlock(node); + last_node = node; + binder_inner_proc_lock(proc); } + binder_inner_proc_unlock(proc); + if (last_node) + binder_put_node(last_node); + if (print_all) { for (n = rb_first(&proc->refs_by_desc); n != NULL; @@ -4823,8 +4887,10 @@ static void print_binder_proc_stats(struct seq_file *m, proc->ready_threads, binder_alloc_get_free_async_space(&proc->alloc)); count = 0; + binder_inner_proc_lock(proc); for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) count++; + binder_inner_proc_unlock(proc); seq_printf(m, " nodes: %d\n", count); count = 0; strong = 0; weak = 0; @@ -4878,7 +4944,7 @@ static int binder_state_show(struct seq_file *m, void *unused) if (last_node) binder_put_node(last_node); binder_node_lock(node); - print_binder_node_nlocked(m, node); + print_binder_node_nilocked(m, node); binder_node_unlock(node); last_node = node; spin_lock(&binder_dead_nodes_lock); From e495123304a5949848881496ee23990791a57db3 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 25 May 2017 15:52:17 -0700 Subject: [PATCH 0928/3220] FROMLIST: binder: protect proc->threads with inner_lock (from https://patchwork.kernel.org/patch/9817775/) proc->threads will need to be accessed while locks of other processes are held, so use proc->inner_lock to protect it. proc->tmp_ref now needs to be protected by proc->inner_lock.
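As a rough illustration of the pattern this change adopts (a minimal userspace sketch with simplified, hypothetical types, not the driver code), the temporary-reference release must perform the decrement, the is_dead test, and the free decision under the same lock:

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct proc_obj {
	pthread_spinlock_t inner_lock;	/* stands in for proc->inner_lock */
	int tmp_ref;			/* stands in for proc->tmp_ref */
	bool is_dead;			/* set when teardown begins */
};

static void proc_dec_tmpref(struct proc_obj *p)
{
	pthread_spin_lock(&p->inner_lock);
	p->tmp_ref--;
	if (p->is_dead && p->tmp_ref == 0) {
		/* last reference to a dead proc: drop the lock, then free */
		pthread_spin_unlock(&p->inner_lock);
		free(p);
		return;
	}
	pthread_spin_unlock(&p->inner_lock);
}

Without the lock held across both the decrement and the is_dead check, two racing threads could each observe a stale count and either leak the structure or free it while still in use.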
Change-Id: I176cfeca16bf7c9b34b428c16405f93db81d2ff8 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 87 +++++++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 24 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index a0c0bd05a885..8735f9a6c018 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -469,6 +469,7 @@ enum binder_deferred_state { * struct binder_proc - binder process bookkeeping * @proc_node: element for binder_procs list * @threads: rbtree of binder_threads in this proc + * (protected by @inner_lock) * @nodes: rbtree of binder nodes associated with * this proc ordered by node->ptr * (protected by @inner_lock) @@ -486,6 +487,7 @@ enum binder_deferred_state { * (protected by binder_deferred_lock) * @is_dead: process is dead and awaiting free * when outstanding transactions are cleaned up + * (protected by @inner_lock) * @todo: list of work for this process * (protected by @inner_lock) * @wait: wait queue head to wait for proc work @@ -501,6 +503,7 @@ enum binder_deferred_state { * @requested_threads_started: number binder threads started * @ready_threads: number of threads waiting for proc work * @tmp_ref: temporary reference to indicate proc is in use + * (protected by @inner_lock) * @default_priority: default scheduler priority * (invariant after initialized) * @debugfs_entry: debugfs node @@ -556,6 +559,7 @@ enum { * @proc: binder process for this thread * (invariant after initialization) * @rb_node: element for proc->threads rbtree + * (protected by @proc->inner_lock) * @pid: PID for this thread * (invariant after initialization) * @looper: bitmap of looping state @@ -576,6 +580,7 @@ enum { * always be acquired) * @is_dead: thread is dead and awaiting free * when outstanding transactions are cleaned up + * (protected by @proc->inner_lock) * * Bookkeeping structure for binder threads. 
*/ @@ -1668,15 +1673,15 @@ static void binder_thread_dec_tmpref(struct binder_thread *thread) /* * atomic is used to protect the counter value while * it cannot reach zero or thread->is_dead is false - * - * TODO: future patch adds locking to ensure that the - * check of tmp_ref and is_dead is done with a lock held */ + binder_inner_proc_lock(thread->proc); atomic_dec(&thread->tmp_ref); if (thread->is_dead && !atomic_read(&thread->tmp_ref)) { + binder_inner_proc_unlock(thread->proc); binder_free_thread(thread); return; } + binder_inner_proc_unlock(thread->proc); } /** @@ -1693,12 +1698,15 @@ static void binder_thread_dec_tmpref(struct binder_thread *thread) */ static void binder_proc_dec_tmpref(struct binder_proc *proc) { + binder_inner_proc_lock(proc); proc->tmp_ref--; if (proc->is_dead && RB_EMPTY_ROOT(&proc->threads) && !proc->tmp_ref) { + binder_inner_proc_unlock(proc); binder_free_proc(proc); return; } + binder_inner_proc_unlock(proc); } /** @@ -2481,7 +2489,9 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_dead_binder; } + binder_inner_proc_lock(target_proc); target_proc->tmp_ref++; + binder_inner_proc_unlock(target_proc); binder_node_unlock(target_node); if (security_binder_transaction(proc->tsk, target_proc->tsk) < 0) { @@ -3855,7 +3865,8 @@ static void binder_release_work(struct binder_proc *proc, } -static struct binder_thread *binder_get_thread(struct binder_proc *proc) +static struct binder_thread *binder_get_thread_ilocked( + struct binder_proc *proc, struct binder_thread *new_thread) { struct binder_thread *thread = NULL; struct rb_node *parent = NULL; @@ -3870,25 +3881,45 @@ static struct binder_thread *binder_get_thread(struct binder_proc *proc) else if (current->pid > thread->pid) p = &(*p)->rb_right; else - break; + return thread; } - if (*p == NULL) { - thread = kzalloc(sizeof(*thread), GFP_KERNEL); - if (thread == NULL) + if (!new_thread) + return NULL; + thread = new_thread; + binder_stats_created(BINDER_STAT_THREAD); + thread->proc = proc; + thread->pid = current->pid; + atomic_set(&thread->tmp_ref, 0); + init_waitqueue_head(&thread->wait); + INIT_LIST_HEAD(&thread->todo); + rb_link_node(&thread->rb_node, parent, p); + rb_insert_color(&thread->rb_node, &proc->threads); + thread->looper_need_return = true; + thread->return_error.work.type = BINDER_WORK_RETURN_ERROR; + thread->return_error.cmd = BR_OK; + thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR; + thread->reply_error.cmd = BR_OK; + + return thread; +} + +static struct binder_thread *binder_get_thread(struct binder_proc *proc) +{ + struct binder_thread *thread; + struct binder_thread *new_thread; + + binder_inner_proc_lock(proc); + thread = binder_get_thread_ilocked(proc, NULL); + binder_inner_proc_unlock(proc); + if (!thread) { + new_thread = kzalloc(sizeof(*thread), GFP_KERNEL); + if (new_thread == NULL) return NULL; - binder_stats_created(BINDER_STAT_THREAD); - thread->proc = proc; - thread->pid = current->pid; - atomic_set(&thread->tmp_ref, 0); - init_waitqueue_head(&thread->wait); - INIT_LIST_HEAD(&thread->todo); - rb_link_node(&thread->rb_node, parent, p); - rb_insert_color(&thread->rb_node, &proc->threads); - thread->looper_need_return = true; - thread->return_error.work.type = BINDER_WORK_RETURN_ERROR; - thread->return_error.cmd = BR_OK; - thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR; - thread->reply_error.cmd = BR_OK; + binder_inner_proc_lock(proc); + thread = binder_get_thread_ilocked(proc, new_thread); + 
binder_inner_proc_unlock(proc); + if (thread != new_thread) + kfree(new_thread); } return thread; } @@ -3919,6 +3950,7 @@ static int binder_thread_release(struct binder_proc *proc, int active_transactions = 0; struct binder_transaction *last_t = NULL; + binder_inner_proc_lock(thread->proc); /* * take a ref on the proc so it survives * after we remove this thread from proc->threads. @@ -3966,6 +3998,7 @@ static int binder_thread_release(struct binder_proc *proc, if (t) spin_lock(&t->lock); } + binder_inner_proc_unlock(thread->proc); if (send_reply) binder_send_failed_reply(send_reply, BR_DEAD_REPLY); @@ -4339,6 +4372,7 @@ static void binder_deferred_flush(struct binder_proc *proc) struct rb_node *n; int wake_count = 0; + binder_inner_proc_lock(proc); for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) { struct binder_thread *thread = rb_entry(n, struct binder_thread, rb_node); @@ -4348,6 +4382,7 @@ static void binder_deferred_flush(struct binder_proc *proc) wake_count++; } } + binder_inner_proc_unlock(proc); wake_up_interruptible_all(&proc->wait); binder_debug(BINDER_DEBUG_OPEN_CLOSE, @@ -4446,6 +4481,7 @@ static void binder_deferred_release(struct binder_proc *proc) context->binder_context_mgr_node = NULL; } mutex_unlock(&context->context_mgr_node_lock); + binder_inner_proc_lock(proc); /* * Make sure proc stays alive after we * remove all the threads @@ -4459,13 +4495,14 @@ static void binder_deferred_release(struct binder_proc *proc) struct binder_thread *thread; thread = rb_entry(n, struct binder_thread, rb_node); + binder_inner_proc_unlock(proc); threads++; active_transactions += binder_thread_release(proc, thread); + binder_inner_proc_lock(proc); } nodes = 0; incoming_refs = 0; - binder_inner_proc_lock(proc); while ((n = rb_first(&proc->nodes))) { struct binder_node *node; @@ -4873,10 +4910,13 @@ static void print_binder_proc_stats(struct seq_file *m, struct binder_work *w; struct rb_node *n; int count, strong, weak; + size_t free_async_space = + binder_alloc_get_free_async_space(&proc->alloc); seq_printf(m, "proc %d\n", proc->pid); seq_printf(m, "context %s\n", proc->context->name); count = 0; + binder_inner_proc_lock(proc); for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) count++; seq_printf(m, " threads: %d\n", count); @@ -4885,9 +4925,8 @@ static void print_binder_proc_stats(struct seq_file *m, " free async space %zd\n", proc->requested_threads, proc->requested_threads_started, proc->max_threads, proc->ready_threads, - binder_alloc_get_free_async_space(&proc->alloc)); + free_async_space); count = 0; - binder_inner_proc_lock(proc); for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) count++; binder_inner_proc_unlock(proc); From 89b657e0d7de20459820d7edf341bc888b69aa83 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 2 Jun 2017 13:36:52 -0700 Subject: [PATCH 0929/3220] FROMLIST: binder: protect transaction_stack with inner lock. (from https://patchwork.kernel.org/patch/9817779/) This makes future changes to priority inheritance easier, since we want to be able to look at a thread's transaction stack when selecting a thread to inherit priority for. It also allows us to take just a single lock in a few paths, where we used to take two in succession. 
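A sketch of the lookup pattern this enables (illustrative userspace code with hypothetical names; the driver uses atomics and its own helpers): a t->from pointer sampled under only t->lock is a hint that must be re-validated once the owner's inner lock is held, after pinning the thread with a temporary reference.

#include <pthread.h>
#include <stddef.h>

struct thread_obj {
	pthread_spinlock_t inner_lock;	/* stands in for proc->inner_lock */
	int tmp_ref;			/* keeps the struct alive (atomic in the driver) */
};

struct txn_obj {
	pthread_spinlock_t lock;	/* stands in for t->lock */
	struct thread_obj *from;	/* cleared when the sender is released */
};

static struct thread_obj *txn_get_from_and_lock(struct txn_obj *t)
{
	struct thread_obj *from;

	/* pin the thread under t->lock so it cannot be freed under us */
	pthread_spin_lock(&t->lock);
	from = t->from;
	if (from)
		from->tmp_ref++;
	pthread_spin_unlock(&t->lock);
	if (!from)
		return NULL;

	/* re-check t->from with the inner lock held; a concurrent
	 * release may have cleared it after we sampled it */
	pthread_spin_lock(&from->inner_lock);
	if (t->from == from)
		return from;	/* returned locked; caller unlocks and unpins */
	pthread_spin_unlock(&from->inner_lock);
	from->tmp_ref--;	/* lost the race with teardown */
	return NULL;
}

The same shape appears in binder_get_txn_from_and_acq_inner() below: reference first, then lock, then re-validate.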
Change-Id: Idb1b6e9faa5c669978b2b3011fe326be8aece586 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 96 +++++++++++++++++++++++++++++++++------- 1 file changed, 79 insertions(+), 17 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 8735f9a6c018..328ec2cd7956 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -30,7 +30,8 @@ * 3) proc->inner_lock : protects the thread and node lists * (proc->threads, proc->nodes) and all todo lists associated * with the binder_proc (proc->todo, thread->todo, - * proc->delivered_death and node->async_todo). + * proc->delivered_death and node->async_todo), as well as + * thread->transaction_stack * binder_inner_proc_lock() and binder_inner_proc_unlock() * are used to acq/rel * @@ -567,11 +568,13 @@ enum { * @looper_need_return: looping thread needs to exit driver * (no lock needed) * @transaction_stack: stack of in-progress transactions for this thread + * (protected by @proc->inner_lock) * @todo: list of work to do for this thread + * (protected by @proc->inner_lock) * @return_error: transaction errors reported by this thread * (only accessed by this thread) * @reply_error: transaction errors reported by target thread + * (protected by @proc->inner_lock) * @wait: wait queue for thread work * @stats: per-thread statistics * (atomics, no lock needed) @@ -1645,10 +1648,11 @@ static int binder_inc_ref_for_node(struct binder_proc *proc, return ret; } -static void binder_pop_transaction(struct binder_thread *target_thread, - struct binder_transaction *t) +static void binder_pop_transaction_ilocked(struct binder_thread *target_thread, + struct binder_transaction *t) { BUG_ON(!target_thread); + BUG_ON(!spin_is_locked(&target_thread->proc->inner_lock)); BUG_ON(target_thread->transaction_stack != t); BUG_ON(target_thread->transaction_stack->from != target_thread); target_thread->transaction_stack = @@ -1732,6 +1736,35 @@ static struct binder_thread *binder_get_txn_from( return from; } +/** + * binder_get_txn_from_and_acq_inner() - get t->from and acquire inner lock + * @t: binder transaction for t->from + * + * Same as binder_get_txn_from() except it also acquires the proc->inner_lock + * to guarantee that the thread cannot be released while operating on it. + * The caller must call binder_inner_proc_unlock() to release the inner lock + * as well as call binder_thread_dec_tmpref() to release the reference.
+ * + * Return: the value of t->from + */ +static struct binder_thread *binder_get_txn_from_and_acq_inner( + struct binder_transaction *t) +{ + struct binder_thread *from; + + from = binder_get_txn_from(t); + if (!from) + return NULL; + binder_inner_proc_lock(from->proc); + if (t->from) { + BUG_ON(from != t->from); + return from; + } + binder_inner_proc_unlock(from->proc); + binder_thread_dec_tmpref(from); + return NULL; +} + static void binder_free_transaction(struct binder_transaction *t) { if (t->buffer) @@ -1748,7 +1781,7 @@ static void binder_send_failed_reply(struct binder_transaction *t, BUG_ON(t->flags & TF_ONE_WAY); while (1) { - target_thread = binder_get_txn_from(t); + target_thread = binder_get_txn_from_and_acq_inner(t); if (target_thread) { binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "send failed reply for transaction %d to %d:%d\n", @@ -1756,11 +1789,10 @@ static void binder_send_failed_reply(struct binder_transaction *t, target_thread->proc->pid, target_thread->pid); - binder_pop_transaction(target_thread, t); + binder_pop_transaction_ilocked(target_thread, t); if (target_thread->reply_error.cmd == BR_OK) { target_thread->reply_error.cmd = error_code; - binder_enqueue_work( - target_thread->proc, + binder_enqueue_work_ilocked( &target_thread->reply_error.work, &target_thread->todo); wake_up_interruptible(&target_thread->wait); @@ -1768,6 +1800,7 @@ static void binder_send_failed_reply(struct binder_transaction *t, WARN(1, "Unexpected reply error: %u\n", target_thread->reply_error.cmd); } + binder_inner_proc_unlock(target_thread->proc); binder_thread_dec_tmpref(target_thread); binder_free_transaction(t); return; @@ -2397,8 +2430,10 @@ static void binder_transaction(struct binder_proc *proc, e->context_name = proc->context->name; if (reply) { + binder_inner_proc_lock(proc); in_reply_to = thread->transaction_stack; if (in_reply_to == NULL) { + binder_inner_proc_unlock(proc); binder_user_error("%d:%d got reply transaction with no transaction stack\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; @@ -2406,7 +2441,6 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_empty_call_stack; } - binder_set_nice(in_reply_to->saved_priority); if (in_reply_to->to_thread != thread) { spin_lock(&in_reply_to->lock); binder_user_error("%d:%d got reply transaction with bad transaction stack, transaction %d has target %d:%d\n", @@ -2416,6 +2450,7 @@ static void binder_transaction(struct binder_proc *proc, in_reply_to->to_thread ? in_reply_to->to_thread->pid : 0); spin_unlock(&in_reply_to->lock); + binder_inner_proc_unlock(proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; @@ -2423,7 +2458,9 @@ static void binder_transaction(struct binder_proc *proc, goto err_bad_call_stack; } thread->transaction_stack = in_reply_to->to_parent; - target_thread = binder_get_txn_from(in_reply_to); + binder_inner_proc_unlock(proc); + binder_set_nice(in_reply_to->saved_priority); + target_thread = binder_get_txn_from_and_acq_inner(in_reply_to); if (target_thread == NULL) { return_error = BR_DEAD_REPLY; return_error_line = __LINE__; @@ -2435,6 +2472,7 @@ static void binder_transaction(struct binder_proc *proc, target_thread->transaction_stack ? 
target_thread->transaction_stack->debug_id : 0, in_reply_to->debug_id); + binder_inner_proc_unlock(target_thread->proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; @@ -2444,6 +2482,7 @@ static void binder_transaction(struct binder_proc *proc, } target_proc = target_thread->proc; target_proc->tmp_ref++; + binder_inner_proc_unlock(target_thread->proc); } else { if (tr->target.handle) { struct binder_ref *ref; @@ -2500,6 +2539,7 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_invalid_target_handle; } + binder_inner_proc_lock(proc); if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) { struct binder_transaction *tmp; @@ -2512,6 +2552,7 @@ static void binder_transaction(struct binder_proc *proc, tmp->to_thread ? tmp->to_thread->pid : 0); spin_unlock(&tmp->lock); + binder_inner_proc_unlock(proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; @@ -2532,6 +2573,7 @@ static void binder_transaction(struct binder_proc *proc, tmp = tmp->from_parent; } } + binder_inner_proc_unlock(proc); } if (target_thread) { e->to_thread = target_thread->pid; @@ -2812,23 +2854,34 @@ static void binder_transaction(struct binder_proc *proc, t->work.type = BINDER_WORK_TRANSACTION; if (reply) { - if (target_thread->is_dead) + binder_inner_proc_lock(target_proc); + if (target_thread->is_dead) { + binder_inner_proc_unlock(target_proc); goto err_dead_proc_or_thread; + } BUG_ON(t->buffer->async_transaction != 0); - binder_pop_transaction(target_thread, in_reply_to); + binder_pop_transaction_ilocked(target_thread, in_reply_to); + binder_enqueue_work_ilocked(&t->work, target_list); + binder_inner_proc_unlock(target_proc); binder_free_transaction(in_reply_to); - binder_enqueue_work(target_proc, &t->work, target_list); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); + binder_inner_proc_lock(proc); t->need_reply = 1; t->from_parent = thread->transaction_stack; thread->transaction_stack = t; + binder_inner_proc_unlock(proc); + binder_inner_proc_lock(target_proc); if (target_proc->is_dead || (target_thread && target_thread->is_dead)) { - binder_pop_transaction(thread, t); + binder_inner_proc_unlock(target_proc); + binder_inner_proc_lock(proc); + binder_pop_transaction_ilocked(thread, t); + binder_inner_proc_unlock(proc); goto err_dead_proc_or_thread; } - binder_enqueue_work(target_proc, &t->work, target_list); + binder_enqueue_work_ilocked(&t->work, target_list); + binder_inner_proc_unlock(target_proc); } else { BUG_ON(target_node == NULL); BUG_ON(t->buffer->async_transaction != 1); @@ -2843,12 +2896,15 @@ static void binder_transaction(struct binder_proc *proc, * must be atomic with enqueue on * async_todo */ + binder_inner_proc_lock(target_proc); if (target_proc->is_dead || (target_thread && target_thread->is_dead)) { + binder_inner_proc_unlock(target_proc); binder_node_unlock(target_node); goto err_dead_proc_or_thread; } - binder_enqueue_work(target_proc, &t->work, target_list); + binder_enqueue_work_ilocked(&t->work, target_list); + binder_inner_proc_unlock(target_proc); binder_node_unlock(target_node); } if (target_wait) { @@ -3465,8 +3521,10 @@ static int binder_thread_read(struct binder_proc *proc, } retry: + binder_inner_proc_lock(proc); wait_for_proc_work = thread->transaction_stack == NULL && - binder_worklist_empty(proc, &thread->todo); + binder_worklist_empty_ilocked(&thread->todo); + binder_inner_proc_unlock(proc); thread->looper |= 
BINDER_LOOPER_STATE_WAITING; if (wait_for_proc_work) @@ -3778,9 +3836,11 @@ retry: binder_thread_dec_tmpref(t_from); t->buffer->allow_user_free = 1; if (cmd == BR_TRANSACTION && !(t->flags & TF_ONE_WAY)) { + binder_inner_proc_lock(thread->proc); t->to_parent = thread->transaction_stack; t->to_thread = thread; thread->transaction_stack = t; + binder_inner_proc_unlock(thread->proc); } else { binder_free_transaction(t); } @@ -4018,8 +4078,10 @@ static unsigned int binder_poll(struct file *filp, thread = binder_get_thread(proc); + binder_inner_proc_lock(thread->proc); wait_for_proc_work = thread->transaction_stack == NULL && - binder_worklist_empty(proc, &thread->todo); + binder_worklist_empty_ilocked(&thread->todo); + binder_inner_proc_unlock(thread->proc); binder_unlock(__func__); From 814ce251cb48216fd254fb78db26fba9e363b953 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 25 May 2017 17:35:02 -0700 Subject: [PATCH 0930/3220] FROMLIST: binder: use inner lock to protect thread accounting (from https://patchwork.kernel.org/patch/9817763/) Use the inner lock to protect thread accounting fields in proc structure: max_threads, requested_threads, requested_threads_started and ready_threads. Change-Id: I5a17eb68812702f803d4e2806e7887de0b3af18e Signed-off-by: Todd Kjos --- drivers/android/binder.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 328ec2cd7956..fdafd7df9171 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -498,11 +498,15 @@ enum binder_deferred_state { * @delivered_death: list of delivered death notification * (protected by @inner_lock) * @max_threads: cap on number of binder threads + * (protected by @inner_lock) * @requested_threads: number of binder threads requested but not * yet started. In current implementation, can * only be 0 or 1. 
+ * (protected by @inner_lock) * @requested_threads_started: number binder threads started + * (protected by @inner_lock) * @ready_threads: number of threads waiting for proc work + * (protected by @inner_lock) * @tmp_ref: temporary reference to indicate proc is in use * (protected by @inner_lock) * @default_priority: default scheduler priority @@ -3235,6 +3239,7 @@ static int binder_thread_write(struct binder_proc *proc, binder_debug(BINDER_DEBUG_THREADS, "%d:%d BC_REGISTER_LOOPER\n", proc->pid, thread->pid); + binder_inner_proc_lock(proc); if (thread->looper & BINDER_LOOPER_STATE_ENTERED) { thread->looper |= BINDER_LOOPER_STATE_INVALID; binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called after BC_ENTER_LOOPER\n", @@ -3248,6 +3253,7 @@ proc->requested_threads_started++; } thread->looper |= BINDER_LOOPER_STATE_REGISTERED; + binder_inner_proc_unlock(proc); break; case BC_ENTER_LOOPER: binder_debug(BINDER_DEBUG_THREADS, @@ -3524,11 +3530,11 @@ retry: binder_inner_proc_lock(proc); wait_for_proc_work = thread->transaction_stack == NULL && binder_worklist_empty_ilocked(&thread->todo); + if (wait_for_proc_work) + proc->ready_threads++; binder_inner_proc_unlock(proc); thread->looper |= BINDER_LOOPER_STATE_WAITING; - if (wait_for_proc_work) - proc->ready_threads++; binder_unlock(__func__); @@ -3559,8 +3565,10 @@ retry: binder_lock(__func__); + binder_inner_proc_lock(proc); if (wait_for_proc_work) proc->ready_threads--; + binder_inner_proc_unlock(proc); thread->looper &= ~BINDER_LOOPER_STATE_WAITING; if (ret) @@ -3850,19 +3858,22 @@ retry: done: *consumed = ptr - buffer; + binder_inner_proc_lock(proc); if (proc->requested_threads + proc->ready_threads == 0 && proc->requested_threads_started < proc->max_threads && (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) /* the user-space code fails to */ /*spawn a new thread if we leave this out */) { proc->requested_threads++; + binder_inner_proc_unlock(proc); binder_debug(BINDER_DEBUG_THREADS, "%d:%d BR_SPAWN_LOOPER\n", proc->pid, thread->pid); if (put_user(BR_SPAWN_LOOPER, (uint32_t __user *)buffer)) return -EFAULT; binder_stat_br(proc, thread, BR_SPAWN_LOOPER); - } + } else + binder_inner_proc_unlock(proc); return 0; } @@ -4242,12 +4253,19 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (ret) goto err; break; - case BINDER_SET_MAX_THREADS: - if (copy_from_user(&proc->max_threads, ubuf, sizeof(proc->max_threads))) { + case BINDER_SET_MAX_THREADS: { + int max_threads; + + if (copy_from_user(&max_threads, ubuf, + sizeof(max_threads))) { ret = -EINVAL; goto err; } + binder_inner_proc_lock(proc); + proc->max_threads = max_threads; + binder_inner_proc_unlock(proc); break; + } case BINDER_SET_CONTEXT_MGR: ret = binder_ioctl_set_ctx_mgr(filp); if (ret) From 6fcb2b9ac4aed1946358073c3a2802aa8bd57c3d Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 20 Oct 2016 16:43:34 -0700 Subject: [PATCH 0931/3220] FROMLIST: binder: protect binder_ref with outer lock (from https://patchwork.kernel.org/patch/9817771/) Use proc->outer_lock to protect the binder_ref structure. The outer lock allows functions operating on the binder_ref to do nested acquires of node and inner locks as necessary to attach refs to nodes atomically. Binder refs must never be accessed without holding the outer lock.
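One consequence of the outer lock being a spinlock is visible in binder_inc_ref_for_node(): the ref must be allocated with the lock dropped and the lookup repeated afterwards. A minimal, self-contained userspace sketch of that pattern (hypothetical names; a linked list stands in for the refs rb-trees):

#include <pthread.h>
#include <stdlib.h>

struct ref {
	int node_id;
	struct ref *next;
};

struct proc_obj {
	pthread_mutex_t outer_lock;	/* stands in for proc->outer_lock */
	struct ref *refs;		/* stands in for refs_by_desc/refs_by_node */
};

/* caller must hold p->outer_lock */
static struct ref *lookup_ref_locked(struct proc_obj *p, int node_id)
{
	struct ref *r;

	for (r = p->refs; r; r = r->next)
		if (r->node_id == node_id)
			return r;
	return NULL;
}

static struct ref *get_or_create_ref(struct proc_obj *p, int node_id)
{
	struct ref *r, *new_ref = NULL;

	pthread_mutex_lock(&p->outer_lock);
	r = lookup_ref_locked(p, node_id);
	if (!r) {
		/* in the driver the allocation may sleep, so it must
		 * happen with the spinlock dropped */
		pthread_mutex_unlock(&p->outer_lock);
		new_ref = calloc(1, sizeof(*new_ref));
		if (!new_ref)
			return NULL;
		pthread_mutex_lock(&p->outer_lock);
		/* look up again: another thread may have raced us here */
		r = lookup_ref_locked(p, node_id);
		if (!r) {
			new_ref->node_id = node_id;
			new_ref->next = p->refs;
			p->refs = new_ref;
			r = new_ref;
		}
	}
	pthread_mutex_unlock(&p->outer_lock);
	if (new_ref && r != new_ref)
		free(new_ref);	/* lost the insertion race */
	return r;
}

The re-lookup makes the race benign: whichever thread inserts first wins, and the loser frees its unused allocation, exactly as binder_inc_ref_for_node() kfree()s a redundant new_ref.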
Change-Id: Icf6add0eddf70473b39239960b2d9a524775b53a Signed-off-by: Todd Kjos --- drivers/android/binder.c | 133 ++++++++++++++++++++++++--------------- 1 file changed, 83 insertions(+), 50 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index fdafd7df9171..2ef3022d969d 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -475,7 +475,9 @@ enum binder_deferred_state { * this proc ordered by node->ptr * (protected by @inner_lock) * @refs_by_desc: rbtree of refs ordered by ref->desc + * (protected by @outer_lock) * @refs_by_node: rbtree of refs ordered by ref->node + * (protected by @outer_lock) * @pid PID of group_leader of process * (invariant after initialized) * @tsk task_struct for group_leader of process @@ -1270,8 +1272,8 @@ static void binder_put_node(struct binder_node *node) binder_dec_node_tmpref(node); } -static struct binder_ref *binder_get_ref(struct binder_proc *proc, - u32 desc, bool need_strong_ref) +static struct binder_ref *binder_get_ref_olocked(struct binder_proc *proc, + u32 desc, bool need_strong_ref) { struct rb_node *n = proc->refs_by_desc.rb_node; struct binder_ref *ref; @@ -1294,7 +1296,7 @@ static struct binder_ref *binder_get_ref(struct binder_proc *proc, } /** - * binder_get_ref_for_node() - get the ref associated with given node + * binder_get_ref_for_node_olocked() - get the ref associated with given node * @proc: binder_proc that owns the ref * @node: binder_node of target * @new_ref: newly allocated binder_ref to be initialized or %NULL @@ -1311,9 +1313,10 @@ static struct binder_ref *binder_get_ref(struct binder_proc *proc, * new_ref. new_ref must be kfree'd by the caller in * this case. */ -static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, - struct binder_node *node, - struct binder_ref *new_ref) +static struct binder_ref *binder_get_ref_for_node_olocked( + struct binder_proc *proc, + struct binder_node *node, + struct binder_ref *new_ref) { struct binder_context *context = proc->context; struct rb_node **p = &proc->refs_by_node.rb_node; @@ -1376,7 +1379,7 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, return new_ref; } -static void binder_cleanup_ref(struct binder_ref *ref) +static void binder_cleanup_ref_olocked(struct binder_ref *ref) { bool delete_node = false; @@ -1419,17 +1422,17 @@ static void binder_cleanup_ref(struct binder_ref *ref) } /** - * binder_inc_ref() - increment the ref for given handle + * binder_inc_ref_olocked() - increment the ref for given handle * @ref: ref to be incremented * @strong: if true, strong increment, else weak * @target_list: list to queue node work on * - * Increment the ref. + * Increment the ref. @ref->proc->outer_lock must be held on entry * * Return: 0, if successful, else errno */ -static int binder_inc_ref(struct binder_ref *ref, int strong, - struct list_head *target_list) +static int binder_inc_ref_olocked(struct binder_ref *ref, int strong, + struct list_head *target_list) { int ret; @@ -1458,12 +1461,9 @@ static int binder_inc_ref(struct binder_ref *ref, int strong, * * Decrement the ref. * - * TODO: kfree is avoided here since an upcoming patch - * will put this under a lock. 
- * * Return: true if ref is cleaned up and ready to be freed */ -static bool binder_dec_ref(struct binder_ref *ref, int strong) +static bool binder_dec_ref_olocked(struct binder_ref *ref, int strong) { if (strong) { if (ref->data.strong == 0) { @@ -1487,13 +1487,7 @@ static bool binder_dec_ref(struct binder_ref *ref, int strong) ref->data.weak--; } if (ref->data.strong == 0 && ref->data.weak == 0) { - binder_cleanup_ref(ref); - /* - * TODO: we could kfree(ref) here, but an upcoming - * patch will call this with a lock held, so we - * return an indication that the ref should be - * freed. - */ + binder_cleanup_ref_olocked(ref); return true; } return false; @@ -1518,7 +1512,8 @@ static struct binder_node *binder_get_node_from_ref( struct binder_node *node; struct binder_ref *ref; - ref = binder_get_ref(proc, desc, need_strong_ref); + binder_proc_lock(proc); + ref = binder_get_ref_olocked(proc, desc, need_strong_ref); if (!ref) goto err_no_ref; node = ref->node; @@ -1529,10 +1524,12 @@ static struct binder_node *binder_get_node_from_ref( binder_inc_node_tmpref(node); if (rdata) *rdata = ref->data; + binder_proc_unlock(proc); return node; err_no_ref: + binder_proc_unlock(proc); return NULL; } @@ -1572,24 +1569,27 @@ static int binder_update_ref_for_handle(struct binder_proc *proc, struct binder_ref *ref; bool delete_ref = false; - ref = binder_get_ref(proc, desc, strong); + binder_proc_lock(proc); + ref = binder_get_ref_olocked(proc, desc, strong); if (!ref) { ret = -EINVAL; goto err_no_ref; } if (increment) - ret = binder_inc_ref(ref, strong, NULL); + ret = binder_inc_ref_olocked(ref, strong, NULL); else - delete_ref = binder_dec_ref(ref, strong); + delete_ref = binder_dec_ref_olocked(ref, strong); if (rdata) *rdata = ref->data; + binder_proc_unlock(proc); if (delete_ref) binder_free_ref(ref); return ret; err_no_ref: + binder_proc_unlock(proc); return ret; } @@ -1634,15 +1634,19 @@ static int binder_inc_ref_for_node(struct binder_proc *proc, struct binder_ref *new_ref = NULL; int ret = 0; - ref = binder_get_ref_for_node(proc, node, NULL); + binder_proc_lock(proc); + ref = binder_get_ref_for_node_olocked(proc, node, NULL); if (!ref) { + binder_proc_unlock(proc); new_ref = kzalloc(sizeof(*ref), GFP_KERNEL); if (!new_ref) return -ENOMEM; - ref = binder_get_ref_for_node(proc, node, new_ref); + binder_proc_lock(proc); + ref = binder_get_ref_for_node_olocked(proc, node, new_ref); } - ret = binder_inc_ref(ref, strong, target_list); + ret = binder_inc_ref_olocked(ref, strong, target_list); *rdata = ref->data; + binder_proc_unlock(proc); if (new_ref && ref != new_ref) /* * Another thread created the ref first so @@ -2498,11 +2502,14 @@ static void binder_transaction(struct binder_proc *proc, * stays alive until the transaction is * done. 
*/ - ref = binder_get_ref(proc, tr->target.handle, true); + binder_proc_lock(proc); + ref = binder_get_ref_olocked(proc, tr->target.handle, + true); if (ref) { binder_inc_node(ref->node, 1, 0, NULL); target_node = ref->node; } + binder_proc_unlock(proc); if (target_node == NULL) { binder_user_error("%d:%d got transaction to invalid handle\n", proc->pid, thread->pid); @@ -3278,7 +3285,7 @@ static int binder_thread_write(struct binder_proc *proc, uint32_t target; binder_uintptr_t cookie; struct binder_ref *ref; - struct binder_ref_death *death; + struct binder_ref_death *death = NULL; if (get_user(target, (uint32_t __user *)ptr)) return -EFAULT; @@ -3286,7 +3293,29 @@ static int binder_thread_write(struct binder_proc *proc, if (get_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); - ref = binder_get_ref(proc, target, false); + if (cmd == BC_REQUEST_DEATH_NOTIFICATION) { + /* + * Allocate memory for death notification + * before taking lock + */ + death = kzalloc(sizeof(*death), GFP_KERNEL); + if (death == NULL) { + WARN_ON(thread->return_error.cmd != + BR_OK); + thread->return_error.cmd = BR_ERROR; + binder_enqueue_work( + thread->proc, + &thread->return_error.work, + &thread->todo); + binder_debug( + BINDER_DEBUG_FAILED_TRANSACTION, + "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n", + proc->pid, thread->pid); + break; + } + } + binder_proc_lock(proc); + ref = binder_get_ref_olocked(proc, target, false); if (ref == NULL) { binder_user_error("%d:%d %s invalid ref %d\n", proc->pid, thread->pid, @@ -3294,6 +3323,8 @@ static int binder_thread_write(struct binder_proc *proc, "BC_REQUEST_DEATH_NOTIFICATION" : "BC_CLEAR_DEATH_NOTIFICATION", target); + binder_proc_unlock(proc); + kfree(death); break; } @@ -3311,20 +3342,8 @@ static int binder_thread_write(struct binder_proc *proc, if (ref->death) { binder_user_error("%d:%d BC_REQUEST_DEATH_NOTIFICATION death notification already set\n", proc->pid, thread->pid); - break; - } - death = kzalloc(sizeof(*death), GFP_KERNEL); - if (death == NULL) { - WARN_ON(thread->return_error.cmd != - BR_OK); - thread->return_error.cmd = BR_ERROR; - binder_enqueue_work( - thread->proc, - &thread->return_error.work, - &thread->todo); - binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, - "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n", - proc->pid, thread->pid); + binder_proc_unlock(proc); + kfree(death); break; } binder_stats_created(BINDER_STAT_DEATH); @@ -3357,6 +3376,7 @@ static int binder_thread_write(struct binder_proc *proc, binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification not active\n", proc->pid, thread->pid); binder_node_unlock(ref->node); + binder_proc_unlock(proc); break; } death = ref->death; @@ -3366,6 +3386,7 @@ static int binder_thread_write(struct binder_proc *proc, (u64)death->cookie, (u64)cookie); binder_node_unlock(ref->node); + binder_proc_unlock(proc); break; } ref->death = NULL; @@ -3392,6 +3413,7 @@ static int binder_thread_write(struct binder_proc *proc, binder_inner_proc_unlock(proc); binder_node_unlock(ref->node); } + binder_proc_unlock(proc); } break; case BC_DEAD_BINDER_DONE: { struct binder_work *w; @@ -4602,14 +4624,18 @@ static void binder_deferred_release(struct binder_proc *proc) binder_inner_proc_unlock(proc); outgoing_refs = 0; + binder_proc_lock(proc); while ((n = rb_first(&proc->refs_by_desc))) { struct binder_ref *ref; ref = rb_entry(n, struct binder_ref, rb_node_desc); outgoing_refs++; - binder_cleanup_ref(ref); + binder_cleanup_ref_olocked(ref); + 
binder_proc_unlock(proc); binder_free_ref(ref); binder_proc_lock(proc); } + binder_proc_unlock(proc); binder_release_work(proc, &proc->todo); binder_release_work(proc, &proc->delivered_death); @@ -4817,8 +4843,10 @@ static void print_binder_node_nilocked(struct seq_file *m, } } -static void print_binder_ref(struct seq_file *m, struct binder_ref *ref) +static void print_binder_ref_olocked(struct seq_file *m, + struct binder_ref *ref) { + WARN_ON(!spin_is_locked(&ref->proc->outer_lock)); binder_node_lock(ref->node); seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %pK\n", ref->data.debug_id, ref->data.desc, @@ -4870,11 +4898,14 @@ static void print_binder_proc(struct seq_file *m, binder_put_node(last_node); if (print_all) { + binder_proc_lock(proc); for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) - print_binder_ref(m, rb_entry(n, struct binder_ref, - rb_node_desc)); + print_binder_ref_olocked(m, rb_entry(n, + struct binder_ref, + rb_node_desc)); + binder_proc_unlock(proc); } binder_alloc_print_allocated(m, &proc->alloc); binder_inner_proc_lock(proc); @@ -5014,6 +5045,7 @@ static void print_binder_proc_stats(struct seq_file *m, count = 0; strong = 0; weak = 0; + binder_proc_lock(proc); for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) { struct binder_ref *ref = rb_entry(n, struct binder_ref, rb_node_desc); @@ -5021,6 +5053,7 @@ strong += ref->data.strong; weak += ref->data.weak; } + binder_proc_unlock(proc); seq_printf(m, " refs: %d s %d w %d\n", count, strong, weak); count = binder_alloc_get_allocated_count(&proc->alloc); From da957e45dd21fee345544dadf0fc426abe503d28 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 21 Apr 2017 14:32:11 -0700 Subject: [PATCH 0932/3220] FROMLIST: binder: protect against stale pointers in print_binder_transaction (from https://patchwork.kernel.org/patch/9817761/) When printing transactions, there were several race conditions that could cause a stale pointer to be dereferenced. Fixed by reading the pointer once and using it if valid (which is safe). The transaction buffer also needed protection via proc lock, so it is only printed if we are holding the correct lock. Bug: 36650912 Test: tested manually Change-Id: I78240f99cc1a070d70a841c0d84d4306e2fd528d Signed-off-by: Todd Kjos --- drivers/android/binder.c | 60 ++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 2ef3022d969d..2e5016879654 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -4703,35 +4703,52 @@ binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer) mutex_unlock(&binder_deferred_lock); } -static void print_binder_transaction(struct seq_file *m, const char *prefix, - struct binder_transaction *t) +static void print_binder_transaction_ilocked(struct seq_file *m, + struct binder_proc *proc, + const char *prefix, + struct binder_transaction *t) { + struct binder_proc *to_proc; + struct binder_buffer *buffer = t->buffer; + + WARN_ON(!spin_is_locked(&proc->inner_lock)); spin_lock(&t->lock); + to_proc = t->to_proc; seq_printf(m, "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %ld r%d", prefix, t->debug_id, t, t->from ? t->from->proc->pid : 0, t->from ? t->from->pid : 0, - t->to_proc ? t->to_proc->pid : 0, + to_proc ? to_proc->pid : 0, t->to_thread ?
t->to_thread->pid : 0, t->code, t->flags, t->priority, t->need_reply); spin_unlock(&t->lock); - if (t->buffer == NULL) { + if (proc != to_proc) { + /* + * Can only safely deref buffer if we are holding the + * correct proc inner lock for this node + */ + seq_puts(m, "\n"); + return; + } + + if (buffer == NULL) { seq_puts(m, " buffer free\n"); return; } - if (t->buffer->target_node) - seq_printf(m, " node %d", - t->buffer->target_node->debug_id); + if (buffer->target_node) + seq_printf(m, " node %d", buffer->target_node->debug_id); seq_printf(m, " size %zd:%zd data %p\n", - t->buffer->data_size, t->buffer->offsets_size, - t->buffer->data); + buffer->data_size, buffer->offsets_size, + buffer->data); } -static void print_binder_work_ilocked(struct seq_file *m, const char *prefix, - const char *transaction_prefix, - struct binder_work *w) +static void print_binder_work_ilocked(struct seq_file *m, + struct binder_proc *proc, + const char *prefix, + const char *transaction_prefix, + struct binder_work *w) { struct binder_node *node; struct binder_transaction *t; @@ -4739,7 +4756,8 @@ static void print_binder_work_ilocked(struct seq_file *m, const char *prefix, switch (w->type) { case BINDER_WORK_TRANSACTION: t = container_of(w, struct binder_transaction, work); - print_binder_transaction(m, transaction_prefix, t); + print_binder_transaction_ilocked( + m, proc, transaction_prefix, t); break; case BINDER_WORK_RETURN_ERROR: { struct binder_error *e = container_of( @@ -4790,20 +4808,21 @@ static void print_binder_thread_ilocked(struct seq_file *m, t = thread->transaction_stack; while (t) { if (t->from == thread) { - print_binder_transaction(m, - " outgoing transaction", t); + print_binder_transaction_ilocked(m, thread->proc, + " outgoing transaction", t); t = t->from_parent; } else if (t->to_thread == thread) { - print_binder_transaction(m, + print_binder_transaction_ilocked(m, thread->proc, " incoming transaction", t); t = t->to_parent; } else { - print_binder_transaction(m, " bad transaction", t); + print_binder_transaction_ilocked(m, thread->proc, + " bad transaction", t); t = NULL; } } list_for_each_entry(w, &thread->todo, entry) { - print_binder_work_ilocked(m, " ", + print_binder_work_ilocked(m, thread->proc, " ", " pending transaction", w); } if (!print_always && m->count == header_pos) @@ -4838,7 +4857,7 @@ static void print_binder_node_nilocked(struct seq_file *m, seq_puts(m, "\n"); if (node->proc) { list_for_each_entry(w, &node->async_todo, entry) - print_binder_work_ilocked(m, " ", + print_binder_work_ilocked(m, node->proc, " ", " pending async transaction", w); } } @@ -4910,7 +4929,8 @@ static void print_binder_proc(struct seq_file *m, binder_alloc_print_allocated(m, &proc->alloc); binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->todo, entry) - print_binder_work_ilocked(m, " ", " pending transaction", w); + print_binder_work_ilocked(m, proc, " ", + " pending transaction", w); list_for_each_entry(w, &proc->delivered_death, entry) { seq_puts(m, " has delivered dead binder\n"); break; From 6c8ad5b3f0366b8b175d8ee4223fc8dbafc39991 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Mon, 22 May 2017 11:26:23 -0700 Subject: [PATCH 0933/3220] FROMLIST: binder: fix death race conditions (from https://patchwork.kernel.org/patch/9817765/) A race existed where one thread could register a death notification for a node, while another thread was cleaning up that node and sending out death notifications for its references, causing simultaneous access to ref->death because different locks 
were held. Test: boots, manual testing Change-Id: Iff73312f34f70374f417beba4c4c82dd33cac119 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 64 ++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 2e5016879654..a161e55d1373 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -442,6 +442,7 @@ struct binder_ref_data { * ref for deletion in binder_cleanup_ref, a non-NULL * @node indicates the node must be freed * @death: pointer to death notification (ref_death) if requested + * (protected by @node->lock) * * Structure to track references from procA to target node (on procB). This * structure is unsafe to access without holding @proc->outer_lock. @@ -3338,10 +3339,12 @@ static int binder_thread_write(struct binder_proc *proc, ref->data.desc, ref->data.strong, ref->data.weak, ref->node->debug_id); + binder_node_lock(ref->node); if (cmd == BC_REQUEST_DEATH_NOTIFICATION) { if (ref->death) { binder_user_error("%d:%d BC_REQUEST_DEATH_NOTIFICATION death notification already set\n", proc->pid, thread->pid); + binder_node_unlock(ref->node); binder_proc_unlock(proc); kfree(death); break; @@ -3350,7 +3353,6 @@ static int binder_thread_write(struct binder_proc *proc, INIT_LIST_HEAD(&death->work.entry); death->cookie = cookie; ref->death = death; - binder_node_lock(ref->node); if (ref->node->proc == NULL) { ref->death->work.type = BINDER_WORK_DEAD_BINDER; if (thread->looper & @@ -3369,9 +3371,7 @@ static int binder_thread_write(struct binder_proc *proc, &proc->wait); } } - binder_node_unlock(ref->node); } else { - binder_node_lock(ref->node); if (ref->death == NULL) { binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification not active\n", proc->pid, thread->pid); @@ -3411,8 +3411,8 @@ static int binder_thread_write(struct binder_proc *proc, death->work.type = BINDER_WORK_DEAD_BINDER_AND_CLEAR; } binder_inner_proc_unlock(proc); - binder_node_unlock(ref->node); } + binder_node_unlock(ref->node); binder_proc_unlock(proc); } break; case BC_DEAD_BINDER_DONE: { @@ -3749,44 +3749,39 @@ retry: case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: { struct binder_ref_death *death; uint32_t cmd; + binder_uintptr_t cookie; death = container_of(w, struct binder_ref_death, work); if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) cmd = BR_CLEAR_DEATH_NOTIFICATION_DONE; else cmd = BR_DEAD_BINDER; - /* - * TODO: there is a race condition between - * death notification requests and delivery - * of the notifications. This will be handled - * in a later patch. - */ - binder_inner_proc_unlock(proc); - if (put_user(cmd, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - if (put_user(death->cookie, - (binder_uintptr_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(binder_uintptr_t); - binder_stat_br(proc, thread, cmd); + cookie = death->cookie; + binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION, "%d:%d %s %016llx\n", proc->pid, thread->pid, cmd == BR_DEAD_BINDER ? 
"BR_DEAD_BINDER" : "BR_CLEAR_DEATH_NOTIFICATION_DONE", - (u64)death->cookie); - + (u64)cookie); if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) { + binder_inner_proc_unlock(proc); kfree(death); binder_stats_deleted(BINDER_STAT_DEATH); } else { - binder_inner_proc_lock(proc); binder_enqueue_work_ilocked( w, &proc->delivered_death); binder_inner_proc_unlock(proc); } + if (put_user(cmd, (uint32_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(uint32_t); + if (put_user(cookie, + (binder_uintptr_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(binder_uintptr_t); + binder_stat_br(proc, thread, cmd); if (cmd == BR_DEAD_BINDER) goto done; /* DEAD_BINDER notifications can cause transactions */ } break; @@ -4536,20 +4531,25 @@ static int binder_node_release(struct binder_node *node, int refs) hlist_for_each_entry(ref, &node->refs, node_entry) { refs++; - - if (!ref->death) + /* + * Need the node lock to synchronize + * with new notification requests and the + * inner lock to synchronize with queued + * death notifications. + */ + binder_inner_proc_lock(ref->proc); + if (!ref->death) { + binder_inner_proc_unlock(ref->proc); continue; + } death++; - binder_inner_proc_lock(ref->proc); - if (list_empty(&ref->death->work.entry)) { - ref->death->work.type = BINDER_WORK_DEAD_BINDER; - binder_enqueue_work_ilocked(&ref->death->work, - &ref->proc->todo); - wake_up_interruptible(&ref->proc->wait); - } else - BUG(); + BUG_ON(!list_empty(&ref->death->work.entry)); + ref->death->work.type = BINDER_WORK_DEAD_BINDER; + binder_enqueue_work_ilocked(&ref->death->work, + &ref->proc->todo); + wake_up_interruptible(&ref->proc->wait); binder_inner_proc_unlock(ref->proc); } From 8881f118f526914c4705db04e6ea2e8e5f7023d4 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 14 Nov 2016 11:37:41 -0800 Subject: [PATCH 0934/3220] FROMLIST: binder: remove global binder lock (from https://patchwork.kernel.org/patch/9817773/) Remove global mutex and rely on fine-grained locking Change-Id: Idd1ae2e52d654e5dd76d443a1ff97522e687fd4c Signed-off-by: Todd Kjos --- drivers/android/binder.c | 46 +++------------------------------------- 1 file changed, 3 insertions(+), 43 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index a161e55d1373..5753665f76ee 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -79,8 +79,6 @@ #include "binder_alloc.h" #include "binder_trace.h" -static DEFINE_MUTEX(binder_main_lock); - static HLIST_HEAD(binder_deferred_list); static DEFINE_MUTEX(binder_deferred_lock); @@ -924,19 +922,6 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd) return retval; } -static inline void binder_lock(const char *tag) -{ - trace_binder_lock(tag); - mutex_lock(&binder_main_lock); - trace_binder_locked(tag); -} - -static inline void binder_unlock(const char *tag) -{ - trace_binder_unlock(tag); - mutex_unlock(&binder_main_lock); -} - static void binder_set_nice(long nice) { long min_nice; @@ -3558,8 +3543,6 @@ retry: thread->looper |= BINDER_LOOPER_STATE_WAITING; - binder_unlock(__func__); - trace_binder_wait_for_work(wait_for_proc_work, !!thread->transaction_stack, !binder_worklist_empty(proc, &thread->todo)); @@ -3585,8 +3568,6 @@ retry: ret = wait_event_freezable(thread->wait, binder_has_thread_work(thread)); } - binder_lock(__func__); - binder_inner_proc_lock(proc); if (wait_for_proc_work) proc->ready_threads--; @@ -4102,8 +4083,6 @@ static unsigned int binder_poll(struct file *filp, struct binder_thread *thread = NULL; int wait_for_proc_work; - 
binder_lock(__func__); - thread = binder_get_thread(proc); binder_inner_proc_lock(thread->proc); @@ -4111,8 +4090,6 @@ static unsigned int binder_poll(struct file *filp, binder_worklist_empty_ilocked(&thread->todo); binder_inner_proc_unlock(thread->proc); - binder_unlock(__func__); - if (wait_for_proc_work) { if (binder_has_proc_work(proc, thread)) return POLLIN; @@ -4257,7 +4234,6 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (ret) goto err_unlocked; - binder_lock(__func__); thread = binder_get_thread(proc); if (thread == NULL) { ret = -ENOMEM; @@ -4316,7 +4292,6 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err: if (thread) thread->looper_need_return = false; - binder_unlock(__func__); wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); if (ret && ret != -ERESTARTSYS) pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret); @@ -4422,15 +4397,11 @@ static int binder_open(struct inode *nodp, struct file *filp) proc->context = &binder_dev->context; binder_alloc_init(&proc->alloc); - binder_lock(__func__); - binder_stats_created(BINDER_STAT_PROC); proc->pid = current->group_leader->pid; INIT_LIST_HEAD(&proc->delivered_death); filp->private_data = proc; - binder_unlock(__func__); - mutex_lock(&binder_procs_lock); hlist_add_head(&proc->proc_node, &binder_procs); mutex_unlock(&binder_procs_lock); @@ -4656,7 +4627,6 @@ static void binder_deferred_func(struct work_struct *work) int defer; do { - binder_lock(__func__); mutex_lock(&binder_deferred_lock); if (!hlist_empty(&binder_deferred_list)) { proc = hlist_entry(binder_deferred_list.first, @@ -4683,7 +4653,6 @@ static void binder_deferred_func(struct work_struct *work) if (defer & BINDER_DEFERRED_RELEASE) binder_deferred_release(proc); /* frees proc */ - binder_unlock(__func__); if (files) put_files_struct(files); } while (proc); @@ -5098,8 +5067,6 @@ static int binder_state_show(struct seq_file *m, void *unused) struct binder_node *node; struct binder_node *last_node = NULL; - binder_lock(__func__); - seq_puts(m, "binder state:\n"); spin_lock(&binder_dead_nodes_lock); @@ -5129,7 +5096,7 @@ static int binder_state_show(struct seq_file *m, void *unused) hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, 1); mutex_unlock(&binder_procs_lock); - binder_unlock(__func__); + return 0; } @@ -5137,8 +5104,6 @@ static int binder_stats_show(struct seq_file *m, void *unused) { struct binder_proc *proc; - binder_lock(__func__); - seq_puts(m, "binder stats:\n"); print_binder_stats(m, "", &binder_stats); @@ -5147,7 +5112,7 @@ static int binder_stats_show(struct seq_file *m, void *unused) hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc_stats(m, proc); mutex_unlock(&binder_procs_lock); - binder_unlock(__func__); + return 0; } @@ -5155,14 +5120,12 @@ static int binder_transactions_show(struct seq_file *m, void *unused) { struct binder_proc *proc; - binder_lock(__func__); - seq_puts(m, "binder transactions:\n"); mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, 0); mutex_unlock(&binder_procs_lock); - binder_unlock(__func__); + return 0; } @@ -5171,8 +5134,6 @@ static int binder_proc_show(struct seq_file *m, void *unused) struct binder_proc *itr; int pid = (unsigned long)m->private; - binder_lock(__func__); - mutex_lock(&binder_procs_lock); hlist_for_each_entry(itr, &binder_procs, proc_node) { if (itr->pid == pid) { @@ 
-5182,7 +5143,6 @@ static int binder_proc_show(struct seq_file *m, void *unused) } mutex_unlock(&binder_procs_lock); - binder_unlock(__func__); return 0; } From c9cd6356f9631c9489b850c3e04d15071899fa6b Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 2 Jun 2017 11:15:44 -0700 Subject: [PATCH 0935/3220] ANDROID: binder: remove proc waitqueue Removes the process waitqueue, so that threads can only wait on the thread waitqueue. Whenever there is process work to do, pick a thread and wake it up. This also fixes an issue with using epoll(), since we no longer have to block on different waitqueues. Bug: 34461621 Change-Id: I2950b9de6fa078ee72d53c667a03cbaf587f0849 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 257 +++++++++++++++++++++++++++------------ 1 file changed, 182 insertions(+), 75 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 5753665f76ee..05185b7b1343 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -28,10 +28,10 @@ * binder_node_lock() and binder_node_unlock() are * used to acq/rel * 3) proc->inner_lock : protects the thread and node lists - * (proc->threads, proc->nodes) and all todo lists associated - * with the binder_proc (proc->todo, thread->todo, - * proc->delivered_death and node->async_todo), as well as - * thread->transaction_stack + * (proc->threads, proc->waiting_threads, proc->nodes) + * and all todo lists associated with the binder_proc + * (proc->todo, thread->todo, proc->delivered_death and + * node->async_todo), as well as thread->transaction_stack * binder_inner_proc_lock() and binder_inner_proc_unlock() * are used to acq/rel * @@ -477,6 +477,8 @@ enum binder_deferred_state { * (protected by @outer_lock) * @refs_by_node: rbtree of refs ordered by ref->node * (protected by @outer_lock) + * @waiting_threads: threads currently waiting for proc work + * (protected by @inner_lock) * @pid PID of group_leader of process * (invariant after initialized) * @tsk task_struct for group_leader of process @@ -506,8 +508,6 @@ enum binder_deferred_state { * (protected by @inner_lock) * @requested_threads_started: number binder threads started * (protected by @inner_lock) - * @ready_threads: number of threads waiting for proc work - * (protected by @inner_lock) * @tmp_ref: temporary reference to indicate proc is in use * (protected by @inner_lock) * @default_priority: default scheduler priority @@ -528,6 +528,7 @@ struct binder_proc { struct rb_root nodes; struct rb_root refs_by_desc; struct rb_root refs_by_node; + struct list_head waiting_threads; int pid; struct task_struct *tsk; struct files_struct *files; @@ -542,7 +543,6 @@ struct binder_proc { int max_threads; int requested_threads; int requested_threads_started; - int ready_threads; int tmp_ref; long default_priority; struct dentry *debugfs_entry; @@ -558,6 +558,7 @@ enum { BINDER_LOOPER_STATE_EXITED = 0x04, BINDER_LOOPER_STATE_INVALID = 0x08, BINDER_LOOPER_STATE_WAITING = 0x10, + BINDER_LOOPER_STATE_POLL = 0x20, }; /** @@ -566,6 +567,8 @@ enum { * (invariant after initialization) * @rb_node: element for proc->threads rbtree * (protected by @proc->inner_lock) + * @waiting_thread_node: element for @proc->waiting_threads list + * (protected by @proc->inner_lock) * @pid: PID for this thread * (invariant after initialization) * @looper: bitmap of looping state @@ -595,6 +598,7 @@ enum { struct binder_thread { struct binder_proc *proc; struct rb_node rb_node; + struct list_head waiting_thread_node; int pid; int looper; /* only modified by this thread 
*/ bool looper_need_return; /* can be written by other thread */ @@ -922,6 +926,86 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd) return retval; } +static bool binder_has_work_ilocked(struct binder_thread *thread, + bool do_proc_work) +{ + return !binder_worklist_empty_ilocked(&thread->todo) || + thread->looper_need_return || + (do_proc_work && + !binder_worklist_empty_ilocked(&thread->proc->todo)); +} + +static bool binder_has_work(struct binder_thread *thread, bool do_proc_work) +{ + bool has_work; + + binder_inner_proc_lock(thread->proc); + has_work = binder_has_work_ilocked(thread, do_proc_work); + binder_inner_proc_unlock(thread->proc); + + return has_work; +} + +static bool binder_available_for_proc_work_ilocked(struct binder_thread *thread) +{ + return !thread->transaction_stack && + binder_worklist_empty_ilocked(&thread->todo) && + (thread->looper & (BINDER_LOOPER_STATE_ENTERED | + BINDER_LOOPER_STATE_REGISTERED)); +} + +static void binder_wakeup_poll_threads_ilocked(struct binder_proc *proc, + bool sync) +{ + struct rb_node *n; + struct binder_thread *thread; + + for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) { + thread = rb_entry(n, struct binder_thread, rb_node); + if (thread->looper & BINDER_LOOPER_STATE_POLL && + binder_available_for_proc_work_ilocked(thread)) { + if (sync) + wake_up_interruptible_sync(&thread->wait); + else + wake_up_interruptible(&thread->wait); + } + } +} + +static void binder_wakeup_proc_ilocked(struct binder_proc *proc, bool sync) +{ + struct binder_thread *thread; + + BUG_ON(!spin_is_locked(&proc->inner_lock)); + thread = list_first_entry_or_null(&proc->waiting_threads, + struct binder_thread, + waiting_thread_node); + + if (thread) { + list_del_init(&thread->waiting_thread_node); + if (sync) + wake_up_interruptible_sync(&thread->wait); + else + wake_up_interruptible(&thread->wait); + return; + } + + /* Didn't find a thread waiting for proc work; this can happen + * in two scenarios: + * 1. All threads are busy handling transactions + * In that case, one of those threads should call back into + * the kernel driver soon and pick up this work. + * 2. Threads are using the (e)poll interface, in which case + * they may be blocked on the waitqueue without having been + * added to waiting_threads. For this case, we just iterate + * over all threads not handling transaction work, and + * wake them all up. We wake all because we don't know whether + * a thread that called into (e)poll is handling non-binder + * work currently. 
+ */ + binder_wakeup_poll_threads_ilocked(proc, sync); +} + static void binder_set_nice(long nice) { long min_nice; @@ -1141,7 +1225,7 @@ static bool binder_dec_node_nilocked(struct binder_node *node, if (proc && (node->has_strong_ref || node->has_weak_ref)) { if (list_empty(&node->work.entry)) { binder_enqueue_work_ilocked(&node->work, &proc->todo); - wake_up_interruptible(&node->proc->wait); + binder_wakeup_proc_ilocked(proc, false); } } else { if (hlist_empty(&node->refs) && !node->local_strong_refs && @@ -2402,7 +2486,6 @@ static void binder_transaction(struct binder_proc *proc, struct binder_thread *target_thread = NULL; struct binder_node *target_node = NULL; struct list_head *target_list; - wait_queue_head_t *target_wait; struct binder_transaction *in_reply_to = NULL; struct binder_transaction_log_entry *e; uint32_t return_error = 0; @@ -2412,6 +2495,7 @@ static void binder_transaction(struct binder_proc *proc, binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; int t_debug_id = atomic_inc_return(&binder_last_id); + bool wakeup_for_proc_work = false; e = binder_transaction_log_add(&binder_transaction_log); e->debug_id = t_debug_id; @@ -2575,10 +2659,9 @@ static void binder_transaction(struct binder_proc *proc, if (target_thread) { e->to_thread = target_thread->pid; target_list = &target_thread->todo; - target_wait = &target_thread->wait; } else { target_list = &target_proc->todo; - target_wait = &target_proc->wait; + wakeup_for_proc_work = true; } e->to_proc = target_proc->pid; @@ -2885,7 +2968,7 @@ static void binder_transaction(struct binder_proc *proc, binder_node_lock(target_node); if (target_node->has_async_transaction) { target_list = &target_node->async_todo; - target_wait = NULL; + wakeup_for_proc_work = false; } else target_node->has_async_transaction = 1; /* @@ -2904,11 +2987,13 @@ static void binder_transaction(struct binder_proc *proc, binder_inner_proc_unlock(target_proc); binder_node_unlock(target_node); } - if (target_wait) { - if (reply || !(tr->flags & TF_ONE_WAY)) - wake_up_interruptible_sync(target_wait); - else - wake_up_interruptible(target_wait); + if (target_thread) { + wake_up_interruptible_sync(&target_thread->wait); + } else if (wakeup_for_proc_work) { + binder_inner_proc_lock(target_proc); + binder_wakeup_proc_ilocked(target_proc, + !(tr->flags & TF_ONE_WAY)); + binder_inner_proc_unlock(target_proc); } if (target_thread) binder_thread_dec_tmpref(target_thread); @@ -3348,12 +3433,14 @@ static int binder_thread_write(struct binder_proc *proc, &ref->death->work, &thread->todo); else { - binder_enqueue_work( - proc, + binder_inner_proc_lock(proc); + binder_enqueue_work_ilocked( &ref->death->work, &proc->todo); - wake_up_interruptible( - &proc->wait); + binder_wakeup_proc_ilocked( + proc, + false); + binder_inner_proc_unlock(proc); } } } else { @@ -3388,8 +3475,9 @@ static int binder_thread_write(struct binder_proc *proc, binder_enqueue_work_ilocked( &death->work, &proc->todo); - wake_up_interruptible( - &proc->wait); + binder_wakeup_proc_ilocked( + proc, + false); } } else { BUG_ON(death->work.type != BINDER_WORK_DEAD_BINDER); @@ -3444,7 +3532,8 @@ static int binder_thread_write(struct binder_proc *proc, binder_enqueue_work_ilocked( &death->work, &proc->todo); - wake_up_interruptible(&proc->wait); + binder_wakeup_proc_ilocked( + proc, false); } } binder_inner_proc_unlock(proc); @@ -3471,13 +3560,6 @@ static void binder_stat_br(struct binder_proc *proc, } } -static int binder_has_proc_work(struct binder_proc *proc, - struct 
binder_thread *thread) -{ - return !binder_worklist_empty(proc, &proc->todo) || - thread->looper_need_return; -} - static int binder_has_thread_work(struct binder_thread *thread) { return !binder_worklist_empty(thread->proc, &thread->todo) || @@ -3515,6 +3597,38 @@ static int binder_put_node_cmd(struct binder_proc *proc, return 0; } +static int binder_wait_for_work(struct binder_thread *thread, + bool do_proc_work) +{ + DEFINE_WAIT(wait); + struct binder_proc *proc = thread->proc; + int ret = 0; + + freezer_do_not_count(); + binder_inner_proc_lock(proc); + for (;;) { + prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE); + if (binder_has_work_ilocked(thread, do_proc_work)) + break; + if (do_proc_work) + list_add(&thread->waiting_thread_node, + &proc->waiting_threads); + binder_inner_proc_unlock(proc); + schedule(); + binder_inner_proc_lock(proc); + list_del_init(&thread->waiting_thread_node); + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + } + finish_wait(&thread->wait, &wait); + binder_inner_proc_unlock(proc); + freezer_count(); + + return ret; +} + static int binder_thread_read(struct binder_proc *proc, struct binder_thread *thread, binder_uintptr_t binder_buffer, size_t size, @@ -3535,10 +3649,7 @@ static int binder_thread_read(struct binder_proc *proc, retry: binder_inner_proc_lock(proc); - wait_for_proc_work = thread->transaction_stack == NULL && - binder_worklist_empty_ilocked(&thread->todo); - if (wait_for_proc_work) - proc->ready_threads++; + wait_for_proc_work = binder_available_for_proc_work_ilocked(thread); binder_inner_proc_unlock(proc); thread->looper |= BINDER_LOOPER_STATE_WAITING; @@ -3555,23 +3666,15 @@ retry: binder_stop_on_user_error < 2); } binder_set_nice(proc->default_priority); - if (non_block) { - if (!binder_has_proc_work(proc, thread)) - ret = -EAGAIN; - } else - ret = wait_event_freezable_exclusive(proc->wait, binder_has_proc_work(proc, thread)); - } else { - if (non_block) { - if (!binder_has_thread_work(thread)) - ret = -EAGAIN; - } else - ret = wait_event_freezable(thread->wait, binder_has_thread_work(thread)); } - binder_inner_proc_lock(proc); - if (wait_for_proc_work) - proc->ready_threads--; - binder_inner_proc_unlock(proc); + if (non_block) { + if (!binder_has_work(thread, wait_for_proc_work)) + ret = -EAGAIN; + } else { + ret = binder_wait_for_work(thread, wait_for_proc_work); + } + thread->looper &= ~BINDER_LOOPER_STATE_WAITING; if (ret) @@ -3857,7 +3960,8 @@ done: *consumed = ptr - buffer; binder_inner_proc_lock(proc); - if (proc->requested_threads + proc->ready_threads == 0 && + if (proc->requested_threads == 0 && + list_empty(&thread->proc->waiting_threads) && proc->requested_threads_started < proc->max_threads && (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) /* the user-space code fails to */ @@ -3968,7 +4072,7 @@ static struct binder_thread *binder_get_thread_ilocked( thread->return_error.cmd = BR_OK; thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR; thread->reply_error.cmd = BR_OK; - + INIT_LIST_HEAD(&new_thread->waiting_thread_node); return thread; } @@ -4081,28 +4185,24 @@ static unsigned int binder_poll(struct file *filp, { struct binder_proc *proc = filp->private_data; struct binder_thread *thread = NULL; - int wait_for_proc_work; + bool wait_for_proc_work; thread = binder_get_thread(proc); binder_inner_proc_lock(thread->proc); - wait_for_proc_work = thread->transaction_stack == NULL && - binder_worklist_empty_ilocked(&thread->todo); + thread->looper |= 
BINDER_LOOPER_STATE_POLL; + wait_for_proc_work = binder_available_for_proc_work_ilocked(thread); + binder_inner_proc_unlock(thread->proc); - if (wait_for_proc_work) { - if (binder_has_proc_work(proc, thread)) - return POLLIN; - poll_wait(filp, &proc->wait, wait); - if (binder_has_proc_work(proc, thread)) - return POLLIN; - } else { - if (binder_has_thread_work(thread)) - return POLLIN; - poll_wait(filp, &thread->wait, wait); - if (binder_has_thread_work(thread)) - return POLLIN; - } + if (binder_has_work(thread, wait_for_proc_work)) + return POLLIN; + + poll_wait(filp, &thread->wait, wait); + + if (binder_has_thread_work(thread)) + return POLLIN; + return 0; } @@ -4149,8 +4249,10 @@ static int binder_ioctl_write_read(struct file *filp, &bwr.read_consumed, filp->f_flags & O_NONBLOCK); trace_binder_read_done(ret); - if (!binder_worklist_empty(proc, &proc->todo)) - wake_up_interruptible(&proc->wait); + binder_inner_proc_lock(proc); + if (!binder_worklist_empty_ilocked(&proc->todo)) + binder_wakeup_proc_ilocked(proc, false); + binder_inner_proc_unlock(proc); if (ret < 0) { if (copy_to_user(ubuf, &bwr, sizeof(bwr))) ret = -EFAULT; @@ -4390,7 +4492,6 @@ static int binder_open(struct inode *nodp, struct file *filp) get_task_struct(current->group_leader); proc->tsk = current->group_leader; INIT_LIST_HEAD(&proc->todo); - init_waitqueue_head(&proc->wait); proc->default_priority = task_nice(current); binder_dev = container_of(filp->private_data, struct binder_device, miscdev); @@ -4400,6 +4501,7 @@ static int binder_open(struct inode *nodp, struct file *filp) binder_stats_created(BINDER_STAT_PROC); proc->pid = current->group_leader->pid; INIT_LIST_HEAD(&proc->delivered_death); + INIT_LIST_HEAD(&proc->waiting_threads); filp->private_data = proc; mutex_lock(&binder_procs_lock); @@ -4451,7 +4553,6 @@ static void binder_deferred_flush(struct binder_proc *proc) } } binder_inner_proc_unlock(proc); - wake_up_interruptible_all(&proc->wait); binder_debug(BINDER_DEBUG_OPEN_CLOSE, "binder_flush: %d woke %d threads\n", proc->pid, @@ -4520,7 +4621,7 @@ static int binder_node_release(struct binder_node *node, int refs) ref->death->work.type = BINDER_WORK_DEAD_BINDER; binder_enqueue_work_ilocked(&ref->death->work, &ref->proc->todo); - wake_up_interruptible(&ref->proc->wait); + binder_wakeup_proc_ilocked(ref->proc, false); binder_inner_proc_unlock(ref->proc); } @@ -5008,23 +5109,29 @@ static void print_binder_proc_stats(struct seq_file *m, struct binder_proc *proc) { struct binder_work *w; + struct binder_thread *thread; struct rb_node *n; - int count, strong, weak; + int count, strong, weak, ready_threads; size_t free_async_space = binder_alloc_get_free_async_space(&proc->alloc); seq_printf(m, "proc %d\n", proc->pid); seq_printf(m, "context %s\n", proc->context->name); count = 0; + ready_threads = 0; binder_inner_proc_lock(proc); for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) count++; + + list_for_each_entry(thread, &proc->waiting_threads, waiting_thread_node) + ready_threads++; + seq_printf(m, " threads: %d\n", count); seq_printf(m, " requested threads: %d+%d/%d\n" " ready threads %d\n" " free async space %zd\n", proc->requested_threads, proc->requested_threads_started, proc->max_threads, - proc->ready_threads, + ready_threads, free_async_space); count = 0; for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) From 5347bf52735ec797553aa11189f03742f4015606 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Tue, 6 Jun 2017 15:17:46 -0700 Subject: [PATCH 0936/3220] ANDROID: binder: push new 
transactions to waiting threads. Instead of pushing new transactions to the process waitqueue, select a thread that is waiting on proc work to handle the transaction. This will make it easier to improve priority inheritance in future patches, by setting the priority before we wake up a thread. If we can't find a waiting thread, submit the work to the proc waitqueue instead as we did previously. Change-Id: I23cbfcca867bed7b86007e22137d0a8fad4b4001 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 181 +++++++++++++++++++++++++++------------ 1 file changed, 127 insertions(+), 54 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 05185b7b1343..e2daf6b7cbd0 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -972,7 +972,20 @@ static void binder_wakeup_poll_threads_ilocked(struct binder_proc *proc, } } -static void binder_wakeup_proc_ilocked(struct binder_proc *proc, bool sync) +/** + * binder_select_thread_ilocked() - selects a thread for doing proc work. + * @proc: process to select a thread from + * + * Note that calling this function moves the thread off the waiting_threads + * list, so it can only be woken up by the caller of this function, or a + * signal. Therefore, callers *should* always wake up the thread this function + * returns. + * + * Return: If there's a thread currently waiting for process work, + * returns that thread. Otherwise returns NULL. + */ +static struct binder_thread * +binder_select_thread_ilocked(struct binder_proc *proc) { struct binder_thread *thread; @@ -981,8 +994,35 @@ static void binder_wakeup_proc_ilocked(struct binder_proc *proc, bool sync) struct binder_thread, waiting_thread_node); - if (thread) { + if (thread) list_del_init(&thread->waiting_thread_node); + + return thread; +} + +/** + * binder_wakeup_thread_ilocked() - wakes up a thread for doing proc work. + * @proc: process to wake up a thread in + * @thread: specific thread to wake-up (may be NULL) + * @sync: whether to do a synchronous wake-up + * + * This function wakes up a thread in the @proc process. + * The caller may provide a specific thread to wake-up in + * the @thread parameter. If @thread is NULL, this function + * will wake up threads that have called poll(). + * + * Note that for this function to work as expected, callers + * should first call binder_select_thread() to find a thread + * to handle the work (if they don't have a thread already), + * and pass the result into the @thread parameter. 
+ */ +static void binder_wakeup_thread_ilocked(struct binder_proc *proc, + struct binder_thread *thread, + bool sync) +{ + BUG_ON(!spin_is_locked(&proc->inner_lock)); + + if (thread) { if (sync) wake_up_interruptible_sync(&thread->wait); else @@ -1006,6 +1046,13 @@ static void binder_wakeup_proc_ilocked(struct binder_proc *proc, bool sync) binder_wakeup_poll_threads_ilocked(proc, sync); } +static void binder_wakeup_proc_ilocked(struct binder_proc *proc) +{ + struct binder_thread *thread = binder_select_thread_ilocked(proc); + + binder_wakeup_thread_ilocked(proc, thread, /* sync = */false); +} + static void binder_set_nice(long nice) { long min_nice; @@ -1225,7 +1272,7 @@ static bool binder_dec_node_nilocked(struct binder_node *node, if (proc && (node->has_strong_ref || node->has_weak_ref)) { if (list_empty(&node->work.entry)) { binder_enqueue_work_ilocked(&node->work, &proc->todo); - binder_wakeup_proc_ilocked(proc, false); + binder_wakeup_proc_ilocked(proc); } } else { if (hlist_empty(&node->refs) && !node->local_strong_refs && @@ -2471,6 +2518,73 @@ static int binder_fixup_parent(struct binder_transaction *t, return 0; } +/** + * binder_proc_transaction() - sends a transaction to a process and wakes it up + * @t: transaction to send + * @proc: process to send the transaction to + * @thread: thread in @proc to send the transaction to (may be NULL) + * + * This function queues a transaction to the specified process. It will try + * to find a thread in the target process to handle the transaction and + * wake it up. If no thread is found, the work is queued to the proc + * waitqueue. + * + * If the @thread parameter is not NULL, the transaction is always queued + * to the waitlist of that specific thread. + * + * Return: true if the transactions was successfully queued + * false if the target process or thread is dead + */ +static bool binder_proc_transaction(struct binder_transaction *t, + struct binder_proc *proc, + struct binder_thread *thread) +{ + struct list_head *target_list = NULL; + struct binder_node *node = t->buffer->target_node; + bool oneway = !!(t->flags & TF_ONE_WAY); + bool wakeup = true; + + BUG_ON(!node); + binder_node_lock(node); + if (oneway) { + BUG_ON(thread); + if (node->has_async_transaction) { + target_list = &node->async_todo; + wakeup = false; + } else { + node->has_async_transaction = 1; + } + } + + binder_inner_proc_lock(proc); + + if (proc->is_dead || (thread && thread->is_dead)) { + binder_inner_proc_unlock(proc); + binder_node_unlock(node); + return false; + } + + if (!thread && !target_list) + thread = binder_select_thread_ilocked(proc); + + if (thread) + target_list = &thread->todo; + else if (!target_list) + target_list = &proc->todo; + else + BUG_ON(target_list != &node->async_todo); + + binder_enqueue_work_ilocked(&t->work, target_list); + + if (wakeup) + binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */); + + binder_inner_proc_unlock(proc); + binder_node_unlock(node); + + return true; +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, @@ -2485,7 +2599,6 @@ static void binder_transaction(struct binder_proc *proc, struct binder_proc *target_proc = NULL; struct binder_thread *target_thread = NULL; struct binder_node *target_node = NULL; - struct list_head *target_list; struct binder_transaction *in_reply_to = NULL; struct binder_transaction_log_entry *e; uint32_t return_error = 0; @@ -2495,7 +2608,6 @@ static void binder_transaction(struct binder_proc 
*proc, binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; int t_debug_id = atomic_inc_return(&binder_last_id); - bool wakeup_for_proc_work = false; e = binder_transaction_log_add(&binder_transaction_log); e->debug_id = t_debug_id; @@ -2656,13 +2768,8 @@ static void binder_transaction(struct binder_proc *proc, } binder_inner_proc_unlock(proc); } - if (target_thread) { + if (target_thread) e->to_thread = target_thread->pid; - target_list = &target_thread->todo; - } else { - target_list = &target_proc->todo; - wakeup_for_proc_work = true; - } e->to_proc = target_proc->pid; /* TODO: reuse incoming transaction for reply */ @@ -2941,8 +3048,9 @@ static void binder_transaction(struct binder_proc *proc, } BUG_ON(t->buffer->async_transaction != 0); binder_pop_transaction_ilocked(target_thread, in_reply_to); - binder_enqueue_work_ilocked(&t->work, target_list); + binder_enqueue_work_ilocked(&t->work, &target_thread->todo); binder_inner_proc_unlock(target_proc); + wake_up_interruptible_sync(&target_thread->wait); binder_free_transaction(in_reply_to); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); @@ -2951,49 +3059,17 @@ static void binder_transaction(struct binder_proc *proc, t->from_parent = thread->transaction_stack; thread->transaction_stack = t; binder_inner_proc_unlock(proc); - binder_inner_proc_lock(target_proc); - if (target_proc->is_dead || - (target_thread && target_thread->is_dead)) { - binder_inner_proc_unlock(target_proc); + if (!binder_proc_transaction(t, target_proc, target_thread)) { binder_inner_proc_lock(proc); binder_pop_transaction_ilocked(thread, t); binder_inner_proc_unlock(proc); goto err_dead_proc_or_thread; } - binder_enqueue_work_ilocked(&t->work, target_list); - binder_inner_proc_unlock(target_proc); } else { BUG_ON(target_node == NULL); BUG_ON(t->buffer->async_transaction != 1); - binder_node_lock(target_node); - if (target_node->has_async_transaction) { - target_list = &target_node->async_todo; - wakeup_for_proc_work = false; - } else - target_node->has_async_transaction = 1; - /* - * Test/set of has_async_transaction - * must be atomic with enqueue on - * async_todo - */ - binder_inner_proc_lock(target_proc); - if (target_proc->is_dead || - (target_thread && target_thread->is_dead)) { - binder_inner_proc_unlock(target_proc); - binder_node_unlock(target_node); + if (!binder_proc_transaction(t, target_proc, NULL)) goto err_dead_proc_or_thread; - } - binder_enqueue_work_ilocked(&t->work, target_list); - binder_inner_proc_unlock(target_proc); - binder_node_unlock(target_node); - } - if (target_thread) { - wake_up_interruptible_sync(&target_thread->wait); - } else if (wakeup_for_proc_work) { - binder_inner_proc_lock(target_proc); - binder_wakeup_proc_ilocked(target_proc, - !(tr->flags & TF_ONE_WAY)); - binder_inner_proc_unlock(target_proc); } if (target_thread) binder_thread_dec_tmpref(target_thread); @@ -3438,8 +3514,7 @@ static int binder_thread_write(struct binder_proc *proc, &ref->death->work, &proc->todo); binder_wakeup_proc_ilocked( - proc, - false); + proc); binder_inner_proc_unlock(proc); } } @@ -3476,8 +3551,7 @@ static int binder_thread_write(struct binder_proc *proc, &death->work, &proc->todo); binder_wakeup_proc_ilocked( - proc, - false); + proc); } } else { BUG_ON(death->work.type != BINDER_WORK_DEAD_BINDER); @@ -3532,8 +3606,7 @@ static int binder_thread_write(struct binder_proc *proc, binder_enqueue_work_ilocked( &death->work, &proc->todo); - binder_wakeup_proc_ilocked( - proc, false); + 
binder_wakeup_proc_ilocked(proc); } } binder_inner_proc_unlock(proc); @@ -4251,7 +4324,7 @@ static int binder_ioctl_write_read(struct file *filp, trace_binder_read_done(ret); binder_inner_proc_lock(proc); if (!binder_worklist_empty_ilocked(&proc->todo)) - binder_wakeup_proc_ilocked(proc, false); + binder_wakeup_proc_ilocked(proc); binder_inner_proc_unlock(proc); if (ret < 0) { if (copy_to_user(ubuf, &bwr, sizeof(bwr))) @@ -4621,7 +4694,7 @@ static int binder_node_release(struct binder_node *node, int refs) ref->death->work.type = BINDER_WORK_DEAD_BINDER; binder_enqueue_work_ilocked(&ref->death->work, &ref->proc->todo); - binder_wakeup_proc_ilocked(ref->proc, false); + binder_wakeup_proc_ilocked(ref->proc); binder_inner_proc_unlock(ref->proc); } From d30e6a877a84916efb17836566aeecf2ce0d6e9f Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Tue, 6 Jun 2017 17:04:42 -0700 Subject: [PATCH 0937/3220] ANDROID: binder: add support for RT prio inheritance. Adds support for SCHED_BATCH/SCHED_FIFO/SCHED_RR priority inheritance. Change-Id: I71f356e476be2933713a0ecfa2cc31aa141e2dc6 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 163 ++++++++++++++++++++++++++++++++------- 1 file changed, 134 insertions(+), 29 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index e2daf6b7cbd0..85a4651361b6 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -465,6 +465,22 @@ enum binder_deferred_state { BINDER_DEFERRED_RELEASE = 0x04, }; +/** + * struct binder_priority - scheduler policy and priority + * @sched_policy scheduler policy + * @prio [100..139] for SCHED_NORMAL, [0..99] for FIFO/RT + * + * The binder driver supports inheriting the following scheduler policies: + * SCHED_NORMAL + * SCHED_BATCH + * SCHED_FIFO + * SCHED_RR + */ +struct binder_priority { + unsigned int sched_policy; + int prio; +}; + /** * struct binder_proc - binder process bookkeeping * @proc_node: element for binder_procs list @@ -544,7 +560,7 @@ struct binder_proc { int requested_threads; int requested_threads_started; int tmp_ref; - long default_priority; + struct binder_priority default_priority; struct dentry *debugfs_entry; struct binder_alloc alloc; struct binder_context *context; @@ -626,8 +642,8 @@ struct binder_transaction { struct binder_buffer *buffer; unsigned int code; unsigned int flags; - long priority; - long saved_priority; + struct binder_priority priority; + struct binder_priority saved_priority; kuid_t sender_euid; /** * @lock: protects @from, @to_proc, and @to_thread @@ -1053,22 +1069,93 @@ static void binder_wakeup_proc_ilocked(struct binder_proc *proc) binder_wakeup_thread_ilocked(proc, thread, /* sync = */false); } -static void binder_set_nice(long nice) +static bool is_rt_policy(int policy) { - long min_nice; + return policy == SCHED_FIFO || policy == SCHED_RR; +} - if (can_nice(current, nice)) { - set_user_nice(current, nice); +static bool is_fair_policy(int policy) +{ + return policy == SCHED_NORMAL || policy == SCHED_BATCH; +} + +static bool binder_supported_policy(int policy) +{ + return is_fair_policy(policy) || is_rt_policy(policy); +} + +static int to_userspace_prio(int policy, int kernel_priority) +{ + if (is_fair_policy(policy)) + return PRIO_TO_NICE(kernel_priority); + else + return MAX_USER_RT_PRIO - 1 - kernel_priority; +} + +static int to_kernel_prio(int policy, int user_priority) +{ + if (is_fair_policy(policy)) + return NICE_TO_PRIO(user_priority); + else + return MAX_USER_RT_PRIO - 1 - user_priority; +} + +static void 
binder_set_priority(struct task_struct *task, + struct binder_priority desired) +{ + int priority; /* user-space prio value */ + bool has_cap_nice; + unsigned int policy = desired.sched_policy; + + if (task->policy == policy && task->normal_prio == desired.prio) return; + + has_cap_nice = has_capability_noaudit(task, CAP_SYS_NICE); + + priority = to_userspace_prio(policy, desired.prio); + + if (is_rt_policy(policy) && !has_cap_nice) { + long max_rtprio = task_rlimit(task, RLIMIT_RTPRIO); + + if (max_rtprio == 0) { + policy = SCHED_NORMAL; + priority = MIN_NICE; + } else if (priority > max_rtprio) { + priority = max_rtprio; + } } - min_nice = rlimit_to_nice(current->signal->rlim[RLIMIT_NICE].rlim_cur); - binder_debug(BINDER_DEBUG_PRIORITY_CAP, - "%d: nice value %ld not allowed use %ld instead\n", - current->pid, nice, min_nice); - set_user_nice(current, min_nice); - if (min_nice <= MAX_NICE) - return; - binder_user_error("%d RLIMIT_NICE not set\n", current->pid); + + if (is_fair_policy(policy) && !has_cap_nice) { + long min_nice = rlimit_to_nice(task_rlimit(task, RLIMIT_NICE)); + + if (min_nice > MAX_NICE) { + binder_user_error("%d RLIMIT_NICE not set\n", + task->pid); + return; + } else if (priority < min_nice) { + priority = min_nice; + } + } + + if (policy != desired.sched_policy || + to_kernel_prio(policy, priority) != desired.prio) + binder_debug(BINDER_DEBUG_PRIORITY_CAP, + "%d: priority %d not allowed, using %d instead\n", + task->pid, desired.prio, + to_kernel_prio(policy, priority)); + + /* Set the actual priority */ + if (task->policy != policy || is_rt_policy(policy)) { + struct sched_param params; + + params.sched_priority = is_rt_policy(policy) ? priority : 0; + + sched_setscheduler_nocheck(task, + policy | SCHED_RESET_ON_FORK, + ¶ms); + } + if (is_fair_policy(policy)) + set_user_nice(task, priority); } static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc, @@ -1152,7 +1239,8 @@ static struct binder_node *binder_init_node_ilocked( node->ptr = ptr; node->cookie = cookie; node->work.type = BINDER_WORK_NODE; - node->min_priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK; + node->min_priority = NICE_TO_PRIO( + flags & FLAT_BINDER_FLAG_PRIORITY_MASK); node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); spin_lock_init(&node->lock); INIT_LIST_HEAD(&node->work.entry); @@ -2649,7 +2737,7 @@ static void binder_transaction(struct binder_proc *proc, } thread->transaction_stack = in_reply_to->to_parent; binder_inner_proc_unlock(proc); - binder_set_nice(in_reply_to->saved_priority); + binder_set_priority(current, in_reply_to->saved_priority); target_thread = binder_get_txn_from_and_acq_inner(in_reply_to); if (target_thread == NULL) { return_error = BR_DEAD_REPLY; @@ -2822,7 +2910,15 @@ static void binder_transaction(struct binder_proc *proc, t->to_thread = target_thread; t->code = tr->code; t->flags = tr->flags; - t->priority = task_nice(current); + if (!(t->flags & TF_ONE_WAY) && + binder_supported_policy(current->policy)) { + /* Inherit supported policies for synchronous transactions */ + t->priority.sched_policy = current->policy; + t->priority.prio = current->normal_prio; + } else { + /* Otherwise, fall back to the default priority */ + t->priority = target_proc->default_priority; + } trace_binder_transaction(reply, t, target_node); @@ -3738,7 +3834,7 @@ retry: wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); } - binder_set_nice(proc->default_priority); + binder_set_priority(current, proc->default_priority); } if (non_block) { 
@@ -3950,16 +4046,17 @@ retry: BUG_ON(t->buffer == NULL); if (t->buffer->target_node) { struct binder_node *target_node = t->buffer->target_node; + struct binder_priority prio = t->priority; tr.target.ptr = target_node->ptr; tr.cookie = target_node->cookie; - t->saved_priority = task_nice(current); - if (t->priority < target_node->min_priority && - !(t->flags & TF_ONE_WAY)) - binder_set_nice(t->priority); - else if (!(t->flags & TF_ONE_WAY) || - t->saved_priority > target_node->min_priority) - binder_set_nice(target_node->min_priority); + t->saved_priority.sched_policy = current->policy; + t->saved_priority.prio = current->normal_prio; + if (target_node->min_priority < t->priority.prio) { + prio.sched_policy = SCHED_NORMAL; + prio.prio = target_node->min_priority; + } + binder_set_priority(current, prio); cmd = BR_TRANSACTION; } else { tr.target.ptr = 0; @@ -4565,7 +4662,14 @@ static int binder_open(struct inode *nodp, struct file *filp) get_task_struct(current->group_leader); proc->tsk = current->group_leader; INIT_LIST_HEAD(&proc->todo); - proc->default_priority = task_nice(current); + if (binder_supported_policy(current->policy)) { + proc->default_priority.sched_policy = current->policy; + proc->default_priority.prio = current->normal_prio; + } else { + proc->default_priority.sched_policy = SCHED_NORMAL; + proc->default_priority.prio = NICE_TO_PRIO(0); + } + binder_dev = container_of(filp->private_data, struct binder_device, miscdev); proc->context = &binder_dev->context; @@ -4858,13 +4962,14 @@ static void print_binder_transaction_ilocked(struct seq_file *m, spin_lock(&t->lock); to_proc = t->to_proc; seq_printf(m, - "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %ld r%d", + "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %d:%d r%d", prefix, t->debug_id, t, t->from ? t->from->proc->pid : 0, t->from ? t->from->pid : 0, to_proc ? to_proc->pid : 0, t->to_thread ? t->to_thread->pid : 0, - t->code, t->flags, t->priority, t->need_reply); + t->code, t->flags, t->priority.sched_policy, + t->priority.prio, t->need_reply); spin_unlock(&t->lock); if (proc != to_proc) { From adb685439e768b120a5bdae15f333967d1e239ed Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Wed, 7 Jun 2017 09:29:14 -0700 Subject: [PATCH 0938/3220] ANDROID: binder: add min sched_policy to node. This change adds flags to flat_binder_object.flags to allow indicating a minimum scheduling policy for the node. It also clarifies the valid value range for the priority bits in the flags. Internally, we use the priority map that the kernel uses, e.g. [0..99] for real-time policies and [100..139] for the SCHED_NORMAL/SCHED_BATCH policies. 
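As a minimal userspace sketch of that priority map (mirroring the to_userspace_prio()/to_kernel_prio() helpers introduced earlier in this series; the MAX_USER_RT_PRIO and NICE_TO_PRIO definitions below are assumptions matching the kernel's scheduler headers, not part of this patch):

	#include <stdio.h>

	#define SCHED_NORMAL		0
	#define SCHED_FIFO		1
	#define MAX_USER_RT_PRIO	100
	#define MAX_RT_PRIO		MAX_USER_RT_PRIO
	/* nice [-20..19] maps to kernel prio [100..139] */
	#define NICE_TO_PRIO(nice)	((nice) + MAX_RT_PRIO + 20)

	static int to_kernel_prio(int policy, int user_priority)
	{
		if (policy == SCHED_NORMAL)
			return NICE_TO_PRIO(user_priority);
		/* RT: user rtprio [1..99] maps to kernel prio [98..0] */
		return MAX_USER_RT_PRIO - 1 - user_priority;
	}

	int main(void)
	{
		/* prints 130: nice 10 under SCHED_NORMAL */
		printf("%d\n", to_kernel_prio(SCHED_NORMAL, 10));
		/* prints 49: rtprio 50 under SCHED_FIFO */
		printf("%d\n", to_kernel_prio(SCHED_FIFO, 50));
		return 0;
	}

Note that for both policy classes a numerically lower kernel value means a higher priority, which is why the comparisons in binder_set_priority() use '<' on the kernel-side values.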
Bug: 34461621 Bug: 37293077 Change-Id: I12438deecb53df432da18c6fc77460768ae726d2 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 26 ++++++++++++++---- include/uapi/linux/android/binder.h | 41 ++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 6 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 85a4651361b6..838f4dbc9af5 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -353,6 +353,8 @@ struct binder_error { * and by @lock) * @has_async_transaction: async transaction to node in progress * (protected by @lock) + * @sched_policy: minimum scheduling policy for node + * (invariant after initialized) * @accept_fds: file descriptor operations supported for node * (invariant after initialized) * @min_priority: minimum scheduling priority @@ -392,6 +394,7 @@ struct binder_node { /* * invariant after initialization */ + u8 sched_policy:2; u8 accept_fds:1; u8 min_priority; }; @@ -1208,6 +1211,7 @@ static struct binder_node *binder_init_node_ilocked( binder_uintptr_t ptr = fp ? fp->binder : 0; binder_uintptr_t cookie = fp ? fp->cookie : 0; __u32 flags = fp ? fp->flags : 0; + s8 priority; BUG_ON(!spin_is_locked(&proc->inner_lock)); while (*p) { @@ -1239,8 +1243,10 @@ static struct binder_node *binder_init_node_ilocked( node->ptr = ptr; node->cookie = cookie; node->work.type = BINDER_WORK_NODE; - node->min_priority = NICE_TO_PRIO( - flags & FLAT_BINDER_FLAG_PRIORITY_MASK); + priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK; + node->sched_policy = (flags & FLAT_BINDER_FLAG_PRIORITY_MASK) >> + FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT; + node->min_priority = to_kernel_prio(node->sched_policy, priority); node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); spin_lock_init(&node->lock); INIT_LIST_HEAD(&node->work.entry); @@ -4052,8 +4058,17 @@ retry: tr.cookie = target_node->cookie; t->saved_priority.sched_policy = current->policy; t->saved_priority.prio = current->normal_prio; - if (target_node->min_priority < t->priority.prio) { - prio.sched_policy = SCHED_NORMAL; + if (target_node->min_priority < t->priority.prio || + (target_node->min_priority == t->priority.prio && + target_node->sched_policy == SCHED_FIFO)) { + /* + * In case the minimum priority on the node is + * higher (lower value), use that priority. If + * the priority is the same, but the node uses + * SCHED_FIFO, prefer SCHED_FIFO, since it can + * run unbounded, unlike SCHED_RR. 
+ */ + prio.sched_policy = target_node->sched_policy; prio.prio = target_node->min_priority; } binder_set_priority(current, prio); @@ -5092,8 +5107,9 @@ static void print_binder_node_nilocked(struct seq_file *m, hlist_for_each_entry(ref, &node->refs, node_entry) count++; - seq_printf(m, " node %d: u%016llx c%016llx hs %d hw %d ls %d lw %d is %d iw %d tr %d", + seq_printf(m, " node %d: u%016llx c%016llx pri %d:%d hs %d hw %d ls %d lw %d is %d iw %d tr %d", node->debug_id, (u64)node->ptr, (u64)node->cookie, + node->sched_policy, node->min_priority, node->has_strong_ref, node->has_weak_ref, node->local_strong_refs, node->local_weak_refs, node->internal_strong_refs, count, node->tmp_refs); diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 7668b5791c91..026558ac254d 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -37,9 +37,48 @@ enum { BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE), }; -enum { +/** + * enum flat_binder_object_shifts: shift values for flat_binder_object_flags + * @FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT: shift for getting scheduler policy. + * + */ +enum flat_binder_object_shifts { + FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT = 9, +}; + +/** + * enum flat_binder_object_flags - flags for use in flat_binder_object.flags + */ +enum flat_binder_object_flags { + /** + * @FLAT_BINDER_FLAG_PRIORITY_MASK: bit-mask for min scheduler priority + * + * These bits can be used to set the minimum scheduler priority + * at which transactions into this node should run. Valid values + * in these bits depend on the scheduler policy encoded in + * @FLAT_BINDER_FLAG_SCHED_POLICY_MASK. + * + * For SCHED_NORMAL/SCHED_BATCH, the valid range is between [-20..19] + * For SCHED_FIFO/SCHED_RR, the value can run between [1..99] + */ FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff, + /** + * @FLAT_BINDER_FLAG_ACCEPTS_FDS: whether the node accepts fds. + */ FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100, + /** + * @FLAT_BINDER_FLAG_SCHED_POLICY_MASK: bit-mask for scheduling policy + * + * These two bits can be used to set the min scheduling policy at which + * transactions on this node should run. These match the UAPI + * scheduler policy values, eg: + * 00b: SCHED_NORMAL + * 01b: SCHED_FIFO + * 10b: SCHED_RR + * 11b: SCHED_BATCH + */ + FLAT_BINDER_FLAG_SCHED_POLICY_MASK = + 3U << FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT, }; #ifdef BINDER_IPC_32BIT From 7230f99185e5200b959ea3c4387bbea24e30bd9b Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Wed, 7 Jun 2017 10:02:12 -0700 Subject: [PATCH 0939/3220] ANDROID: binder: improve priority inheritance. This is done by raising the priority of a thread selected for a transaction *before* we wake it up. Delay restoring the priority when doing a reply until after we wake up the process receiving the reply. Change-Id: Ic332e4e0ed7d2d3ca6ab1034da4629c9eadd3405 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 74 ++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 21 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 838f4dbc9af5..d046e7798b19 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -611,6 +611,7 @@ enum { * @is_dead: thread is dead and awaiting free * when outstanding transactions are cleaned up * (protected by @proc->inner_lock) + * @task: struct task_struct for this thread * * Bookkeeping structure for binder threads.
*/ @@ -629,6 +630,7 @@ struct binder_thread { struct binder_stats stats; atomic_t tmp_ref; bool is_dead; + struct task_struct *task; }; struct binder_transaction { @@ -647,6 +649,7 @@ struct binder_transaction { unsigned int flags; struct binder_priority priority; struct binder_priority saved_priority; + bool set_priority_called; kuid_t sender_euid; /** * @lock: protects @from, @to_proc, and @to_thread @@ -1161,6 +1164,38 @@ static void binder_set_priority(struct task_struct *task, set_user_nice(task, priority); } +static void binder_transaction_priority(struct task_struct *task, + struct binder_transaction *t, + struct binder_priority node_prio) +{ + struct binder_priority desired_prio; + + if (t->set_priority_called) + return; + + t->set_priority_called = true; + t->saved_priority.sched_policy = task->policy; + t->saved_priority.prio = task->normal_prio; + + desired_prio.prio = t->priority.prio; + desired_prio.sched_policy = t->priority.sched_policy; + + if (node_prio.prio < t->priority.prio || + (node_prio.prio == t->priority.prio && + node_prio.sched_policy == SCHED_FIFO)) { + /* + * In case the minimum priority on the node is + * higher (lower value), use that priority. If + * the priority is the same, but the node uses + * SCHED_FIFO, prefer SCHED_FIFO, since it can + * run unbounded, unlike SCHED_RR. + */ + desired_prio = node_prio; + } + + binder_set_priority(task, desired_prio); +} + static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc, binder_uintptr_t ptr) { @@ -2635,11 +2670,15 @@ static bool binder_proc_transaction(struct binder_transaction *t, { struct list_head *target_list = NULL; struct binder_node *node = t->buffer->target_node; + struct binder_priority node_prio; bool oneway = !!(t->flags & TF_ONE_WAY); bool wakeup = true; BUG_ON(!node); binder_node_lock(node); + node_prio.prio = node->min_priority; + node_prio.sched_policy = node->sched_policy; + if (oneway) { BUG_ON(thread); if (node->has_async_transaction) { @@ -2661,12 +2700,14 @@ static bool binder_proc_transaction(struct binder_transaction *t, if (!thread && !target_list) thread = binder_select_thread_ilocked(proc); - if (thread) + if (thread) { target_list = &thread->todo; - else if (!target_list) + binder_transaction_priority(thread->task, t, node_prio); + } else if (!target_list) { target_list = &proc->todo; - else + } else { BUG_ON(target_list != &node->async_todo); + } binder_enqueue_work_ilocked(&t->work, target_list); @@ -2743,7 +2784,6 @@ static void binder_transaction(struct binder_proc *proc, } thread->transaction_stack = in_reply_to->to_parent; binder_inner_proc_unlock(proc); - binder_set_priority(current, in_reply_to->saved_priority); target_thread = binder_get_txn_from_and_acq_inner(in_reply_to); if (target_thread == NULL) { return_error = BR_DEAD_REPLY; @@ -3153,6 +3193,7 @@ static void binder_transaction(struct binder_proc *proc, binder_enqueue_work_ilocked(&t->work, &target_thread->todo); binder_inner_proc_unlock(target_proc); wake_up_interruptible_sync(&target_thread->wait); + binder_set_priority(current, in_reply_to->saved_priority); binder_free_transaction(in_reply_to); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); @@ -3241,6 +3282,7 @@ err_no_context_mgr_node: BUG_ON(thread->return_error.cmd != BR_OK); if (in_reply_to) { + binder_set_priority(current, in_reply_to->saved_priority); thread->return_error.cmd = BR_TRANSACTION_COMPLETE; binder_enqueue_work(thread->proc, &thread->return_error.work, @@ -4052,26 +4094,13 @@ retry: 
BUG_ON(t->buffer == NULL); if (t->buffer->target_node) { struct binder_node *target_node = t->buffer->target_node; - struct binder_priority prio = t->priority; + struct binder_priority node_prio; tr.target.ptr = target_node->ptr; tr.cookie = target_node->cookie; - t->saved_priority.sched_policy = current->policy; - t->saved_priority.prio = current->normal_prio; - if (target_node->min_priority < t->priority.prio || - (target_node->min_priority == t->priority.prio && - target_node->sched_policy == SCHED_FIFO)) { - /* - * In case the minimum priority on the node is - * higher (lower value), use that priority. If - * the priority is the same, but the node uses - * SCHED_FIFO, prefer SCHED_FIFO, since it can - * run unbounded, unlike SCHED_RR. - */ - prio.sched_policy = target_node->sched_policy; - prio.prio = target_node->min_priority; - } - binder_set_priority(current, prio); + node_prio.sched_policy = target_node->sched_policy; + node_prio.prio = target_node->min_priority; + binder_transaction_priority(current, t, node_prio); cmd = BR_TRANSACTION; } else { tr.target.ptr = 0; @@ -4247,6 +4276,8 @@ static struct binder_thread *binder_get_thread_ilocked( binder_stats_created(BINDER_STAT_THREAD); thread->proc = proc; thread->pid = current->pid; + get_task_struct(current); + thread->task = current; atomic_set(&thread->tmp_ref, 0); init_waitqueue_head(&thread->wait); INIT_LIST_HEAD(&thread->todo); @@ -4297,6 +4328,7 @@ static void binder_free_thread(struct binder_thread *thread) BUG_ON(!list_empty(&thread->todo)); binder_stats_deleted(BINDER_STAT_THREAD); binder_proc_dec_tmpref(thread->proc); + put_task_struct(thread->task); kfree(thread); } From 39140a0f385c456a43c2dd25bc01503cc3841b97 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 23 Jun 2017 10:13:43 -0700 Subject: [PATCH 0940/3220] ANDROID: binder: add RT inheritance flag to node. Allows a binder node to specify whether it wants to inherit real-time scheduling policy from a caller. 
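As an illustrative sketch (not part of this patch), userspace could combine these UAPI bits as follows. The FLAT_BINDER_FLAG_* constants are the ones introduced by this series; make_node_flags() and the choice of SCHED_FIFO priority 50 are made-up examples:

#include <sched.h>                  /* SCHED_FIFO == 1, matching the 01b policy bits */
#include <linux/android/binder.h>   /* FLAT_BINDER_FLAG_* UAPI constants */

/* Hypothetical helper: encode a node's scheduling preferences. */
static __u32 make_node_flags(void)
{
	__u32 flags = 0;

	/* minimum priority in the low 8 bits; 1..99 is the SCHED_FIFO/SCHED_RR range */
	flags |= 50 & FLAT_BINDER_FLAG_PRIORITY_MASK;
	/* scheduling policy in bits 9-10 */
	flags |= (SCHED_FIFO << FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT) &
		 FLAT_BINDER_FLAG_SCHED_POLICY_MASK;
	/* let synchronous callers pass their RT policy into this node */
	flags |= FLAT_BINDER_FLAG_INHERIT_RT;

	return flags;
}

The resulting value would be stored in flat_binder_object.flags when the node is first sent across the binder interface.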
Change-Id: I375b6094bf441c19f19cba06d5a6be02cd07d714 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 22 +++++++++++++++++----- include/uapi/linux/android/binder.h | 8 ++++++++ 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index d046e7798b19..6b24f70dbff8 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -359,6 +359,8 @@ struct binder_error { * (invariant after initialized) * @min_priority: minimum scheduling priority * (invariant after initialized) + * @inherit_rt: inherit RT scheduling policy from caller + * (invariant after initialized) * @async_todo: list of async work items * (protected by @proc->inner_lock) * @@ -395,6 +397,7 @@ struct binder_node { * invariant after initialization */ u8 sched_policy:2; + u8 inherit_rt:1; u8 accept_fds:1; u8 min_priority; }; @@ -1166,7 +1169,8 @@ static void binder_set_priority(struct task_struct *task, static void binder_transaction_priority(struct task_struct *task, struct binder_transaction *t, - struct binder_priority node_prio) + struct binder_priority node_prio, + bool inherit_rt) { struct binder_priority desired_prio; @@ -1177,8 +1181,13 @@ static void binder_transaction_priority(struct task_struct *task, t->saved_priority.sched_policy = task->policy; t->saved_priority.prio = task->normal_prio; - desired_prio.prio = t->priority.prio; - desired_prio.sched_policy = t->priority.sched_policy; + if (!inherit_rt && is_rt_policy(desired_prio.sched_policy)) { + desired_prio.prio = NICE_TO_PRIO(0); + desired_prio.sched_policy = SCHED_NORMAL; + } else { + desired_prio.prio = t->priority.prio; + desired_prio.sched_policy = t->priority.sched_policy; + } if (node_prio.prio < t->priority.prio || (node_prio.prio == t->priority.prio && @@ -1283,6 +1292,7 @@ static struct binder_node *binder_init_node_ilocked( FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT; node->min_priority = to_kernel_prio(node->sched_policy, priority); node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); + node->inherit_rt = !!(flags & FLAT_BINDER_FLAG_INHERIT_RT); spin_lock_init(&node->lock); INIT_LIST_HEAD(&node->work.entry); INIT_LIST_HEAD(&node->async_todo); @@ -2702,7 +2712,8 @@ static bool binder_proc_transaction(struct binder_transaction *t, if (thread) { target_list = &thread->todo; - binder_transaction_priority(thread->task, t, node_prio); + binder_transaction_priority(thread->task, t, node_prio, + node->inherit_rt); } else if (!target_list) { target_list = &proc->todo; } else { @@ -4100,7 +4111,8 @@ retry: tr.cookie = target_node->cookie; node_prio.sched_policy = target_node->sched_policy; node_prio.prio = target_node->min_priority; - binder_transaction_priority(current, t, node_prio); + binder_transaction_priority(current, t, node_prio, + target_node->inherit_rt); cmd = BR_TRANSACTION; } else { tr.target.ptr = 0; diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 026558ac254d..70e252bf0be0 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -79,6 +79,14 @@ enum flat_binder_object_flags { */ FLAT_BINDER_FLAG_SCHED_POLICY_MASK = 3U << FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT, + + /** + * @FLAT_BINDER_FLAG_INHERIT_RT: whether the node inherits RT policy + * + * Only when set, calls into this node will inherit a real-time + * scheduling policy from the caller (for synchronous transactions). 
+ */ + FLAT_BINDER_FLAG_INHERIT_RT = 0x800, }; #ifdef BINDER_IPC_32BIT From bab9c2fbe48e9ebe6af7fa19ade1bf558dcc2eac Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 25 May 2017 15:04:04 +0100 Subject: [PATCH 0941/3220] UPSTREAM: cpufreq: schedutil: Avoid indented labels Switch to the more common practice of writing labels. Suggested-by: Peter Zijlstra Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki (cherry picked from commit 8e2ddb03643eb9d0bc4926946d7ce0d308eef0a5) Signed-off-by: Chris Redpath Change-Id: Ida75c99cf3dff5cae24d3866454c83bcdb3385b9 --- kernel/sched/cpufreq_schedutil.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 75bfbb336722..ff9da5144d8a 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -623,13 +623,13 @@ static int sugov_init(struct cpufreq_policy *policy) if (ret) goto fail; - out: +out: mutex_unlock(&global_tunables_lock); cpufreq_enable_fast_switch(policy); return 0; - fail: +fail: policy->governor_data = NULL; sugov_tunables_free(tunables); From ceed1eb2b409d74ca694cbf4cdee536476a280c8 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 25 May 2017 15:06:27 +0100 Subject: [PATCH 0942/3220] UPSTREAM: cpufreq: schedutil: enable fast switch earlier The fast_switch_enabled flag will be used by both sugov_policy_alloc() and sugov_policy_free() with a later patch. Prepare for that by moving the calls to enable and disable it to the beginning of sugov_init() and end of sugov_exit(). Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki (cherry picked from commit 4a71ce4348bb61740d411822357061f8bf870f4c) Signed-off-by: Chris Redpath Change-Id: Ia174f423ca02d59360657ac2e77a5098ce5cf99c --- kernel/sched/cpufreq_schedutil.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index ff9da5144d8a..de24e1983614 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -578,9 +578,13 @@ static int sugov_init(struct cpufreq_policy *policy) if (policy->governor_data) return -EBUSY; + cpufreq_enable_fast_switch(policy); + sg_policy = sugov_policy_alloc(policy); - if (!sg_policy) - return -ENOMEM; + if (!sg_policy) { + ret = -ENOMEM; + goto disable_fast_switch; + } ret = sugov_kthread_create(sg_policy); if (ret) @@ -625,8 +629,6 @@ static int sugov_init(struct cpufreq_policy *policy) out: mutex_unlock(&global_tunables_lock); - - cpufreq_enable_fast_switch(policy); return 0; fail: @@ -640,6 +642,10 @@ free_sg_policy: mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); + +disable_fast_switch: + cpufreq_disable_fast_switch(policy); + pr_err("initialization failed (error %d)\n", ret); return ret; } @@ -650,8 +656,6 @@ static int sugov_exit(struct cpufreq_policy *policy) struct sugov_tunables *tunables = sg_policy->tunables; unsigned int count; - cpufreq_disable_fast_switch(policy); - mutex_lock(&global_tunables_lock); count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); @@ -664,6 +668,7 @@ static int sugov_exit(struct cpufreq_policy *policy) sugov_kthread_stop(sg_policy); sugov_policy_free(sg_policy); + cpufreq_disable_fast_switch(policy); return 0; } From d9e7d036e7897f0c523fafe596316248783f61f7 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 25 May 2017 15:10:26 +0100 Subject: [PATCH 0943/3220] UPSTREAM: cpufreq: schedutil: irq-work and mutex are only 
used in slow path Execute the irq-work specific initialization/exit code only when the fast path isn't available. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki (cherry picked from commit 21ef57297b15a49b0c4dd4e7135c1a08e9a29a1c) Signed-off-by: Chris Redpath Change-Id: Icfd68f455ef71846d799fcd2d8ec6aa1bf59573e --- kernel/sched/cpufreq_schedutil.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index de24e1983614..8de497e64a13 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -489,15 +489,12 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) return NULL; sg_policy->policy = policy; - init_irq_work(&sg_policy->irq_work, sugov_irq_work); - mutex_init(&sg_policy->work_lock); raw_spin_lock_init(&sg_policy->update_lock); return sg_policy; } static void sugov_policy_free(struct sugov_policy *sg_policy) { - mutex_destroy(&sg_policy->work_lock); kfree(sg_policy); } @@ -531,6 +528,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) sg_policy->thread = thread; kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); + mutex_init(&sg_policy->work_lock); + wake_up_process(thread); return 0; @@ -544,6 +544,7 @@ static void sugov_kthread_stop(struct sugov_policy *sg_policy) flush_kthread_worker(&sg_policy->worker); kthread_stop(sg_policy->thread); + mutex_destroy(&sg_policy->work_lock); } static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) @@ -719,9 +720,10 @@ static int sugov_stop(struct cpufreq_policy *policy) synchronize_sched(); - irq_work_sync(&sg_policy->irq_work); - kthread_cancel_work_sync(&sg_policy->work); - + if (!policy->fast_switch_enabled) { + irq_work_sync(&sg_policy->irq_work); + kthread_cancel_work_sync(&sg_policy->work); + } return 0; } From 69fc75780d1521448ba5288545a5d522a43ad062 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 24 Nov 2016 13:51:11 +0530 Subject: [PATCH 0944/3220] UPSTREAM: cpufreq: schedutil: Rectify comment in sugov_irq_work() function This patch rectifies a comment present in sugov_irq_work() function to follow proper grammar. Suggested-by: Ingo Molnar Signed-off-by: Viresh Kumar Acked-by: Ingo Molnar Signed-off-by: Rafael J. Wysocki (cherry picked from commit d06e622d3d9206e6a2cc45a0f9a3256da8773ff4) Signed-off-by: Chris Redpath Change-Id: Iaf996445d411725639d511432cc424086892a146 --- kernel/sched/cpufreq_schedutil.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 8de497e64a13..b76286a8e118 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -371,15 +371,15 @@ static void sugov_irq_work(struct irq_work *irq_work) sg_policy = container_of(irq_work, struct sugov_policy, irq_work); /* - * For Real Time and Deadline tasks, schedutil governor shoots the - * frequency to maximum. And special care must be taken to ensure that - * this kthread doesn't result in that. + * For RT and deadline tasks, the schedutil governor shoots the + * frequency to maximum. Special care must be taken to ensure that this + * kthread doesn't result in the same behavior. * * This is (mostly) guaranteed by the work_in_progress flag. The flag is - * updated only at the end of the sugov_work() and before that schedutil - * rejects all other frequency scaling requests. 
+ * updated only at the end of the sugov_work() function and before that + * the schedutil governor rejects all other frequency scaling requests. * - * Though there is a very rare case where the RT thread yields right + * There is a very rare case though, where the RT thread yields right * after the work_in_progress flag is cleared. The effects of that are * neglected for now. */ From 0646dd35928e446d8d2f8075a349b3abb521fa06 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 25 May 2017 15:12:38 +0100 Subject: [PATCH 0945/3220] UPSTREAM: cpufreq: schedutil: move cached_raw_freq to struct sugov_policy cached_raw_freq applies to the entire cpufreq policy and not individual CPUs. Apart from wasting per-cpu memory, it is actually wrong to keep it in struct sugov_cpu as we may end up comparing next_freq with a stale cached_raw_freq of a random CPU. Move cached_raw_freq to struct sugov_policy. Fixes: 5cbea46984d6 (cpufreq: schedutil: map raw required frequency to driver frequency) Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki (cherry-picked from 6c4f0fa643cb9e775dcc976e3db00d649468ff1d) Signed-off-by: Chris Redpath Change-Id: Ie91420f710819b383947f9031da9be1f3bb7f636 --- kernel/sched/cpufreq_schedutil.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index b76286a8e118..a8ca00a02ded 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -47,6 +47,7 @@ struct sugov_policy { s64 up_rate_delay_ns; s64 down_rate_delay_ns; unsigned int next_freq; + unsigned int cached_raw_freq; /* The next fields are only needed if fast switch cannot be used. */ struct irq_work irq_work; @@ -63,7 +64,6 @@ struct sugov_cpu { struct update_util_data update_util; struct sugov_policy *sg_policy; - unsigned int cached_raw_freq; unsigned long iowait_boost; unsigned long iowait_boost_max; u64 last_update; @@ -180,9 +180,9 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, freq = (freq + (freq >> 2)) * util / max; - if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX) return sg_policy->next_freq; - sg_cpu->cached_raw_freq = freq; + sg_policy->cached_raw_freq = freq; return cpufreq_driver_resolve_freq(policy, freq); } @@ -687,6 +687,7 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->next_freq = UINT_MAX; sg_policy->work_in_progress = false; sg_policy->need_freq_update = false; + sg_policy->cached_raw_freq = 0; for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); @@ -697,7 +698,6 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->max = 0; sg_cpu->flags = SCHED_CPUFREQ_DL; sg_cpu->last_update = 0; - sg_cpu->cached_raw_freq = 0; sg_cpu->iowait_boost = 0; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, From cbaccedead5cc87296194c6a668ea38de8d7c17c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 2 Mar 2017 14:03:21 +0530 Subject: [PATCH 0946/3220] UPSTREAM: cpufreq: schedutil: Pass sg_policy to get_next_freq() get_next_freq() uses sg_cpu only to get sg_policy, which the callers of get_next_freq() already have. Pass sg_policy instead of sg_cpu to get_next_freq(), to make it more efficient. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki (cherry picked from commit 655cb1ebff4b7918fc560502c3297af2d3c7d114) Signed-off-by: Chris Redpath Change-Id: Ia210058da32930a6cdb18258aa679cd1a44a747e --- kernel/sched/cpufreq_schedutil.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index a8ca00a02ded..9bf525a960f3 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -150,7 +150,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, /** * get_next_freq - Compute a new frequency for a given cpufreq policy. - * @sg_cpu: schedutil cpu object to compute the new frequency for. + * @sg_policy: schedutil policy object to compute the new frequency for. * @util: Current CPU utilization. * @max: CPU capacity. * @@ -170,10 +170,9 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, * next_freq (as calculated above) is returned, subject to policy min/max and * cpufreq driver limitations. */ -static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, - unsigned long max) +static unsigned int get_next_freq(struct sugov_policy *sg_policy, + unsigned long util, unsigned long max) { - struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; unsigned int freq = arch_scale_freq_invariant() ? policy->cpuinfo.max_freq : policy->cur; @@ -268,7 +267,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, } else { sugov_get_util(&util, &max, time); sugov_iowait_boost(sg_cpu, &util, &max); - next_f = get_next_freq(sg_cpu, util, max); + next_f = get_next_freq(sg_policy, util, max); } sugov_update_commit(sg_policy, time, next_f); } @@ -322,7 +321,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, sugov_iowait_boost(j_sg_cpu, &util, &max); } - return get_next_freq(sg_cpu, util, max); + return get_next_freq(sg_policy, util, max); } static void sugov_update_shared(struct update_util_data *hook, u64 time, From 7378c38a80fc434f8d6cf61167d843ca388468b5 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 25 May 2017 15:15:33 +0100 Subject: [PATCH 0947/3220] UPSTREAM: cpufreq: schedutil: Fix per-CPU structure initialization in sugov_start() sugov_start() only initializes struct sugov_cpu per-CPU structures for shared policies, but it should do that for single-CPU policies too. That in particular makes the IO-wait boost mechanism work in the cases when cpufreq policies correspond to individual CPUs. Fixes: 21ca6d2c52f8 (cpufreq: schedutil: Add iowait boosting) Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar Cc: 4.9+ # 4.9+ (cherry picked from commit 4296f23ed49a15d36949458adcc66ff993dee2a8) (we use SCHED_CPUFREQ_DL instead of SCHED_CPUFREQ_RT in cpu->flags) Signed-off-by: Chris Redpath Change-Id: I5b837a0ee4432115d85caa1a9808ea61e1e1b07f --- kernel/sched/cpufreq_schedutil.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 9bf525a960f3..3a9283640f84 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -691,20 +691,14 @@ static int sugov_start(struct cpufreq_policy *policy) for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->sg_policy = sg_policy; - if (policy_is_shared(policy)) { - sg_cpu->util = 0; - sg_cpu->max = 0; - sg_cpu->flags = SCHED_CPUFREQ_DL; - sg_cpu->last_update = 0; - sg_cpu->iowait_boost = 0; - sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - sugov_update_shared); - } else { - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - sugov_update_single); - } + sg_cpu->flags = SCHED_CPUFREQ_DL; + sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + policy_is_shared(policy) ? + sugov_update_shared : + sugov_update_single); } return 0; } From a8a200d83b1ff334a6c607b8fda848551648689b Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 25 May 2017 15:22:59 +0100 Subject: [PATCH 0948/3220] UPSTREAM: cpufreq: schedutil: Refactor sugov_next_freq_shared() The loop in sugov_next_freq_shared() contains an if block to skip the loop for the current CPU. This turns out to be an unnecessary conditional in the scheduler's hot-path for every CPU in the policy. It would be better to drop the conditional and make the loop treat all the CPUs in the same way. That would eliminate the need of calling sugov_iowait_boost() at the top of the routine. To keep the code optimized to return early if the current CPU has RT/DL flags set, move the flags check to sugov_update_shared() instead in order to avoid the function call entirely. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki (cherry picked from commit cba1dfb57b94c234728b689d9b00d4267fa1a879) (modified for SCHED_CPUFREQ_DL vs SCHED_CPUFREQ_RT) Signed-off-by: Chris Redpath Change-Id: Ie046fdc8eda46821356750edd0fb6f7d077af363 --- kernel/sched/cpufreq_schedutil.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3a9283640f84..e1a2ed98991f 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -272,30 +272,19 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, - unsigned long util, unsigned long max, - unsigned int flags) +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - unsigned int max_f = policy->cpuinfo.max_freq; u64 last_freq_update_time = sg_policy->last_freq_update_time; + unsigned long util = 0, max = 1; unsigned int j; - if (flags & SCHED_CPUFREQ_DL) - return max_f; - - sugov_iowait_boost(sg_cpu, &util, &max); - for_each_cpu(j, policy->cpus) { - struct sugov_cpu *j_sg_cpu; + struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long j_util, j_max; s64 delta_ns; - if (j == smp_processor_id()) - continue; - - j_sg_cpu = &per_cpu(sugov_cpu, j); /* * If the CPU utilization was last updated before the previous * frequency update and the time elapsed between the last update @@ -309,7 +298,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, continue; } if (j_sg_cpu->flags & SCHED_CPUFREQ_DL) - return max_f; + return policy->cpuinfo.max_freq; j_util = j_sg_cpu->util; j_max = j_sg_cpu->max; @@ -344,7 +333,11 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); + if (flags & SCHED_CPUFREQ_DL) + next_f = sg_policy->policy->cpuinfo.max_freq; + else + next_f = sugov_next_freq_shared(sg_cpu); + sugov_update_commit(sg_policy, time, next_f); } From 537d19226a8c4d36cc376f6b177576dff989a97d Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 25 May 2017 15:24:58 +0100 Subject: [PATCH 0949/3220] UPSTREAM: cpufreq: schedutil: Avoid reducing frequency of busy CPUs prematurely The way the schedutil governor uses the PELT metric causes it to underestimate the CPU utilization in some cases. That can be easily demonstrated by running kernel compilation on a Sandy Bridge Intel processor, running turbostat in parallel with it and looking at the values written to the MSR_IA32_PERF_CTL register. Namely, the expected result would be that when all CPUs were 100% busy, all of them would be requested to run in the maximum P-state, but observation shows that this clearly isn't the case. The CPUs run in the maximum P-state for a while and then are requested to run slower and go back to the maximum P-state after a while again. That causes the actual frequency of the processor to visibly oscillate below the sustainable maximum in a jittery fashion which clearly is not desirable. That has been attributed to CPU utilization metric updates on task migration that cause the total utilization value for the CPU to be reduced by the utilization of the migrated task. 
If that happens, the schedutil governor may see a CPU utilization reduction and will attempt to reduce the CPU frequency accordingly right away. That may be premature, though, for example if the system is generally busy and there are other runnable tasks waiting to be run on that CPU already. This is unlikely to be an issue on systems where cpufreq policies are shared between multiple CPUs, because in those cases the policy utilization is computed as the maximum of the CPU utilization values over the whole policy and if that turns out to be low, reducing the frequency for the policy most likely is a good idea anyway. On systems with one CPU per policy, however, it may affect performance adversely and even lead to increased energy consumption in some cases. On those systems it may be addressed by taking another utilization metric into consideration, like whether or not the CPU whose frequency is about to be reduced has been idle recently, because if that's not the case, the CPU is likely to be busy in the near future and its frequency should not be reduced. To that end, use the counter of idle calls in the timekeeping code. Namely, make the schedutil governor look at that counter for the current CPU every time before its frequency is about to be reduced. If the counter has not changed since the previous iteration of the governor computations for that CPU, the CPU has been busy for all that time and its frequency should not be decreased, so if the new frequency would be lower than the one set previously, the governor will skip the frequency update. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Reviewed-by: Joel Fernandes (cherry picked from commit b7eaf1aab9f8bd2e49fceed77ebc66c1b5800718) (simple CPUFREQ_RT_DL vs CPUFREQ_DL usage conflicts) Signed-off-by: Chris Redpath Change-Id: I531ec02c052944ee07a904dc2a25c59948ee762b --- include/linux/tick.h | 1 + kernel/sched/cpufreq_schedutil.c | 27 +++++++++++++++++++++++++++ kernel/time/tick-sched.c | 12 ++++++++++++ 3 files changed, 40 insertions(+) diff --git a/include/linux/tick.h b/include/linux/tick.h index e312219ff823..af5ac7f91a3b 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -103,6 +103,7 @@ extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); +extern unsigned long tick_nohz_get_idle_calls(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); #else /* !CONFIG_NO_HZ_COMMON */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e1a2ed98991f..49922187b57c 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -72,6 +72,11 @@ struct sugov_cpu { unsigned long util; unsigned long max; unsigned int flags; + + /* The field below is for single-CPU policies only. 
*/ +#ifdef CONFIG_NO_HZ_COMMON + unsigned long saved_idle_calls; +#endif }; static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); @@ -247,6 +252,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, sg_cpu->iowait_boost >>= 1; } +#ifdef CONFIG_NO_HZ_COMMON +static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) +{ + unsigned long idle_calls = tick_nohz_get_idle_calls(); + bool ret = idle_calls == sg_cpu->saved_idle_calls; + + sg_cpu->saved_idle_calls = idle_calls; + return ret; +} +#else +static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } +#endif /* CONFIG_NO_HZ_COMMON */ + static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int flags) { @@ -255,6 +273,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, struct cpufreq_policy *policy = sg_policy->policy; unsigned long util, max; unsigned int next_f; + bool busy; sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -262,12 +281,20 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; + busy = sugov_cpu_is_busy(sg_cpu); + if (flags & SCHED_CPUFREQ_DL) { next_f = policy->cpuinfo.max_freq; } else { sugov_get_util(&util, &max, time); sugov_iowait_boost(sg_cpu, &util, &max); next_f = get_next_freq(sg_policy, util, max); + /* + * Do not reduce the frequency if the CPU has not been idle + * recently, as the reduction is likely to be premature then. + */ + if (busy && next_f < sg_policy->next_freq) + next_f = sg_policy->next_freq; } sugov_update_commit(sg_policy, time, next_f); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 22c57e191a23..34f9a9c417d9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -870,6 +870,18 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +/** + * tick_nohz_get_idle_calls - return the current idle calls counter value + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->idle_calls; +} + static void tick_nohz_account_idle_ticks(struct tick_sched *ts) { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE From 2ee9941b0bbcc6a9b047d78bcb9f18be98a15625 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 25 May 2017 15:27:07 +0100 Subject: [PATCH 0950/3220] UPSTREAM: cpufreq: schedutil: Trace frequency only if it has changed sugov_update_commit() calls trace_cpu_frequency() to record the current CPU frequency if it has not changed in the fast switch case to prevent utilities from getting confused (they may report that the CPU is idle if the frequency has not been recorded for too long, for example). However, that may cause the tracepoint to be triggered quite often for no real reason (if the frequency doesn't change, we will not modify the last update time stamp and governor computations may run again shortly when that happens), so don't do that (arguably, it is done to work around a utilities bug anyway). That allows code duplication in sugov_update_commit() to be reduced somewhat too. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar (cherry picked from commit 38d4ea229d25d30be6bf41bcd6cd663a587866ca) (conflicts with sugov_up_down_rate_limit resolved) Signed-off-by: Chris Redpath Change-Id: Ia019dda29b8c1c4cf3553da75c88d066eb5674e9 --- kernel/sched/cpufreq_schedutil.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 49922187b57c..e12309c1b07b 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -132,22 +132,20 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, if (sugov_up_down_rate_limit(sg_policy, time, next_freq)) return; + if (sg_policy->next_freq == next_freq) + return; + + sg_policy->next_freq = next_freq; + sg_policy->last_freq_update_time = time; + if (policy->fast_switch_enabled) { - if (sg_policy->next_freq == next_freq) { - trace_cpu_frequency(policy->cur, smp_processor_id()); - return; - } - sg_policy->next_freq = next_freq; - sg_policy->last_freq_update_time = time; next_freq = cpufreq_driver_fast_switch(policy, next_freq); if (next_freq == CPUFREQ_ENTRY_INVALID) return; policy->cur = next_freq; trace_cpu_frequency(next_freq, smp_processor_id()); - } else if (sg_policy->next_freq != next_freq) { - sg_policy->next_freq = next_freq; - sg_policy->last_freq_update_time = time; + } else { sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); } From 89ce9d97e661ee44741bd1ab471c4d9a8a27d077 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Tue, 20 Jun 2017 13:54:44 -0700 Subject: [PATCH 0951/3220] Add BINDER_GET_NODE_DEBUG_INFO ioctl The BINDER_GET_NODE_DEBUG_INFO ioctl will return debug info on a node. Each successive call reusing the previous return value will return the next node. The data will be used by libmemunreachable to mark the pointers with kernel references as reachable. 
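As a usage sketch (not part of this patch), a process could walk its own nodes by feeding each returned ptr back into the next call. dump_nodes() and its error handling are illustrative assumptions; the ioctl and structure are the ones added below:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/android/binder.h>

/* Hypothetical helper: print debug info for every node of the process
 * that owns binder_fd, in ascending ptr order. */
static void dump_nodes(int binder_fd)
{
	struct binder_node_debug_info info = { .ptr = 0 }; /* 0 selects the first node */

	for (;;) {
		if (ioctl(binder_fd, BINDER_GET_NODE_DEBUG_INFO, &info) < 0)
			return; /* driver error */
		if (!info.ptr)
			break;  /* ptr stays 0 once all nodes have been seen */
		printf("node u%016llx c%016llx strong=%u weak=%u\n",
		       (unsigned long long)info.ptr,
		       (unsigned long long)info.cookie,
		       info.has_strong_ref, info.has_weak_ref);
		/* the returned info.ptr seeds the next iteration */
	}
}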
Bug: 28275695 Change-Id: Idbbafa648a33822dc023862cd92b51a595cf7c1c Signed-off-by: Colin Cross Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 42 +++++++++++++++++++++++++++++ include/uapi/linux/android/binder.h | 14 ++++++++++ 2 files changed, 56 insertions(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 6b24f70dbff8..4247d7d8b8e6 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -4548,6 +4548,30 @@ out: return ret; } +static int binder_ioctl_get_node_debug_info(struct binder_proc *proc, + struct binder_node_debug_info *info) { + struct rb_node *n; + binder_uintptr_t ptr = info->ptr; + + memset(info, 0, sizeof(*info)); + + binder_inner_proc_lock(proc); + for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) { + struct binder_node *node = rb_entry(n, struct binder_node, + rb_node); + if (node->ptr > ptr) { + info->ptr = node->ptr; + info->cookie = node->cookie; + info->has_strong_ref = node->has_strong_ref; + info->has_weak_ref = node->has_weak_ref; + break; + } + } + binder_inner_proc_unlock(proc); + + return 0; +} + static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int ret; @@ -4615,6 +4639,24 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } break; } + case BINDER_GET_NODE_DEBUG_INFO: { + struct binder_node_debug_info info; + + if (copy_from_user(&info, ubuf, sizeof(info))) { + ret = -EFAULT; + goto err; + } + + ret = binder_ioctl_get_node_debug_info(proc, &info); + if (ret < 0) + goto err; + + if (copy_to_user(ubuf, &info, sizeof(info))) { + ret = -EFAULT; + goto err; + } + break; + } default: ret = -EINVAL; goto err; diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 70e252bf0be0..5539933b3491 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -233,6 +233,19 @@ struct binder_version { #define BINDER_CURRENT_PROTOCOL_VERSION 8 #endif +/* + * Use with BINDER_GET_NODE_DEBUG_INFO, driver reads ptr, writes to all fields. + * Set ptr to NULL for the first call to get the info for the first node, and + * then repeat the call passing the previously returned value to get the next + * nodes. ptr will be 0 when there are no more nodes. + */ +struct binder_node_debug_info { + binder_uintptr_t ptr; + binder_uintptr_t cookie; + __u32 has_strong_ref; + __u32 has_weak_ref; +}; + #define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read) #define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64) #define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32) @@ -240,6 +253,7 @@ struct binder_version { #define BINDER_SET_CONTEXT_MGR _IOW('b', 7, __s32) #define BINDER_THREAD_EXIT _IOW('b', 8, __s32) #define BINDER_VERSION _IOWR('b', 9, struct binder_version) +#define BINDER_GET_NODE_DEBUG_INFO _IOWR('b', 11, struct binder_node_debug_info) /* * NOTE: Two special error codes you should check for when calling From 76b376eac7a2e9584afb58af87d5451f55b82e3d Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 26 May 2017 10:48:56 -0700 Subject: [PATCH 0952/3220] ANDROID: binder: don't check prio permissions on restore. 
Because we have disabled RT priority inheritance for the regular binder domain, the following can happen: 1) thread A (prio 98) calls into thread B 2) because RT prio inheritance is disabled, thread B runs at the lowest nice (prio 100) instead 3) thread B calls back into A; A will run at prio 100 for the duration of the transaction 4) When thread A is done with the call from B, we will try to restore the prio back to 98. But, we fail because the process doesn't hold CAP_SYS_NICE, neither is RLIMIT_RT_PRIO set. While the proper fix going forward will be to correctly apply CAP_SYS_NICE or RLIMIT_RT_PRIO, for now it seems reasonable to not check permissions on the restore path. Change-Id: Ibede5960c9b7bb786271c001e405de50be64d944 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 4247d7d8b8e6..29e3e3f21443 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1109,8 +1109,9 @@ static int to_kernel_prio(int policy, int user_priority) return MAX_USER_RT_PRIO - 1 - user_priority; } -static void binder_set_priority(struct task_struct *task, - struct binder_priority desired) +static void binder_do_set_priority(struct task_struct *task, + struct binder_priority desired, + bool verify) { int priority; /* user-space prio value */ bool has_cap_nice; @@ -1123,7 +1124,7 @@ static void binder_set_priority(struct task_struct *task, priority = to_userspace_prio(policy, desired.prio); - if (is_rt_policy(policy) && !has_cap_nice) { + if (verify && is_rt_policy(policy) && !has_cap_nice) { long max_rtprio = task_rlimit(task, RLIMIT_RTPRIO); if (max_rtprio == 0) { @@ -1134,7 +1135,7 @@ static void binder_set_priority(struct task_struct *task, } } - if (is_fair_policy(policy) && !has_cap_nice) { + if (verify && is_fair_policy(policy) && !has_cap_nice) { long min_nice = rlimit_to_nice(task_rlimit(task, RLIMIT_NICE)); if (min_nice > MAX_NICE) { @@ -1167,6 +1168,18 @@ static void binder_set_priority(struct task_struct *task, set_user_nice(task, priority); } +static void binder_set_priority(struct task_struct *task, + struct binder_priority desired) +{ + binder_do_set_priority(task, desired, /* verify = */ true); +} + +static void binder_restore_priority(struct task_struct *task, + struct binder_priority desired) +{ + binder_do_set_priority(task, desired, /* verify = */ false); +} + static void binder_transaction_priority(struct task_struct *task, struct binder_transaction *t, struct binder_priority node_prio, @@ -3204,7 +3217,7 @@ static void binder_transaction(struct binder_proc *proc, binder_enqueue_work_ilocked(&t->work, &target_thread->todo); binder_inner_proc_unlock(target_proc); wake_up_interruptible_sync(&target_thread->wait); - binder_set_priority(current, in_reply_to->saved_priority); + binder_restore_priority(current, in_reply_to->saved_priority); binder_free_transaction(in_reply_to); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); @@ -3293,7 +3306,7 @@ err_no_context_mgr_node: BUG_ON(thread->return_error.cmd != BR_OK); if (in_reply_to) { - binder_set_priority(current, in_reply_to->saved_priority); + binder_restore_priority(current, in_reply_to->saved_priority); thread->return_error.cmd = BR_TRANSACTION_COMPLETE; binder_enqueue_work(thread->proc, &thread->return_error.work, @@ -3893,7 +3906,7 @@ retry: wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); } - 
binder_set_priority(current, proc->default_priority); + binder_restore_priority(current, proc->default_priority); } if (non_block) { From d01a860b54f7fe60ae199d05c6e463be8e616bc1 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 19 Jul 2017 17:16:43 -0700 Subject: [PATCH 0953/3220] ANDROID: sdcardfs: Remove unnecessary lock The mmap_sem lock does not appear to be protecting anything, and has been removed in Samsung's more recent versions of sdcardfs. Signed-off-by: Daniel Rosenberg Change-Id: I76ff3e33002716b8384fc8be368028ed63dffe4e Bug: 63785372 --- fs/sdcardfs/inode.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 60fea424835f..103dc45a131f 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -766,13 +766,9 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct * afterwards in the other cases: we fsstack_copy_inode_size from * the lower level. */ - if (current->mm) - down_write(¤t->mm->mmap_sem); if (ia->ia_valid & ATTR_SIZE) { err = inode_newsize_ok(&tmp, ia->ia_size); if (err) { - if (current->mm) - up_write(¤t->mm->mmap_sem); goto out; } truncate_setsize(inode, ia->ia_size); @@ -795,8 +791,6 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct err = notify_change2(lower_mnt, lower_dentry, &lower_ia, /* note: lower_ia */ NULL); mutex_unlock(&d_inode(lower_dentry)->i_mutex); - if (current->mm) - up_write(¤t->mm->mmap_sem); if (err) goto out; From 492a6047e7b5a1383627189e8bff99ff181ef4a4 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 3 Jun 2016 13:16:59 -0700 Subject: [PATCH 0954/3220] ANDROID: android-verity: mark dev as rw for linear target Mark as rw when adding as linear target to allow changes to the underlying filesystem through adb disable verity and adb remount. (Cherry-picked from https://partner-android-review.googlesource.com/#/c/613573/ 79a3032bb62da65a5d724eb70c8bdc662945d475) BUG: 28845874 Signed-off-by: Badhri Jagan Sridharan Change-Id: If41e9cad8e0f054f4778c09a6e2f0cb8af6fddaf --- drivers/md/dm-android-verity.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index c3c9502baf18..c521df010ee3 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -645,6 +645,8 @@ static int add_as_linear_device(struct dm_target *ti, char *dev) android_verity_target.iterate_devices = dm_linear_iterate_devices, android_verity_target.io_hints = NULL; + set_disk_ro(dm_disk(dm_table_get_md(ti->table)), 0); + err = dm_linear_ctr(ti, DM_LINEAR_ARGS, linear_table_args); if (!err) { From 362e08d2572fd592b6a5322763977d898ebefba2 Mon Sep 17 00:00:00 2001 From: Daniel Mentz Date: Fri, 7 Jul 2017 11:27:31 -0700 Subject: [PATCH 0955/3220] Revert "proc: smaps: Allow smaps access for CAP_SYS_RESOURCE" This reverts commit 9d19f72b43f495f6f1ef1268dbed1bbade8dea24. This fixes CVE-2017-0710. SELinux allows more fine grained control: We grant processes that need access to smaps CAP_SYS_PTRACE but prohibit them from using ptrace attach(). 
Bug: 34951864 Bug: 36468447 Change-Id: I8ea67f8771ec212950bc251ee750bd8a7e7c0643 Signed-off-by: Daniel Mentz --- kernel/fork.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 968917653c2c..68cfda1c1800 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -827,8 +827,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mm = get_task_mm(task); if (mm && mm != current->mm && - !ptrace_may_access(task, mode) && - !capable(CAP_SYS_RESOURCE)) { + !ptrace_may_access(task, mode)) { mmput(mm); mm = ERR_PTR(-EACCES); } From ca2ecff8649ee9db3d00fca5fc20e5a972fde18f Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 21 Jul 2017 09:41:06 -0700 Subject: [PATCH 0956/3220] ANDROID: lowmemorykiller: Add tgid to kill message Bug: 33346201 Test: trigger LMK and verify tgid is there Change-Id: I047abc3aa541522766d2a84ebb4c77caf57a18a3 Signed-off-by: Joel Fernandes --- drivers/staging/android/lowmemorykiller.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index 606d13a1ea74..2f4ef2120d31 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -177,11 +177,11 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc) mark_oom_victim(selected); task_unlock(selected); trace_lowmemory_kill(selected, cache_size, cache_limit, free); - lowmem_print(1, "Killing '%s' (%d), adj %hd,\n" \ + lowmem_print(1, "Killing '%s' (%d) (tgid %d), adj %hd,\n" \ " to free %ldkB on behalf of '%s' (%d) because\n" \ " cache %ldkB is below limit %ldkB for oom_score_adj %hd\n" \ " Free memory is %ldkB above reserved\n", - selected->comm, selected->pid, + selected->comm, selected->pid, selected->tgid, selected_oom_score_adj, selected_tasksize * (long)(PAGE_SIZE / 1024), current->comm, current->pid, From 3715386152f1cd1585273af3ff950d4b0378e524 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 13 Apr 2017 18:35:59 +0800 Subject: [PATCH 0957/3220] UPSTREAM: af_key: Fix sadb_x_ipsecrequest parsing (cherry picked from commit 096f41d3a8fcbb8dde7f71379b1ca85fe213eded) The parsing of sadb_x_ipsecrequest is broken in a number of ways. First of all we're not verifying sadb_x_ipsecrequest_len. This is needed when the structure carries addresses at the end. Worse we don't even look at the length when we parse those optional addresses. The migration code had similar parsing code that's better but it also has some deficiencies. The length is overcounted first of all as it includes the header itself. It also fails to check the length before dereferencing the sa_family field. This patch fixes those problems in parse_sockaddr_pair and then uses it in parse_ipsecrequest. 
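The validation pattern the fix converges on can be sketched as follows (illustrative only; walk_one() stands in for the real per-record handling in af_key.c): every record must claim at least its own header, must not claim more bytes than remain, and any trailing addresses are parsed against the record's declared length instead of being dereferenced blindly:

static int walk_ipsecrequests(struct sadb_x_ipsecrequest *rq, int len)
{
	while (len >= (int)sizeof(*rq)) {
		/* the record must cover its header and fit in what is left */
		if (rq->sadb_x_ipsecrequest_len < sizeof(*rq) ||
		    rq->sadb_x_ipsecrequest_len > len)
			return -EINVAL;
		/* optional addresses live after the header; pass their exact
		 * size so sa_family is never read past the record */
		walk_one(rq, rq->sadb_x_ipsecrequest_len - sizeof(*rq));
		len -= rq->sadb_x_ipsecrequest_len;
		rq = (struct sadb_x_ipsecrequest *)
			((u8 *)rq + rq->sadb_x_ipsecrequest_len);
	}
	return 0;
}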
Reported-by: Andrey Konovalov Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert Bug: 63963140 Change-Id: Ie8854131293bc8153bfb8cb255161ec8f03667f9 --- net/key/af_key.c | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/net/key/af_key.c b/net/key/af_key.c index e67c28e614b9..d8d95b6415e4 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -65,6 +65,10 @@ struct pfkey_sock { } dump; }; +static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, + xfrm_address_t *saddr, xfrm_address_t *daddr, + u16 *family); + static inline struct pfkey_sock *pfkey_sk(struct sock *sk) { return (struct pfkey_sock *)sk; @@ -1922,19 +1926,14 @@ parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq) /* addresses present only in tunnel mode */ if (t->mode == XFRM_MODE_TUNNEL) { - u8 *sa = (u8 *) (rq + 1); - int family, socklen; + int err; - family = pfkey_sockaddr_extract((struct sockaddr *)sa, - &t->saddr); - if (!family) - return -EINVAL; - - socklen = pfkey_sockaddr_len(family); - if (pfkey_sockaddr_extract((struct sockaddr *)(sa + socklen), - &t->id.daddr) != family) - return -EINVAL; - t->encap_family = family; + err = parse_sockaddr_pair( + (struct sockaddr *)(rq + 1), + rq->sadb_x_ipsecrequest_len - sizeof(*rq), + &t->saddr, &t->id.daddr, &t->encap_family); + if (err) + return err; } else t->encap_family = xp->family; @@ -1954,7 +1953,11 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) if (pol->sadb_x_policy_len * 8 < sizeof(struct sadb_x_policy)) return -EINVAL; - while (len >= sizeof(struct sadb_x_ipsecrequest)) { + while (len >= sizeof(*rq)) { + if (len < rq->sadb_x_ipsecrequest_len || + rq->sadb_x_ipsecrequest_len < sizeof(*rq)) + return -EINVAL; + if ((err = parse_ipsecrequest(xp, rq)) < 0) return err; len -= rq->sadb_x_ipsecrequest_len; @@ -2417,7 +2420,6 @@ out: return err; } -#ifdef CONFIG_NET_KEY_MIGRATE static int pfkey_sockaddr_pair_size(sa_family_t family) { return PFKEY_ALIGN8(pfkey_sockaddr_len(family) * 2); @@ -2429,7 +2431,7 @@ static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, { int af, socklen; - if (ext_len < pfkey_sockaddr_pair_size(sa->sa_family)) + if (ext_len < 2 || ext_len < pfkey_sockaddr_pair_size(sa->sa_family)) return -EINVAL; af = pfkey_sockaddr_extract(sa, saddr); @@ -2445,6 +2447,7 @@ static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, return 0; } +#ifdef CONFIG_NET_KEY_MIGRATE static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len, struct xfrm_migrate *m) { @@ -2452,13 +2455,14 @@ static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len, struct sadb_x_ipsecrequest *rq2; int mode; - if (len <= sizeof(struct sadb_x_ipsecrequest) || - len < rq1->sadb_x_ipsecrequest_len) + if (len < sizeof(*rq1) || + len < rq1->sadb_x_ipsecrequest_len || + rq1->sadb_x_ipsecrequest_len < sizeof(*rq1)) return -EINVAL; /* old endoints */ err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1), - rq1->sadb_x_ipsecrequest_len, + rq1->sadb_x_ipsecrequest_len - sizeof(*rq1), &m->old_saddr, &m->old_daddr, &m->old_family); if (err) @@ -2467,13 +2471,14 @@ static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len, rq2 = (struct sadb_x_ipsecrequest *)((u8 *)rq1 + rq1->sadb_x_ipsecrequest_len); len -= rq1->sadb_x_ipsecrequest_len; - if (len <= sizeof(struct sadb_x_ipsecrequest) || - len < rq2->sadb_x_ipsecrequest_len) + if (len <= sizeof(*rq2) || + len < rq2->sadb_x_ipsecrequest_len || + 
rq2->sadb_x_ipsecrequest_len < sizeof(*rq2)) return -EINVAL; /* new endpoints */ err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1), - rq2->sadb_x_ipsecrequest_len, + rq2->sadb_x_ipsecrequest_len - sizeof(*rq2), &m->new_saddr, &m->new_daddr, &m->new_family); if (err) From 5680f23f20c73f6348fe73dc23a025a965d69e28 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 29 Mar 2017 09:01:06 +0100 Subject: [PATCH 0958/3220] sched/fair: streamline find_best_target heuristics The find_best_target() code has evolved over time to integrate different micro-optimizations, to the point that it is now quite difficult to follow exactly what it's doing. This patch refactors the existing code to make it more readable and easier to maintain. It does that by properly identifying the three main use-cases and addressing them in priority order: A) latency sensitive tasks B) non latency sensitive tasks on IDLE CPUs C) non latency sensitive tasks on ACTIVE CPUs The original behaviors are preserved. Some tests to compare power/performances before and after this patch have been done using Jankbench and YouTube and we did not notice any significant differences. The only difference with respect to the original code is a small update to favor lower-capacity idle CPUs in case B. The same preference is not enforced in case A since this can lead to a selection of a non-reserved CPU for TOP_APP tasks, which ultimately can lead to undesirable co-scheduling side-effects. Change-Id: I871e5d95af89176217e4e239b64d44a420baabe8 Signed-off-by: Patrick Bellasi (removed checkpatch whitespace error) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 262 +++++++++++++++++++++++++++++++++----------- kernel/sched/walt.h | 2 + 2 files changed, 197 insertions(+), 67 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 226d1990eea9..a61c47a030a0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6189,46 +6189,54 @@ static int start_cpu(bool boosted) static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) { - int target_cpu = -1; - unsigned long target_util = prefer_idle ?
ULONG_MAX : 0; - unsigned long backup_capacity = ULONG_MAX; - int best_idle_cpu = -1; - int best_idle_cstate = INT_MAX; - int backup_cpu = -1; + unsigned long best_idle_min_cap_orig = ULONG_MAX; unsigned long min_util = boosted_task_util(p); + unsigned long target_capacity = ULONG_MAX; + unsigned long min_wake_util = ULONG_MAX; + unsigned long target_max_spare_cap = 0; + unsigned long target_util = ULONG_MAX; + unsigned long best_active_util = ULONG_MAX; + int best_idle_cstate = INT_MAX; struct sched_domain *sd; struct sched_group *sg; - int cpu = start_cpu(boosted); + int best_active_cpu = -1; + int best_idle_cpu = -1; + int target_cpu = -1; + int cpu, i; schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts); schedstat_inc(this_rq(), eas_stats.fbt_attempts); + /* Find start CPU based on boost value */ + cpu = start_cpu(boosted); if (cpu < 0) { schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu); schedstat_inc(this_rq(), eas_stats.fbt_no_cpu); - return target_cpu; + return -1; } + /* Find SD for the start CPU */ sd = rcu_dereference(per_cpu(sd_ea, cpu)); - if (!sd) { schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd); schedstat_inc(this_rq(), eas_stats.fbt_no_sd); - return target_cpu; + return -1; } + /* Scan CPUs in all SDs */ sg = sd->groups; - do { - int i; - for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { - unsigned long cur_capacity, new_util, wake_util; - unsigned long min_wake_util = ULONG_MAX; + unsigned long capacity_curr = capacity_curr_of(i); + unsigned long capacity_orig = capacity_orig_of(i); + unsigned long wake_util, new_util; if (!cpu_online(i)) continue; + if (walt_cpu_high_irqload(i)) + continue; + /* * p's blocked utilization is still accounted for on prev_cpu * so prev_cpu will receive a negative bias due to the double @@ -6243,70 +6251,190 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre * than the one required to boost the task. */ new_util = max(min_util, new_util); - - if (new_util > capacity_orig_of(i)) + if (new_util > capacity_orig) continue; -#ifdef CONFIG_SCHED_WALT - if (walt_cpu_high_irqload(i)) - continue; -#endif - /* - * Unconditionally favoring tasks that prefer idle cpus to + * Case A) Latency sensitive tasks + * + * Unconditionally favoring tasks that prefer idle CPU to * improve latency. + * + * Looking for: + * - an idle CPU, whatever its idle_state is, since + * the first CPUs we explore are more likely to be + * reserved for latency sensitive tasks. + * - a non idle CPU where the task fits in its current + * capacity and has the maximum spare capacity. + * - a non idle CPU with lower contention from other + * tasks and running at the lowest possible OPP. + * + * The last two goals try to favor a non idle CPU + * where the task can run as if it is "almost alone". + * A maximum spare capacity CPU is favoured since + * the task already fits into that CPU's capacity + * without waiting for an OPP chance. + * + * The following code path is the only one in the CPUs + * exploration loop which is always used by + * prefer_idle tasks. It exits the loop with either a + * best_active_cpu or a target_cpu which should + * represent an optimal choice for latency sensitive + * tasks.
*/ - if (idle_cpu(i) && prefer_idle) { - schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle); - schedstat_inc(this_rq(), eas_stats.fbt_pref_idle); - return i; - } + if (prefer_idle) { - cur_capacity = capacity_curr_of(i); - - if (new_util < cur_capacity) { - if (cpu_rq(i)->nr_running) { - /* - * Find a target cpu with the lowest/highest - * utilization if prefer_idle/!prefer_idle. - */ - if (prefer_idle) { - /* Favor the CPU that last ran the task */ - if (new_util > target_util || - wake_util > min_wake_util) - continue; - min_wake_util = wake_util; - target_util = new_util; - target_cpu = i; - } else if (target_util < new_util) { - target_util = new_util; - target_cpu = i; - } - } else if (!prefer_idle) { - int idle_idx = idle_get_state_idx(cpu_rq(i)); - - if (best_idle_cpu < 0 || - (sysctl_sched_cstate_aware && - best_idle_cstate > idle_idx)) { - best_idle_cstate = idle_idx; - best_idle_cpu = i; - } + /* + * Case A.1: IDLE CPU + * Return the first IDLE CPU we find. + */ + if (idle_cpu(i)) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle); + schedstat_inc(this_rq(), eas_stats.fbt_pref_idle); + return i; } - } else if (backup_capacity > cur_capacity) { - /* Find a backup cpu with least capacity. */ - backup_capacity = cur_capacity; - backup_cpu = i; + + /* + * Case A.2: Target ACTIVE CPU + * Favor CPUs with max spare capacity. + */ + if ((capacity_curr > new_util) && + (capacity_orig - new_util > target_max_spare_cap)) { + target_max_spare_cap = capacity_orig - new_util; + target_cpu = i; + continue; + } + if (target_cpu != -1) + continue; + + + /* + * Case A.3: Backup ACTIVE CPU + * Favor CPUs with: + * - lower utilization due to other tasks + * - lower utilization with the task in + */ + if (wake_util > min_wake_util) + continue; + if (new_util > best_active_util) + continue; + min_wake_util = wake_util; + best_active_util = new_util; + best_active_cpu = i; + continue; } + + /* + * Case B) Non latency sensitive tasks on IDLE CPUs. + * + * Find an optimal backup IDLE CPU for non latency + * sensitive tasks. + * + * Looking for: + * - minimizing the capacity_orig, + * i.e. preferring LITTLE CPUs + * - favoring shallowest idle states + * i.e. avoid waking up deep-idle CPUs + * + * The following code path is used by non latency + * sensitive tasks if IDLE CPUs are available. If at + * least one such CPU is available, it sets the + * best_idle_cpu to the most suitable idle CPU to be + * selected. + * + * If idle CPUs are available, favour these CPUs to + * improve performance by spreading tasks. + * Indeed, the energy_diff() computed by the caller + * will take care to ensure the minimization of energy + * consumption without affecting performance. + */ + if (idle_cpu(i)) { + int idle_idx = idle_get_state_idx(cpu_rq(i)); + + /* Select idle CPU with lower cap_orig */ + if (capacity_orig > best_idle_min_cap_orig) + continue; + + /* + * Skip CPUs in deeper idle state, but only + * if they are also less energy efficient. + * IOW, prefer a deep IDLE LITTLE CPU vs a + * shallow idle big CPU. + */ + if (sysctl_sched_cstate_aware && + best_idle_cstate <= idle_idx) + continue; + + /* Keep track of best idle CPU */ + best_idle_min_cap_orig = capacity_orig; + best_idle_cstate = idle_idx; + best_idle_cpu = i; + continue; + } + + /* + * Case C) Non latency sensitive tasks on ACTIVE CPUs. + * + * Pack tasks in the most energy efficient capacities. + * + * This task packing strategy prefers more energy + * efficient CPUs (i.e.
pack on smaller maximum + * capacity CPUs) while also trying to spread tasks to + * run them all at the lower OPP. + * + * This assumes for example that it's more energy + * efficient to run two tasks on two CPUs at a lower + * OPP than packing both on a single CPU but running + * that CPU at an higher OPP. + * + * Thus, this case keep track of the CPU with the + * smallest maximum capacity and highest spare maximum + * capacity. + */ + + /* Favor CPUs with smaller capacity */ + if (capacity_orig > target_capacity) + continue; + + /* Favor CPUs with maximum spare capacity */ + if ((capacity_orig - new_util) < target_max_spare_cap) + continue; + + target_max_spare_cap = capacity_orig - new_util; + target_capacity = capacity_orig; + target_util = new_util; + target_cpu = i; } + } while (sg = sg->next, sg != sd->groups); - if (target_cpu < 0) - target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu; + /* + * For non latency sensitive tasks, cases B and C in the previous loop, + * we pick the best IDLE CPU only if we was not able to find a target + * ACTIVE CPU. + * + * Policies priorities: + * + * - prefer_idle tasks: + * + * a) IDLE CPU available, we return immediately + * b) ACTIVE CPU where task fits and has the bigger maximum spare + * capacity (i.e. target_cpu) + * c) ACTIVE CPU with less contention due to other tasks + * (i.e. best_active_cpu) + * + * - NON prefer_idle tasks: + * + * a) ACTIVE CPU: target_cpu + * b) IDLE CPU: best_idle_cpu + */ + if (target_cpu == -1) + target_cpu = prefer_idle + ? best_active_cpu + : best_idle_cpu; - if (target_cpu >= 0) { - schedstat_inc(p, se.statistics.nr_wakeups_fbt_count); - schedstat_inc(this_rq(), eas_stats.fbt_count); - } + schedstat_inc(p, se.statistics.nr_wakeups_fbt_count); + schedstat_inc(this_rq(), eas_stats.fbt_count); return target_cpu; } diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h index e181c87a928d..f56c4da16d0b 100644 --- a/kernel/sched/walt.h +++ b/kernel/sched/walt.h @@ -55,6 +55,8 @@ static inline void walt_migrate_sync_cpu(int cpu) { } static inline void walt_init_cpu_efficiency(void) { } static inline u64 walt_ktime_clock(void) { return 0; } +#define walt_cpu_high_irqload(cpu) false + #endif /* CONFIG_SCHED_WALT */ extern unsigned int walt_disabled; From bf6cd4d156b7b4ef09d00de92616eef49bb0efc7 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 29 Jun 2017 12:24:27 +0100 Subject: [PATCH 0959/3220] events: add tracepoint for find_best_target Change-Id: I4c245ffacb207d7ea826c5763a426efe5399e0a2 Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 42 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 10 +++++++++ 2 files changed, 52 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 433d3919ba00..8b97464803f1 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -877,6 +877,48 @@ TRACE_EVENT(sched_boost_task, __entry->margin) ); +/* + * Tracepoint for find_best_target + */ +TRACE_EVENT(sched_find_best_target, + + TP_PROTO(struct task_struct *tsk, bool prefer_idle, + unsigned long min_util, int start_cpu, + int best_idle, int best_active, int target), + + TP_ARGS(tsk, prefer_idle, min_util, start_cpu, + best_idle, best_active, target), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( unsigned long, min_util ) + __field( bool, prefer_idle ) + __field( int, start_cpu ) + __field( int, best_idle ) + __field( int, best_active ) + __field( int, target ) + ), + + TP_fast_assign( + 
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->min_util = min_util; + __entry->prefer_idle = prefer_idle; + __entry->start_cpu = start_cpu; + __entry->best_idle = best_idle; + __entry->best_active = best_active; + __entry->target = target; + ), + + TP_printk("pid=%d comm=%s prefer_idle=%d start_cpu=%d " + "best_idle=%d best_active=%d target=%d", + __entry->pid, __entry->comm, + __entry->prefer_idle, __entry->start_cpu, + __entry->best_idle, __entry->best_active, + __entry->target) +); + /* * Tracepoint for accounting sched group energy */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a61c47a030a0..e6336157feec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6291,6 +6291,12 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre if (idle_cpu(i)) { schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle); schedstat_inc(this_rq(), eas_stats.fbt_pref_idle); + + trace_sched_find_best_target(p, + prefer_idle, min_util, + cpu, best_idle_cpu, + best_active_cpu, i); + return i; } @@ -6433,6 +6439,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre ? best_active_cpu : best_idle_cpu; + trace_sched_find_best_target(p, prefer_idle, min_util, cpu, + best_idle_cpu, best_active_cpu, + target_cpu); + schedstat_inc(p, se.statistics.nr_wakeups_fbt_count); schedstat_inc(this_rq(), eas_stats.fbt_count); From 7b63e1ff52134bbc27c6e30c1fd909ff9a87bc2b Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Fri, 28 Apr 2017 15:23:33 +0100 Subject: [PATCH 0960/3220] sched/fair: Update signals of nohz cpus if we are going idle Stale cpu utilization signals can cause havoc for energy-aware systems, and they are caused by no updates being performed for cpus which have no tick running. There is open debate about when is the correct time to update these cpus, and general recognition that something needs to be done. This is an attempt to do something useful. When we are looking for a task to pull for a newly-idle cpu, we have an opportunity to update the stats for any cpu which has no tick running without causing too much disturbance to the system or waking it up. Change-Id: I0280104ea9c53e56c26f1c56a62bacab5d3e951b Signed-off-by: Chris Redpath Signed-off-by: Brendan Jackman --- kernel/sched/fair.c | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e6336157feec..dff742f276d7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8051,6 +8051,38 @@ group_type group_classify(struct sched_group *group, return group_other; } +#ifdef CONFIG_NO_HZ_COMMON +/* + * idle load balancing data + * - used by the nohz balance, but we want it available here + * so that we can see which CPUs have no tick. 
+ */
+static struct {
+	cpumask_var_t idle_cpus_mask;
+	atomic_t nr_cpus;
+	unsigned long next_balance;	/* in jiffy units */
+} nohz ____cacheline_aligned;
+
+static inline void update_cpu_stats_if_tickless(struct rq *rq)
+{
+	/* only called from update_sg_lb_stats when irqs are disabled */
+	if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
+		/* rate limit updates to once per jiffy at most */
+		if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
+			return;
+
+		raw_spin_lock(&rq->lock);
+		update_rq_clock(rq);
+		update_idle_cpu_load(rq);
+		update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
+		raw_spin_unlock(&rq->lock);
+	}
+}
+
+#else
+static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
+#endif
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -8074,6 +8106,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
+		/* if we are entering idle and there are CPUs with
+		 * their tick stopped, do an update for them
+		 */
+		if (env->idle == CPU_NEWLY_IDLE)
+			update_cpu_stats_if_tickless(rq);
+
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
 			load = target_load(i, load_idx);
@@ -9314,12 +9352,6 @@ static inline int on_null_domain(struct rq *rq)
  * needed, they will kick the idle load balancer, which then does idle
  * load balancing for all the idle CPUs.
  */
-static struct {
-	cpumask_var_t idle_cpus_mask;
-	atomic_t nr_cpus;
-	unsigned long next_balance;	/* in jiffy units */
-} nohz ____cacheline_aligned;
-
 static inline int find_new_ilb(void)
 {
 	int ilb = cpumask_first(nohz.idle_cpus_mask);

From ebc28671a5a3a657c1f88fbde4be07c4ef395aef Mon Sep 17 00:00:00 2001
From: Leo Yan
Date: Mon, 27 Mar 2017 15:00:14 +0100
Subject: [PATCH 0961/3220] sched/fair: kick nohz idle balance for misfit task

If a CPU has a misfit task, the current code does not handle that
situation in the nohz idle balance path. As a result, the misfit task
can stay running on a little core for a long time.

So check whether the CPU has a misfit task and, if it does, kick the
nohz idle balancer so that an active balance can eventually be
executed.

Change-Id: I117d3b7404296f8de11cb960a87a6b9a54a9f348
Signed-off-by: Leo Yan
[taken from https://lists.linaro.org/pipermail/eas-dev/2016-September/000551.html]
Signed-off-by: Chris Redpath
Signed-off-by: Patrick Bellasi
---
 kernel/sched/fair.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dff742f276d7..90b44c6cca37 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9701,6 +9701,10 @@ static inline bool nohz_kick_needed(struct rq *rq)
 	    (!energy_aware() || cpu_overutilized(cpu)))
 		return true;
 
+	/* Do idle load balance if there is a misfit task */
+	if (energy_aware() && rq->misfit_task)
+		return true;
+
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
 	if (sd && !energy_aware()) {

From e76348ec5f7f4e37f827cfcee2ead8c1089912c4 Mon Sep 17 00:00:00 2001
From: Brendan Jackman
Date: Thu, 29 Jun 2017 17:24:29 +0100
Subject: [PATCH 0962/3220] Revert "sched/fair: ensure utilization signals are
 synchronized before use"

This reverts commit 83f462daa328f2f42c3c1f7f5277f71e3fa0f750.
Change-Id: I37ba36da61df2beb3a005557d9b673027f446916 Signed-off-by: Brendan Jackman --- kernel/sched/fair.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 90b44c6cca37..04eb6c66a78b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6573,16 +6573,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (sd_flag & SD_BALANCE_WAKE) { - /* - * do wake_cap unconditionally as it causes task and cpu - * utilization to be synced, and we need that for energy - * aware wakeups - */ - int _wake_cap = wake_cap(p, cpu, prev_cpu); - want_affine = !wake_wide(p) && !_wake_cap + if (sd_flag & SD_BALANCE_WAKE) + want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); - } if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) return select_energy_cpu_brute(p, prev_cpu, sync); From d96e40472807e08d8352d22c0d3b0ce142893c53 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 29 Jun 2017 17:29:31 +0100 Subject: [PATCH 0963/3220] sched/fair: Sync task util before EAS wakeup Before using a task's util_avg signal in EAS, we need to ensure that it has been synced up to the last_update_time of prev_cpu's root cfs_rq. We previously relied on the side effect of wake_cap to do that, however that does not happen when the waking CPU has the same capacity as the prev_cpu. Therefore just explicitly call sync_entity_load_avg. This may result in calling that function twice within the same select_task_rq_fair, but since last_update_time hasn't changed the second call will bail out very quickly. Change-Id: I91f1fcd71dfeb96b7f5b73418f1cf9ac311d4655 Signed-off-by: Brendan Jackman Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 04eb6c66a78b..cd72ce142974 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6501,6 +6501,8 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync prefer_idle = 0; #endif + sync_entity_load_avg(&p->se); + sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); /* Find a cpu with sufficient capacity */ tmp_target = find_best_target(p, boosted, prefer_idle); From 6cb8fcccb26e999a1745d208135c8cea7091639c Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 4 Jul 2017 10:23:03 +0100 Subject: [PATCH 0964/3220] sched/fair: Try to estimate possible idle states. In the current EAS group energy calculations, we only use the idle state of the group as it is right now. This means that there are times when EAS cannot see that we are about to remove all utilization from a group which is likely to result in us being able to idle that entire group. This is an attempt to detect that situation and at least allow the energy calculation to include savings in that scenario, regardless of what we might be able to actually achieve in the real world. If a cluster or cpu looks like it will have some idle time available to it, we try to map the utilization onto an idle state. 
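As a rough worked example of the mapping (illustrative numbers, not
taken from the patch): with nr_idle_states = 4, the deepest index
reachable by the linear map is max_idle_state_idx = nr_idle_states - 2
= 2. A group that would be left with no utilization is assigned
max_idle_state_idx + 1 = 3, the deepest state; one left with half of a
max_capacity of 1024 maps to 2 - min(2, (512 * 2) / 1024) = 1, a
middling state; and a group whose post-move utilization would still
exceed max_capacity * group_weight is pinned to state 0, i.e. assumed
to get no idle time at all.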
Change-Id: I8fcb1e507f65ae6a2c5647eeef75a4bf28c7a0c0 Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 55 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cd72ce142974..d1445e7b298a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5396,9 +5396,11 @@ static int find_new_capacity(struct energy_env *eenv, return idx; } -static int group_idle_state(struct sched_group *sg) +static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) { int i, state = INT_MAX; + int src_in_grp, dst_in_grp; + long grp_util = 0; /* Find the shallowest idle state in the sched group. */ for_each_cpu(i, sched_group_cpus(sg)) @@ -5407,6 +5409,54 @@ static int group_idle_state(struct sched_group *sg) /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */ state++; + /* + * Try to estimate if a deeper idle state is + * achievable when we move the task. + */ + for_each_cpu(i, sched_group_cpus(sg)) + grp_util += cpu_util(i); + + src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg)); + dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg)); + if (src_in_grp == dst_in_grp) { + /* both CPUs under consideration are in the same group or not in + * either group, migration should leave idle state the same. + */ + goto end; + } + /* add or remove util as appropriate to indicate what group util + * will be (worst case - no concurrent execution) after moving the task + */ + grp_util += src_in_grp ? -eenv->util_delta : eenv->util_delta; + + if (grp_util <= + ((long)sg->sgc->max_capacity * (int)sg->group_weight)) { + /* after moving, this group is at most partly + * occupied, so it should have some idle time. + */ + int max_idle_state_idx = sg->sge->nr_idle_states - 2; + int new_state = grp_util * max_idle_state_idx; + if (grp_util <= 0) + /* group will have no util, use lowest state */ + new_state = max_idle_state_idx + 1; + else { + /* for partially idle, linearly map util to idle + * states, excluding the lowest one. This does not + * correspond to the state we expect to enter in + * reality, but an indication of what might happen. + */ + new_state = min(max_idle_state_idx, (int) + (new_state / sg->sgc->max_capacity)); + new_state = max_idle_state_idx - new_state; + } + state = new_state; + } else { + /* After moving, the group will be fully occupied + * so assume it will not be idle at all. + */ + state = 0; + } +end: return state; } @@ -5479,8 +5529,9 @@ static int sched_group_energy(struct energy_env *eenv) } } - idle_idx = group_idle_state(sg); + idle_idx = group_idle_state(eenv, sg); group_util = group_norm_util(eenv, sg); + sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) >> SCHED_CAPACITY_SHIFT; sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) From 2e13f308a9854f5ff2b3735f4dc58c2a10cf7ea1 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 4 Jul 2017 10:19:58 +0100 Subject: [PATCH 0965/3220] sched/fair: Add a backup_cpu to find_best_target Sometimes we find a target cpu but then we do not use it as the energy_diff indicates that we would increase energy usage or not save anything. To offer an additional option for those cases, we return a second option which is what we would have selected if the target CPU had not been found. This gives us another chance to try to save some energy. 
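In code, the caller-side fallback amounts to the following (condensed
from the select_energy_cpu_brute() hunk below; schedstats and locking
elided):

	target_cpu = find_best_target(p, &tmp_backup, boosted, prefer_idle);
	...
	eenv.dst_cpu = target_cpu;
	if (energy_diff(&eenv) >= 0) {
		/* No energy saving for target_cpu, try the backup */
		target_cpu = tmp_backup;
		eenv.dst_cpu = target_cpu;
		if (tmp_backup < 0 || energy_diff(&eenv) >= 0)
			target_cpu = prev_cpu;	/* give up, stay on prev_cpu */
	}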
Change-Id: I42c4f20aba10e4cf65b51ac4153e2e00e534c8c7 Signed-off-by: Chris Redpath Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d1445e7b298a..c193e9b1c38f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6238,7 +6238,8 @@ static int start_cpu(bool boosted) return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu; } -static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) +static inline int find_best_target(struct task_struct *p, int *backup_cpu, + bool boosted, bool prefer_idle) { unsigned long best_idle_min_cap_orig = ULONG_MAX; unsigned long min_util = boosted_task_util(p); @@ -6255,6 +6256,8 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre int target_cpu = -1; int cpu, i; + *backup_cpu = -1; + schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts); schedstat_inc(this_rq(), eas_stats.fbt_attempts); @@ -6489,6 +6492,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre target_cpu = prefer_idle ? best_active_cpu : best_idle_cpu; + else + *backup_cpu = prefer_idle + ? best_active_cpu + : best_idle_cpu; trace_sched_find_best_target(p, prefer_idle, min_util, cpu, best_idle_cpu, best_active_cpu, @@ -6527,7 +6534,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync) { struct sched_domain *sd; - int target_cpu = prev_cpu, tmp_target; + int target_cpu = prev_cpu, tmp_target, tmp_backup; bool boosted, prefer_idle; schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts); @@ -6556,7 +6563,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); /* Find a cpu with sufficient capacity */ - tmp_target = find_best_target(p, boosted, prefer_idle); + tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle); if (!sd) goto unlock; @@ -6585,10 +6592,15 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync } if (energy_diff(&eenv) >= 0) { - schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); - schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); - target_cpu = prev_cpu; - goto unlock; + /* No energy saving for target_cpu, try backup */ + target_cpu = tmp_backup; + eenv.dst_cpu = target_cpu; + if (tmp_backup < 0 || energy_diff(&eenv) >= 0) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); + target_cpu = prev_cpu; + goto unlock; + } } schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav); From 7eeebce62c213a3380c61d0e815ccd7ae1d4b410 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Thu, 27 Jul 2017 23:52:24 +0200 Subject: [PATCH 0966/3220] ANDROID: binder: Don't BUG_ON(!spin_is_locked()). Because is_spin_locked() always returns false on UP systems. Use assert_spin_locked() instead, and remove the WARN_ON() instances, since those were easy to verify. 
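For illustration, on CONFIG_SMP=n kernels the relevant definitions
reduce to roughly the following (paraphrased from the generic UP
spinlock headers, not verbatim):

	/* UP, !CONFIG_DEBUG_SPINLOCK */
	#define arch_spin_is_locked(lock)	((void)(lock), 0)
	#define assert_raw_spin_locked(lock)	do { (void)(lock); } while (0)

	BUG_ON(!spin_is_locked(&proc->inner_lock));	/* BUG_ON(1): fires on every call */
	assert_spin_locked(&proc->inner_lock);		/* no-op on UP, real check on SMP */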
Bug: 64073116 Change-Id: I9080991c6d67e91928282a3ee64db23e50c7d66a Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 29e3e3f21443..4cf8e05c7a03 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1014,7 +1014,7 @@ binder_select_thread_ilocked(struct binder_proc *proc) { struct binder_thread *thread; - BUG_ON(!spin_is_locked(&proc->inner_lock)); + assert_spin_locked(&proc->inner_lock); thread = list_first_entry_or_null(&proc->waiting_threads, struct binder_thread, waiting_thread_node); @@ -1045,7 +1045,7 @@ static void binder_wakeup_thread_ilocked(struct binder_proc *proc, struct binder_thread *thread, bool sync) { - BUG_ON(!spin_is_locked(&proc->inner_lock)); + assert_spin_locked(&proc->inner_lock); if (thread) { if (sync) @@ -1224,7 +1224,7 @@ static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc, struct rb_node *n = proc->nodes.rb_node; struct binder_node *node; - BUG_ON(!spin_is_locked(&proc->inner_lock)); + assert_spin_locked(&proc->inner_lock); while (n) { node = rb_entry(n, struct binder_node, rb_node); @@ -1270,7 +1270,8 @@ static struct binder_node *binder_init_node_ilocked( __u32 flags = fp ? fp->flags : 0; s8 priority; - BUG_ON(!spin_is_locked(&proc->inner_lock)); + assert_spin_locked(&proc->inner_lock); + while (*p) { parent = *p; @@ -1349,9 +1350,9 @@ static int binder_inc_node_nilocked(struct binder_node *node, int strong, { struct binder_proc *proc = node->proc; - BUG_ON(!spin_is_locked(&node->lock)); + assert_spin_locked(&node->lock); if (proc) - BUG_ON(!spin_is_locked(&proc->inner_lock)); + assert_spin_locked(&proc->inner_lock); if (strong) { if (internal) { if (target_list == NULL && @@ -1403,9 +1404,9 @@ static bool binder_dec_node_nilocked(struct binder_node *node, { struct binder_proc *proc = node->proc; - BUG_ON(!spin_is_locked(&node->lock)); + assert_spin_locked(&node->lock); if (proc) - BUG_ON(!spin_is_locked(&proc->inner_lock)); + assert_spin_locked(&proc->inner_lock); if (strong) { if (internal) node->internal_strong_refs--; @@ -1929,7 +1930,7 @@ static void binder_pop_transaction_ilocked(struct binder_thread *target_thread, struct binder_transaction *t) { BUG_ON(!target_thread); - BUG_ON(!spin_is_locked(&target_thread->proc->inner_lock)); + assert_spin_locked(&target_thread->proc->inner_lock); BUG_ON(target_thread->transaction_stack != t); BUG_ON(target_thread->transaction_stack->from != target_thread); target_thread->transaction_stack = @@ -5072,7 +5073,6 @@ static void print_binder_transaction_ilocked(struct seq_file *m, struct binder_proc *to_proc; struct binder_buffer *buffer = t->buffer; - WARN_ON(!spin_is_locked(&proc->inner_lock)); spin_lock(&t->lock); to_proc = t->to_proc; seq_printf(m, @@ -5161,7 +5161,6 @@ static void print_binder_thread_ilocked(struct seq_file *m, size_t start_pos = m->count; size_t header_pos; - WARN_ON(!spin_is_locked(&thread->proc->inner_lock)); seq_printf(m, " thread %d: l %02x need_return %d tr %d\n", thread->pid, thread->looper, thread->looper_need_return, @@ -5198,10 +5197,6 @@ static void print_binder_node_nilocked(struct seq_file *m, struct binder_work *w; int count; - WARN_ON(!spin_is_locked(&node->lock)); - if (node->proc) - WARN_ON(!spin_is_locked(&node->proc->inner_lock)); - count = 0; hlist_for_each_entry(ref, &node->refs, node_entry) count++; @@ -5228,7 +5223,6 @@ static void print_binder_node_nilocked(struct seq_file 
*m, static void print_binder_ref_olocked(struct seq_file *m, struct binder_ref *ref) { - WARN_ON(!spin_is_locked(&ref->proc->outer_lock)); binder_node_lock(ref->node); seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %pK\n", ref->data.debug_id, ref->data.desc, From b315101a358facc225f4bdeff1adc4b0b95f95c9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 6 Jul 2017 19:12:22 -0700 Subject: [PATCH 0967/3220] ANDROID: sdcardfs: override credential for ioctl to lower fs Otherwise, lower_fs->ioctl() fails due to inode_owner_or_capable(). Signed-off-by: Jaegeuk Kim Bug: 63260873 Change-Id: I623a6c7c5f8a3cbd7ec73ef89e18ddb093c43805 --- fs/sdcardfs/file.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 6076c342dae6..5ac0b0bbb0ec 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -104,12 +104,19 @@ static long sdcardfs_unlocked_ioctl(struct file *file, unsigned int cmd, { long err = -ENOTTY; struct file *lower_file; + const struct cred *saved_cred = NULL; + struct dentry *dentry = file->f_path.dentry; + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); lower_file = sdcardfs_lower_file(file); /* XXX: use vfs_ioctl if/when VFS exports it */ if (!lower_file || !lower_file->f_op) goto out; + + /* save current_cred and override it */ + OVERRIDE_CRED(sbi, saved_cred, SDCARDFS_I(file_inode(file))); + if (lower_file->f_op->unlocked_ioctl) err = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); @@ -117,6 +124,7 @@ static long sdcardfs_unlocked_ioctl(struct file *file, unsigned int cmd, if (!err) sdcardfs_copy_and_fix_attrs(file_inode(file), file_inode(lower_file)); + REVERT_CRED(saved_cred); out: return err; } @@ -127,15 +135,23 @@ static long sdcardfs_compat_ioctl(struct file *file, unsigned int cmd, { long err = -ENOTTY; struct file *lower_file; + const struct cred *saved_cred = NULL; + struct dentry *dentry = file->f_path.dentry; + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); lower_file = sdcardfs_lower_file(file); /* XXX: use vfs_ioctl if/when VFS exports it */ if (!lower_file || !lower_file->f_op) goto out; + + /* save current_cred and override it */ + OVERRIDE_CRED(sbi, saved_cred, SDCARDFS_I(file_inode(file))); + if (lower_file->f_op->compat_ioctl) err = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); + REVERT_CRED(saved_cred); out: return err; } From 50d3f7d55a4fa7c46a0fec709faeae636e5841fd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 24 Jun 2016 15:09:37 -0700 Subject: [PATCH 0968/3220] UPSTREAM: Clarify naming of thread info/stack allocators We've had the thread info allocated together with the thread stack for most architectures for a long time (since the thread_info was split off from the task struct), but that is about to change. But the patches that move the thread info to be off-stack (and a part of the task struct instead) made it clear how confused the allocator and freeing functions are. Because the common case was that we share an allocation with the thread stack and the thread_info, the two pointers were identical. That identity then meant that we would have things like ti = alloc_thread_info_node(tsk, node); ... tsk->stack = ti; which certainly _worked_ (since stack and thread_info have the same value), but is rather confusing: why are we assigning a thread_info to the stack? And if we move the thread_info away, the "confusing" code just gets to be entirely bogus. 
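After the rename, the same sequence reads (in the spirit of the
kernel/fork.c hunk below):

	unsigned long *stack = alloc_thread_stack_node(tsk, node);
	...
	tsk->stack = stack;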
So remove all this confusion, and make it clear that we are doing the stack allocation by renaming and clarifying the function names to be about the stack. The fact that the thread_info then shares the allocation is an implementation detail, and not really about the allocation itself. This is a pure renaming and type fix: we pass in the same pointer, it's just that we clarify what the pointer means. The ia64 code that actually only has one single allocation (for all of task_struct, thread_info and kernel thread stack) now looks a bit odd, but since "tsk->stack" is actually not even used there, that oddity doesn't matter. It would be a separate thing to clean that up, I intentionally left the ia64 changes as a pure brute-force renaming and type change. Acked-by: Andy Lutomirski Signed-off-by: Linus Torvalds Bug: 38331309 Change-Id: I870b5476fc900c9145134f9dd3ed18a32a490162 (cherry picked from commit b235beea9e996a4d36fed6cfef4801a3e7d7a9a5) Signed-off-by: Zubin Mithra --- arch/Kconfig | 4 +- arch/ia64/Kconfig | 2 +- arch/ia64/include/asm/thread_info.h | 8 ++-- arch/mn10300/include/asm/thread_info.h | 2 +- arch/mn10300/kernel/kgdb.c | 3 +- arch/tile/include/asm/thread_info.h | 2 +- arch/tile/kernel/process.c | 3 +- include/linux/sched.h | 2 +- init/main.c | 4 +- kernel/fork.c | 52 +++++++++++++------------- 10 files changed, 43 insertions(+), 39 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 98f64ad1caf1..ed2539c590bf 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -225,8 +225,8 @@ config ARCH_INIT_TASK config ARCH_TASK_STRUCT_ALLOCATOR bool -# Select if arch has its private alloc_thread_info() function -config ARCH_THREAD_INFO_ALLOCATOR +# Select if arch has its private alloc_thread_stack() function +config ARCH_THREAD_STACK_ALLOCATOR bool # Select if arch wants to size task_struct dynamically via arch_task_struct_size: diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index eb0249e37981..7534fd34f79d 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -45,7 +45,7 @@ config IA64 select GENERIC_SMP_IDLE_THREAD select ARCH_INIT_TASK select ARCH_TASK_STRUCT_ALLOCATOR - select ARCH_THREAD_INFO_ALLOCATOR + select ARCH_THREAD_STACK_ALLOCATOR select ARCH_CLOCKSOURCE_DATA select GENERIC_TIME_VSYSCALL_OLD select SYSCTL_ARCH_UNALIGN_NO_WARN diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index aa995b67c3f5..d1212b84fb83 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -48,15 +48,15 @@ struct thread_info { #ifndef ASM_OFFSETS_C /* how to get the thread information struct from C */ #define current_thread_info() ((struct thread_info *) ((char *) current + IA64_TASK_SIZE)) -#define alloc_thread_info_node(tsk, node) \ - ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE)) +#define alloc_thread_stack_node(tsk, node) \ + ((unsigned long *) ((char *) (tsk) + IA64_TASK_SIZE)) #define task_thread_info(tsk) ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE)) #else #define current_thread_info() ((struct thread_info *) 0) -#define alloc_thread_info_node(tsk, node) ((struct thread_info *) 0) +#define alloc_thread_stack_node(tsk, node) ((unsigned long *) 0) #define task_thread_info(tsk) ((struct thread_info *) 0) #endif -#define free_thread_info(ti) /* nothing */ +#define free_thread_stack(ti) /* nothing */ #define task_stack_page(tsk) ((void *)(tsk)) #define __HAVE_THREAD_FUNCTIONS diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h index 
4861a78c7160..f5f90bbf019d 100644 --- a/arch/mn10300/include/asm/thread_info.h +++ b/arch/mn10300/include/asm/thread_info.h @@ -115,7 +115,7 @@ static inline unsigned long current_stack_pointer(void) } #ifndef CONFIG_KGDB -void arch_release_thread_info(struct thread_info *ti); +void arch_release_thread_stack(unsigned long *stack); #endif #define get_thread_info(ti) get_task_struct((ti)->task) #define put_thread_info(ti) put_task_struct((ti)->task) diff --git a/arch/mn10300/kernel/kgdb.c b/arch/mn10300/kernel/kgdb.c index 99770823451a..2d7986c386fe 100644 --- a/arch/mn10300/kernel/kgdb.c +++ b/arch/mn10300/kernel/kgdb.c @@ -397,8 +397,9 @@ static bool kgdb_arch_undo_singlestep(struct pt_regs *regs) * single-step state is cleared. At this point the breakpoints should have * been removed by __switch_to(). */ -void arch_release_thread_info(struct thread_info *ti) +void arch_release_thread_stack(unsigned long *stack) { + struct thread_info *ti = (void *)stack; if (kgdb_sstep_thread == ti) { kgdb_sstep_thread = NULL; diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index dc1fb28d9636..489b15016303 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -78,7 +78,7 @@ struct thread_info { #ifndef __ASSEMBLY__ -void arch_release_thread_info(struct thread_info *info); +void arch_release_thread_stack(unsigned long *stack); /* How to get the thread information struct from C. */ register unsigned long stack_pointer __asm__("sp"); diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 7d5769310bef..a97ab1a69a90 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -73,8 +73,9 @@ void arch_cpu_idle(void) /* * Release a thread_info structure */ -void arch_release_thread_info(struct thread_info *info) +void arch_release_thread_stack(unsigned long *stack) { + struct thread_info *info = (void *)stack; struct single_step_state *step_state = info->step_state; if (step_state) { diff --git a/include/linux/sched.h b/include/linux/sched.h index ad2c304b29b8..85cf2d2f02cb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2986,7 +2986,7 @@ static inline int object_is_on_stack(void *obj) return (obj >= stack) && (obj < (stack + THREAD_SIZE)); } -extern void thread_info_cache_init(void); +extern void thread_stack_cache_init(void); #ifdef CONFIG_DEBUG_STACK_USAGE static inline unsigned long stack_not_used(struct task_struct *p) diff --git a/init/main.c b/init/main.c index fbafa271531c..86f5ce9ede86 100644 --- a/init/main.c +++ b/init/main.c @@ -468,7 +468,7 @@ void __init __weak smp_setup_processor_id(void) } # if THREAD_SIZE >= PAGE_SIZE -void __init __weak thread_info_cache_init(void) +void __init __weak thread_stack_cache_init(void) { } #endif @@ -645,7 +645,7 @@ asmlinkage __visible void __init start_kernel(void) /* Should be run before the first non-init thread is created */ init_espfix_bsp(); #endif - thread_info_cache_init(); + thread_stack_cache_init(); cred_init(); fork_init(); proc_caches_init(); diff --git a/kernel/fork.c b/kernel/fork.c index 68cfda1c1800..5ee818516a1c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -147,18 +147,18 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -void __weak arch_release_thread_info(struct thread_info *ti) +void __weak arch_release_thread_stack(unsigned long *stack) { } -#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR +#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, 
otherwise use a * kmemcache based allocator. */ # if THREAD_SIZE >= PAGE_SIZE -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, @@ -167,29 +167,31 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, return page ? page_address(page) : NULL; } -static inline void free_thread_info(struct thread_info *ti) +static inline void free_thread_stack(unsigned long *stack) { - free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + struct page *page = virt_to_page(stack); + + __free_kmem_pages(page, THREAD_SIZE_ORDER); } # else -static struct kmem_cache *thread_info_cache; +static struct kmem_cache *thread_stack_cache; -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static struct thread_info *alloc_thread_stack_node(struct task_struct *tsk, int node) { - return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); + return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_thread_info(struct thread_info *ti) +static void free_stack(unsigned long *stack) { - kmem_cache_free(thread_info_cache, ti); + kmem_cache_free(thread_stack_cache, stack); } -void thread_info_cache_init(void) +void thread_stack_cache_init(void) { - thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, + thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, THREAD_SIZE, 0, NULL); - BUG_ON(thread_info_cache == NULL); + BUG_ON(thread_stack_cache == NULL); } # endif #endif @@ -212,9 +214,9 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -static void account_kernel_stack(struct thread_info *ti, int account) +static void account_kernel_stack(unsigned long *stack, int account) { - struct zone *zone = page_zone(virt_to_page(ti)); + struct zone *zone = page_zone(virt_to_page(stack)); mod_zone_page_state(zone, NR_KERNEL_STACK, account); } @@ -222,8 +224,8 @@ static void account_kernel_stack(struct thread_info *ti, int account) void free_task(struct task_struct *tsk) { account_kernel_stack(tsk->stack, -1); - arch_release_thread_info(tsk->stack); - free_thread_info(tsk->stack); + arch_release_thread_stack(tsk->stack); + free_thread_stack(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -334,7 +336,7 @@ void set_task_stack_end_magic(struct task_struct *tsk) static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; - struct thread_info *ti; + unsigned long *stack; int err; if (node == NUMA_NO_NODE) @@ -343,15 +345,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!tsk) return NULL; - ti = alloc_thread_info_node(tsk, node); - if (!ti) + stack = alloc_thread_stack_node(tsk, node); + if (!stack) goto free_tsk; err = arch_dup_task_struct(tsk, orig); if (err) - goto free_ti; + goto free_stack; - tsk->stack = ti; + tsk->stack = stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -383,12 +385,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - account_kernel_stack(ti, 1); + account_kernel_stack(stack, 1); return tsk; -free_ti: - free_thread_info(ti); +free_stack: + free_thread_stack(stack); free_tsk: 
 	free_task_struct(tsk);
 	return NULL;
 }

From f707c0f98f32e79c148b6c4284fa1681b441ee6d Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Fri, 24 Jun 2016 17:07:33 -0700
Subject: [PATCH 0969/3220] UPSTREAM: fix up initial thread stack pointer vs
 thread_info confusion

The INIT_TASK() initializer was similarly confused about the stack vs
thread_info allocation that the allocators had, and that was fixed in
commit b235beea9e99 ("Clarify naming of thread info/stack allocators").

The task ->stack pointer only incidentally ends up having the same
value as the thread_info, and in fact that will change.

So fix the initial task struct initializer to point to 'init_stack'
instead of 'init_thread_info', and make sure the ia64 definition for
that exists.

This actually makes the ia64 tsk->stack pointer be sensible for the
initial task, but not for any other task. As mentioned in commit
b235beea9e99, that whole pointer isn't actually used on ia64, since
task_stack_page() there just points to the (single) allocation.

All the other architectures seem to have copied the 'init_stack'
definition, even if it tended to be generally unused.

Signed-off-by: Linus Torvalds
Bug: 38331309
Change-Id: Ia96e9225b07e38df2f4af2b9a7eb2aa972d8845a
(cherry picked from commit 7f1a00b6fcd0e3c19beba2e92d157dc0c2cf3494)
Signed-off-by: Zubin Mithra
---
 arch/ia64/kernel/init_task.c | 1 +
 include/linux/init_task.h    | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c
index f9efe9739d3f..0eaa89f3defd 100644
--- a/arch/ia64/kernel/init_task.c
+++ b/arch/ia64/kernel/init_task.c
@@ -26,6 +26,7 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
  * handled. This is done by having a special ".data..init_task" section...
  */
 #define init_thread_info	init_task_mem.s.thread_info
+#define init_stack		init_task_mem.stack
 
 union {
 	struct {
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1c1ff7e4faa4..9a0056499337 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -190,7 +190,7 @@ extern struct task_group root_task_group;
 #define INIT_TASK(tsk)	\
 {									\
 	.state		= 0,						\
-	.stack		= &init_thread_info,				\
+	.stack		= init_stack,					\
 	.usage		= ATOMIC_INIT(2),				\
 	.flags		= PF_KTHREAD,					\
 	.prio		= MAX_PRIO-20,					\

From 242f841e5485d19b39393ef90c1a133f3a7c990f Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 28 Jul 2016 15:48:23 -0700
Subject: [PATCH 0970/3220] UPSTREAM: printk: when dumping regs, show the
 stack, not thread_info

We currently show:

  task: <current> ti: <current_thread_info()> task.ti: <task_thread_info(current)>

"ti" and "task.ti" are redundant, and neither is actually what we want
to show, which is the base of the thread stack. Change the display to
show the stack pointer explicitly.
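For example, the oops header line changes from (pointer values
illustrative; the format strings are those in the hunk below):

  task: ffff880076ba2240 ti: ffff880076bd4000 task.ti: ffff880076bd4000

to:

  task: ffff880076ba2240 task.stack: ffff880076bd4000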
Link: http://lkml.kernel.org/r/543ac5bd66ff94000a57a02e11af7239571a3055.1468523549.git.luto@kernel.org Signed-off-by: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Bug: 38331309 Change-Id: I7d4b915d38770d0c9384695b2064e4c66b22e94e (cherry picked from commit 8b70ca65616b3588ea1907e87f0df6d2530350df) Signed-off-by: Zubin Mithra --- kernel/printk/printk.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1a698158face..b4573b55b435 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3176,9 +3176,8 @@ void show_regs_print_info(const char *log_lvl) { dump_stack_print_info(log_lvl); - printk("%stask: %p ti: %p task.ti: %p\n", - log_lvl, current, current_thread_info(), - task_thread_info(current)); + printk("%stask: %p task.stack: %p\n", + log_lvl, current, task_stack_page(current)); } #endif From 8bc69d462ad300364c836616b249055ca7cb19e9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 13 Sep 2016 14:29:24 -0700 Subject: [PATCH 0971/3220] UPSTREAM: sched/core: Allow putting thread_info into task_struct If an arch opts in by setting CONFIG_THREAD_INFO_IN_TASK_STRUCT, then thread_info is defined as a single 'u32 flags' and is the first entry of task_struct. thread_info::task is removed (it serves no purpose if thread_info is embedded in task_struct), and thread_info::cpu gets its own slot in task_struct. This is heavily based on a patch written by Linus. Originally-from: Linus Torvalds Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a0898196f0476195ca02713691a5037a14f2aac5.1473801993.git.luto@kernel.org Signed-off-by: Ingo Molnar Bug: 38331309 Change-Id: I25e5a830f2ada5e74fa93661e97e5e701b1b70d2 (cherry picked from commit c65eacbe290b8141554c71b2c94489e73ade8c8d) Signed-off-by: Zubin Mithra --- include/linux/init_task.h | 9 +++++++++ include/linux/sched.h | 36 ++++++++++++++++++++++++++++++++++-- include/linux/thread_info.h | 15 +++++++++++++++ init/Kconfig | 7 +++++++ init/init_task.c | 7 +++++-- kernel/sched/sched.h | 4 ++++ 6 files changed, 74 insertions(+), 4 deletions(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 9a0056499337..021b1e9ff6cd 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -15,6 +15,8 @@ #include #include +#include + #ifdef CONFIG_SMP # define INIT_PUSHABLE_TASKS(tsk) \ .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), @@ -183,12 +185,19 @@ extern struct task_group root_task_group; # define INIT_KASAN(tsk) #endif +#ifdef CONFIG_THREAD_INFO_IN_TASK +# define INIT_TASK_TI(tsk) .thread_info = INIT_THREAD_INFO(tsk), +#else +# define INIT_TASK_TI(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) */ #define INIT_TASK(tsk) \ { \ + INIT_TASK_TI(tsk) \ .state = 0, \ .stack = init_stack, \ .usage = ATOMIC_INIT(2), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 85cf2d2f02cb..d8c1b4340283 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1518,6 +1518,13 @@ struct tlbflush_unmap_batch { }; struct task_struct { +#ifdef CONFIG_THREAD_INFO_IN_TASK + /* + * For reasons of header soup (see current_thread_info()), this + * must be the first element of task_struct. 
+ */ + struct thread_info thread_info; +#endif volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ void *stack; atomic_t usage; @@ -1527,6 +1534,9 @@ struct task_struct { #ifdef CONFIG_SMP struct llist_node wake_entry; int on_cpu; +#ifdef CONFIG_THREAD_INFO_IN_TASK + unsigned int cpu; /* current CPU */ +#endif unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; @@ -2556,7 +2566,9 @@ extern void set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { +#ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; +#endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; @@ -2946,10 +2958,26 @@ static inline void threadgroup_change_end(struct task_struct *tsk) cgroup_threadgroup_change_end(tsk); } -#ifndef __HAVE_THREAD_FUNCTIONS +#ifdef CONFIG_THREAD_INFO_IN_TASK + +static inline struct thread_info *task_thread_info(struct task_struct *task) +{ + return &task->thread_info; +} +static inline void *task_stack_page(const struct task_struct *task) +{ + return task->stack; +} +#define setup_thread_stack(new,old) do { } while(0) +static inline unsigned long *end_of_stack(const struct task_struct *task) +{ + return task->stack; +} + +#elif !defined(__HAVE_THREAD_FUNCTIONS) #define task_thread_info(task) ((struct thread_info *)(task)->stack) -#define task_stack_page(task) ((task)->stack) +#define task_stack_page(task) ((void *)(task)->stack) static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) { @@ -3241,7 +3269,11 @@ static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) static inline unsigned int task_cpu(const struct task_struct *p) { +#ifdef CONFIG_THREAD_INFO_IN_TASK + return p->cpu; +#else return task_thread_info(p)->cpu; +#endif } static inline int task_node(const struct task_struct *p) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 4cf89517783a..8784cebd0f51 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -13,6 +13,21 @@ struct timespec; struct compat_timespec; +#ifdef CONFIG_THREAD_INFO_IN_TASK +struct thread_info { + u32 flags; /* low level flags */ +}; + +#define INIT_THREAD_INFO(tsk) \ +{ \ + .flags = 0, \ +} +#endif + +#ifdef CONFIG_THREAD_INFO_IN_TASK +#define current_thread_info() ((struct thread_info *)current) +#endif + /* * System call restart block. */ diff --git a/init/Kconfig b/init/Kconfig index 445af1262134..ae7995c2fce8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -26,6 +26,13 @@ config IRQ_WORK config BUILDTIME_EXTABLE_SORT bool +config THREAD_INFO_IN_TASK + bool + help + Select this to move thread_info off the stack into task_struct. To + make this work, an arch will need to remove all thread_info fields + except flags and fix any runtime bugs. + menu "General setup" config BROKEN diff --git a/init/init_task.c b/init/init_task.c index ba0a7f362d9e..11f83be1fa79 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -22,5 +22,8 @@ EXPORT_SYMBOL(init_task); * Initial thread structure. Alignment of this is handled by a special * linker map entry. 
*/ -union thread_union init_thread_union __init_task_data = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = { +#ifndef CONFIG_THREAD_INFO_IN_TASK + INIT_THREAD_INFO(init_task) +#endif +}; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 430fff0d005d..029cf2bbeda2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1028,7 +1028,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * per-task data have been completed by this moment. */ smp_wmb(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + p->cpu = cpu; +#else task_thread_info(p)->cpu = cpu; +#endif p->wake_cpu = cpu; #endif } From 99cf9fa9a00606ee4d51d876af421362f1818160 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:43 -0700 Subject: [PATCH 0972/3220] UPSTREAM: sched/core: Add try_get_task_stack() and put_task_stack() There are a few places in the kernel that access stack memory belonging to a different task. Before we can start freeing task stacks before the task_struct is freed, we need a way for those code paths to pin the stack. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/17a434f50ad3d77000104f21666575e10a9c1fbd.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar Bug: 38331309 Change-Id: I414853e9b72ecb0967d5e1cbfc77b4929bf3f4f5 (cherry picked from commit c6c314a613cd7d03fb97713e0d642b493de42e69) Signed-off-by: Zubin Mithra --- include/linux/sched.h | 16 ++++++++++++++++ init/Kconfig | 3 +++ 2 files changed, 19 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index d8c1b4340283..0e6744bb2779 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2964,11 +2964,19 @@ static inline struct thread_info *task_thread_info(struct task_struct *task) { return &task->thread_info; } + +/* + * When accessing the stack of a non-current task that might exit, use + * try_get_task_stack() instead. task_stack_page will return a pointer + * that could get freed out from under you. + */ static inline void *task_stack_page(const struct task_struct *task) { return task->stack; } + #define setup_thread_stack(new,old) do { } while(0) + static inline unsigned long *end_of_stack(const struct task_struct *task) { return task->stack; @@ -3004,6 +3012,14 @@ static inline unsigned long *end_of_stack(struct task_struct *p) } #endif + +static inline void *try_get_task_stack(struct task_struct *tsk) +{ + return task_stack_page(tsk); +} + +static inline void put_task_stack(struct task_struct *tsk) {} + #define task_stack_end_corrupted(task) \ (*(end_of_stack(task)) != STACK_END_MAGIC) diff --git a/init/Kconfig b/init/Kconfig index ae7995c2fce8..f5500e552254 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -33,6 +33,9 @@ config THREAD_INFO_IN_TASK make this work, an arch will need to remove all thread_info fields except flags and fix any runtime bugs. + One subtle change that will be needed is to use try_get_task_stack() + and put_task_stack() in save_thread_stack_tsk() and get_wchan(). 
+ menu "General setup" config BROKEN From a960dbced9c6e34bdb1fc5f69d673352d6c4d932 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 15 Sep 2016 22:45:44 -0700 Subject: [PATCH 0973/3220] UPSTREAM: kthread: Pin the stack via try_get_task_stack()/put_task_stack() in to_live_kthread() function get_task_struct(tsk) no longer pins tsk->stack so all users of to_live_kthread() should do try_get_task_stack/put_task_stack to protect "struct kthread" which lives on kthread's stack. TODO: Kill to_live_kthread(), perhaps we can even kill "struct kthread" too, and rework kthread_stop(), it can use task_work_add() to sync with the exiting kernel thread. Message-Id: <20160629180357.GA7178@redhat.com> Signed-off-by: Oleg Nesterov Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/cb9b16bbc19d4aea4507ab0552e4644c1211d130.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar Bug: 38331309 Change-Id: I2872658e56dcb1ab4173c490ef8f52affa54a404 (cherry picked from commit 23196f2e5f5d810578a772785807dcdc2b9fdce9) Signed-off-by: Zubin Mithra --- kernel/kthread.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/kthread.c b/kernel/kthread.c index 698b8dec3074..d9b0be5c6a5f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -65,7 +65,7 @@ static inline struct kthread *to_kthread(struct task_struct *k) static struct kthread *to_live_kthread(struct task_struct *k) { struct completion *vfork = ACCESS_ONCE(k->vfork_done); - if (likely(vfork)) + if (likely(vfork) && try_get_task_stack(k)) return __to_kthread(vfork); return NULL; } @@ -427,8 +427,10 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_live_kthread(k); - if (kthread) + if (kthread) { __kthread_unpark(k, kthread); + put_task_stack(k); + } } EXPORT_SYMBOL_GPL(kthread_unpark); @@ -457,6 +459,7 @@ int kthread_park(struct task_struct *k) wait_for_completion(&kthread->parked); } } + put_task_stack(k); ret = 0; } return ret; @@ -492,6 +495,7 @@ int kthread_stop(struct task_struct *k) __kthread_unpark(k, kthread); wake_up_process(k); wait_for_completion(&kthread->exited); + put_task_stack(k); } ret = k->exit_code; put_task_struct(k); From 264c551c4c77c9645a1c5a03735a71ed37348bc4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 19 Oct 2016 19:28:12 +0100 Subject: [PATCH 0974/3220] UPSTREAM: thread_info: factor out restart_block Since commit f56141e3e2d9aabf ("all arches, signal: move restart_block to struct task_struct"), thread_info and restart_block have been logically distinct, yet struct restart_block is still defined in . At least one architecture (erroneously) uses restart_block as part of its thread_info, and thus the definition of restart_block must come before the include of . Subsequent patches in this series need to shuffle the order of includes and definitions in , and will make this ordering fragile. This patch moves the definition of restart_block out to its own header. This serves as generic cleanup, logically separating thread_info and restart_block, and also makes it easier to avoid fragility. 
Signed-off-by: Mark Rutland Reviewed-by: Andy Lutomirski Cc: Andrew Morton Cc: Heiko Carstens Cc: Kees Cook Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: I4283c87072c092179e2b6c02cbf7248b4a1c2d22 (cherry picked from commit 53d74d056a4e306a72b8883d325b5d853c0618e6) Signed-off-by: Zubin Mithra --- include/linux/restart_block.h | 51 +++++++++++++++++++++++++++++++++++ include/linux/thread_info.h | 41 +--------------------------- 2 files changed, 52 insertions(+), 40 deletions(-) create mode 100644 include/linux/restart_block.h diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h new file mode 100644 index 000000000000..0d905d8ec553 --- /dev/null +++ b/include/linux/restart_block.h @@ -0,0 +1,51 @@ +/* + * Common syscall restarting data + */ +#ifndef __LINUX_RESTART_BLOCK_H +#define __LINUX_RESTART_BLOCK_H + +#include +#include + +struct timespec; +struct compat_timespec; +struct pollfd; + +/* + * System call restart block. + */ +struct restart_block { + long (*fn)(struct restart_block *); + union { + /* For futex_wait and futex_wait_requeue_pi */ + struct { + u32 __user *uaddr; + u32 val; + u32 flags; + u32 bitset; + u64 time; + u32 __user *uaddr2; + } futex; + /* For nanosleep */ + struct { + clockid_t clockid; + struct timespec __user *rmtp; +#ifdef CONFIG_COMPAT + struct compat_timespec __user *compat_rmtp; +#endif + u64 expires; + } nanosleep; + /* For poll */ + struct { + struct pollfd __user *ufds; + int nfds; + int has_timeout; + unsigned long tv_sec; + unsigned long tv_nsec; + } poll; + }; +}; + +extern long do_no_restart_syscall(struct restart_block *parm); + +#endif /* __LINUX_RESTART_BLOCK_H */ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 8784cebd0f51..e8369b0d71e1 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -9,9 +9,7 @@ #include #include - -struct timespec; -struct compat_timespec; +#include #ifdef CONFIG_THREAD_INFO_IN_TASK struct thread_info { @@ -28,43 +26,6 @@ struct thread_info { #define current_thread_info() ((struct thread_info *)current) #endif -/* - * System call restart block. - */ -struct restart_block { - long (*fn)(struct restart_block *); - union { - /* For futex_wait and futex_wait_requeue_pi */ - struct { - u32 __user *uaddr; - u32 val; - u32 flags; - u32 bitset; - u64 time; - u32 __user *uaddr2; - } futex; - /* For nanosleep */ - struct { - clockid_t clockid; - struct timespec __user *rmtp; -#ifdef CONFIG_COMPAT - struct compat_timespec __user *compat_rmtp; -#endif - u64 expires; - } nanosleep; - /* For poll */ - struct { - struct pollfd __user *ufds; - int nfds; - int has_timeout; - unsigned long tv_sec; - unsigned long tv_nsec; - } poll; - }; -}; - -extern long do_no_restart_syscall(struct restart_block *parm); - #include #include From f2b8210f0a7c3f717b82880a1160aaa9255ceecf Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 19 Oct 2016 19:28:13 +0100 Subject: [PATCH 0975/3220] UPSTREAM: thread_info: include for THREAD_INFO_IN_TASK When CONFIG_THREAD_INFO_IN_TASK is selected, the current_thread_info() macro relies on current having been defined prior to its use. However, not all users of current_thread_info() include , and thus current is not guaranteed to be defined. When CONFIG_THREAD_INFO_IN_TASK is not selected, it's possible that get_current() / current are based upon current_thread_info(), and includes . Thus always including would result in circular dependences on some platforms. 
To ensure both cases work, this patch includes , but only when CONFIG_THREAD_INFO_IN_TASK is selected. Signed-off-by: Mark Rutland Acked-by: Heiko Carstens Reviewed-by: Andy Lutomirski Cc: Andrew Morton Cc: Kees Cook Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: Ia981a829798d60a54d4e3eb679d8e24b01228357 (cherry picked from commit dc3d2a679cd8631b8a570fc8ca5f4712d7d25698) Signed-off-by: Zubin Mithra --- include/linux/thread_info.h | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index e8369b0d71e1..8933ecc2bc9f 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -12,17 +12,12 @@ #include #ifdef CONFIG_THREAD_INFO_IN_TASK -struct thread_info { - u32 flags; /* low level flags */ -}; - -#define INIT_THREAD_INFO(tsk) \ -{ \ - .flags = 0, \ -} -#endif - -#ifdef CONFIG_THREAD_INFO_IN_TASK +/* + * For CONFIG_THREAD_INFO_IN_TASK kernels we need for the + * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels, + * including can cause a circular dependency on some platforms. + */ +#include #define current_thread_info() ((struct thread_info *)current) #endif From b4674788d072c3e2036dedbab83d4abd233cc41c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:03 +0000 Subject: [PATCH 0976/3220] UPSTREAM: arm64: thread_info remove stale items We have a comment claiming __switch_to() cares about where cpu_context is located relative to cpu_domain in thread_info. However arm64 has never had a thread_info::cpu_domain field, and neither __switch_to nor cpu_switch_to care where the cpu_context field is relative to others. Additionally, the init_thread_info alias is never used anywhere in the kernel, and will shortly become problematic when thread_info is moved into task_struct. This patch removes both. Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: James Morse Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: Ia4769ddcc6fc556e9eb6193d64fc99fe2d9e39ab (cherry picked from commit dcbe02855f048fdf1e13ebc697e83c8d297f9f5a) Signed-off-by: Zubin Mithra --- arch/arm64/include/asm/thread_info.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 794d22603f04..9224bf2d59d5 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -42,7 +42,6 @@ typedef unsigned long mm_segment_t; /* * low level task data that entry.S needs immediate access to. - * __switch_to() assumes cpu_context follows immediately after cpu_domain. */ struct thread_info { unsigned long flags; /* low level flags */ @@ -63,7 +62,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) /* From d6d869da8de519c04a0c596af796d29f05f79ccc Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:04 +0000 Subject: [PATCH 0977/3220] BACKPORT: arm64: asm-offsets: remove unused definitions Subsequent patches will move the thread_info::{task,cpu} fields, and the current TI_{TASK,CPU} offset definitions are not used anywhere. This patch removes the redundant definitions. Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: James Morse Cc: Will Deacon Signed-off-by: Catalin Marinas This is a modification of Mark Rutland's original patch. Guards to check if CONFIG_THREAD_INFO_IN_TASK is used has been inserted. 
Bug: 38331309 Change-Id: I95903e0f862fc5dcf89e51926afa22389f2f7cee (cherry picked from commit 3fe12da4c7fa6491e0fb7c5371716ac7f8ea80a5) Signed-off-by: Zubin Mithra --- arch/arm64/kernel/asm-offsets.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index b84d8e85d19d..66357a43c097 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -36,8 +36,10 @@ int main(void) DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); +#ifndef CONFIG_THREAD_INFO_IN_TASK DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); +#endif #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct thread_info, ttbr0)); #endif From 725d3aa599931ab3834d379c90a07056e62fa179 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:05 +0000 Subject: [PATCH 0978/3220] UPSTREAM: arm64: factor out current_stack_pointer We define current_stack_pointer in <asm/thread_info.h>, though other files and headers relying upon it do not have this necessary include, and are thus fragile to changes in the header soup. Subsequent patches will affect the header soup such that directly including <asm/thread_info.h> may result in a circular header include in some of these cases, so we can't simply include <asm/thread_info.h>. Instead, factor current_stack_pointer into its own header, and have all existing users include this explicitly. Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: I4d6bc27bef686d0dade1d6abe1ce947cf6c4dfb3 (cherry picked from commit a9ea0017ebe8889dfa136cac2aa7ae0ee6915e1f) Signed-off-by: Zubin Mithra --- arch/arm64/include/asm/percpu.h | 2 ++ arch/arm64/include/asm/perf_event.h | 2 ++ arch/arm64/include/asm/stack_pointer.h | 9 +++++++++ arch/arm64/include/asm/thread_info.h | 6 +----- arch/arm64/kernel/return_address.c | 1 + arch/arm64/kernel/stacktrace.c | 1 + arch/arm64/kernel/traps.c | 1 + 7 files changed, 17 insertions(+), 5 deletions(-) create mode 100644 arch/arm64/include/asm/stack_pointer.h diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h index 8a336852eeba..2ce1a0262a59 100644 --- a/arch/arm64/include/asm/percpu.h +++ b/arch/arm64/include/asm/percpu.h @@ -16,6 +16,8 @@ #ifndef __ASM_PERCPU_H #define __ASM_PERCPU_H +#include <asm/stack_pointer.h> + static inline void set_my_cpu_offset(unsigned long off) { asm volatile("msr tpidr_el1, %0" :: "r" (off) : "memory"); } diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h index 7bd3cdb533ea..91b6be092ce2 100644 --- a/arch/arm64/include/asm/perf_event.h +++ b/arch/arm64/include/asm/perf_event.h @@ -17,6 +17,8 @@ #ifndef __ASM_PERF_EVENT_H #define __ASM_PERF_EVENT_H +#include <asm/stack_pointer.h> + #ifdef CONFIG_PERF_EVENTS struct pt_regs; extern unsigned long perf_instruction_pointer(struct pt_regs *regs); diff --git a/arch/arm64/include/asm/stack_pointer.h b/arch/arm64/include/asm/stack_pointer.h new file mode 100644 index 000000000000..ffcdf742cddf --- /dev/null +++ b/arch/arm64/include/asm/stack_pointer.h @@ -0,0 +1,9 @@ +#ifndef __ASM_STACK_POINTER_H +#define __ASM_STACK_POINTER_H + +/* + * how to get the current stack pointer from C + */ +register unsigned long current_stack_pointer asm ("sp"); + +#endif /* __ASM_STACK_POINTER_H */ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 
9224bf2d59d5..41e4fd78e345 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -36,6 +36,7 @@ struct task_struct; +#include #include typedef unsigned long mm_segment_t; @@ -64,11 +65,6 @@ struct thread_info { #define init_stack (init_thread_union.stack) -/* - * how to get the current stack pointer from C - */ -register unsigned long current_stack_pointer asm ("sp"); - /* * how to get the thread information struct from C */ diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 1718706fde83..12a87f2600f2 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -12,6 +12,7 @@ #include #include +#include #include struct return_address_data { diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index cfd46c227c8c..d4606014b48a 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -22,6 +22,7 @@ #include #include +#include #include /* diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index f5c82c76cf7c..34904e1aec31 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include From 62c30ed34e4fa9fa237276e90d089057d7332dc1 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:06 +0000 Subject: [PATCH 0979/3220] UPSTREAM: arm64: traps: simplify die() and __die() In arm64's die and __die routines we pass around a thread_info, and subsequently use this to determine the relevant task_struct, and the end of the thread's stack. Subsequent patches will decouple thread_info from the stack, and this approach will no longer work. To figure out the end of the stack, we can use the new generic end_of_stack() helper. As we only call __die() from die(), and die() always deals with the current task, we can remove the parameter and have both acquire current directly, which also makes it clear that __die can't be called for arbitrary tasks. 
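[For reference, the generic end_of_stack() helper mentioned above comes from <linux/sched.h> and works in both configurations; roughly, in a simplified sketch that ignores the CONFIG_STACK_GROWSUP and stack-debugging variants:

#ifdef CONFIG_THREAD_INFO_IN_TASK
static inline unsigned long *end_of_stack(const struct task_struct *task)
{
	/* thread_info lives inside task_struct, so the whole stack
	 * allocation is stack; its lowest address is the limit, since
	 * kernel stacks grow downwards here. */
	return task->stack;
}
#else
static inline unsigned long *end_of_stack(struct task_struct *p)
{
	/* thread_info sits at the bottom of the stack allocation, so
	 * the usable stack ends immediately above it. */
	return (unsigned long *)(task_thread_info(p) + 1);
}
#endif

Either way the helper needs only a task_struct, which is exactly why __die() can lose its thread_info parameter.]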
Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: Ie1a96a0a8e244d458a7f147001b64216403e07c4 (cherry picked from commit 876e7a38e8788773aac768091aaa3b42e470c03b) Signed-off-by: Zubin Mithra --- arch/arm64/kernel/traps.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 34904e1aec31..3be84e579a8c 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -235,10 +235,9 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) #endif #define S_SMP " SMP" -static int __die(const char *str, int err, struct thread_info *thread, - struct pt_regs *regs) +static int __die(const char *str, int err, struct pt_regs *regs) { - struct task_struct *tsk = thread->task; + struct task_struct *tsk = current; static int die_counter; int ret; @@ -253,7 +252,8 @@ static int __die(const char *str, int err, struct thread_info *thread, print_modules(); __show_regs(regs); pr_emerg("Process %.*s (pid: %d, stack limit = 0x%p)\n", - TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), thread + 1); + TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), + end_of_stack(tsk)); if (!user_mode(regs) || in_interrupt()) { dump_mem(KERN_EMERG, "Stack: ", regs->sp, @@ -273,7 +273,6 @@ static DEFINE_RAW_SPINLOCK(die_lock); */ void die(const char *str, struct pt_regs *regs, int err) { - struct thread_info *thread = current_thread_info(); int ret; oops_enter(); @@ -281,9 +280,9 @@ void die(const char *str, struct pt_regs *regs, int err) raw_spin_lock_irq(&die_lock); console_verbose(); bust_spinlocks(1); - ret = __die(str, err, thread, regs); + ret = __die(str, err, regs); - if (regs && kexec_should_crash(thread->task)) + if (regs && kexec_should_crash(current)) crash_kexec(regs); bust_spinlocks(0); From f00a4a09f4c3e5ac2580d8588a01f473fea0e554 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:07 +0000 Subject: [PATCH 0980/3220] UPSTREAM: arm64: unexport walk_stackframe The walk_stackframe functions is architecture-specific, with a varying prototype, and common code should not use it directly. None of its current users can be built as modules. With THREAD_INFO_IN_TASK, users will also need to hold a stack reference before calling it. There's no reason for it to be exported, and it's very easy to misuse, so unexport it for now. Signed-off-by: Mark Rutland Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: Ibe0dca36cc7d35f92c6bc13b373755d82f0eb9ef (cherry picked from commit 2020a5ae7c8c2c8504565004915017507b135c63) Signed-off-by: Zubin Mithra --- arch/arm64/kernel/stacktrace.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index d4606014b48a..191cc6cd3b59 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -126,7 +126,6 @@ void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, break; } } -EXPORT_SYMBOL(walk_stackframe); #ifdef CONFIG_STACKTRACE struct stack_trace_data { From 0f9f933796581345224426a06d58edf1bbd5a26b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:08 +0000 Subject: [PATCH 0981/3220] UPSTREAM: arm64: prep stack walkers for THREAD_INFO_IN_TASK When CONFIG_THREAD_INFO_IN_TASK is selected, task stacks may be freed before a task is destroyed. 
To account for this, the stacks are refcounted, and when manipulating the stack of another task, it is necessary to get/put the stack to ensure it isn't freed and/or re-used while we do so. This patch reworks the arm64 stack walking code to account for this. When CONFIG_THREAD_INFO_IN_TASK is not selected these perform no refcounting, and this should only be a structural change that does not affect behaviour. Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: AKASHI Takahiro Cc: Andy Lutomirski Cc: James Morse Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: I89c4f53c4fea0d0be2f88221489c0c7f43366810 (cherry picked from commit 9bbd4c56b0b642f04396da378296e68096d5afca) Signed-off-by: Zubin Mithra --- arch/arm64/kernel/process.c | 20 ++++++++++++++------ arch/arm64/kernel/stacktrace.c | 5 +++++ arch/arm64/kernel/traps.c | 10 ++++++++++ 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index e6afea67f6c1..75dac2c0d437 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -419,27 +419,35 @@ struct task_struct *__switch_to(struct task_struct *prev, unsigned long get_wchan(struct task_struct *p) { struct stackframe frame; - unsigned long stack_page; + unsigned long stack_page, ret = 0; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; + stack_page = (unsigned long)try_get_task_stack(p); + if (!stack_page) + return 0; + frame.fp = thread_saved_fp(p); frame.sp = thread_saved_sp(p); frame.pc = thread_saved_pc(p); #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = p->curr_ret_stack; #endif - stack_page = (unsigned long)task_stack_page(p); do { if (frame.sp < stack_page || frame.sp >= stack_page + THREAD_SIZE || unwind_frame(p, &frame)) - return 0; - if (!in_sched_functions(frame.pc)) - return frame.pc; + goto out; + if (!in_sched_functions(frame.pc)) { + ret = frame.pc; + goto out; + } } while (count ++ < 16); - return 0; + +out: + put_task_stack(p); + return ret; } unsigned long arch_align_stack(unsigned long sp) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 191cc6cd3b59..4faaf1af88fd 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -157,6 +157,9 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) struct stack_trace_data data; struct stackframe frame; + if (!try_get_task_stack(tsk)) + return; + data.trace = trace; data.skip = trace->skip; @@ -178,6 +181,8 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) walk_stackframe(tsk, &frame, save_trace, &data); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; + + put_task_stack(tsk); } void save_stack_trace(struct stack_trace *trace) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 3be84e579a8c..e63708dc7b86 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -150,6 +150,14 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) unsigned long irq_stack_ptr; int skip; + pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); + + if (!tsk) + tsk = current; + + if (!try_get_task_stack(tsk)) + return; + /* * Switching between stacks is valid when tracing current and in * non-preemptible context. 
@@ -220,6 +228,8 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) stack + sizeof(struct pt_regs), false); } } + + put_task_stack(tsk); } void show_stack(struct task_struct *tsk, unsigned long *sp) From 48dd80cb1343a0e535cc6065f078dfde9b60f5ba Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:09 +0000 Subject: [PATCH 0982/3220] BACKPORT: arm64: move sp_el0 and tpidr_el1 into cpu_suspend_ctx When returning from idle, we rely on the fact that thread_info lives at the end of the kernel stack, and restore this by masking the saved stack pointer. Subsequent patches will sever the relationship between the stack and thread_info, and to cater for this we must save/restore sp_el0 explicitly, storing it in cpu_suspend_ctx. As cpu_suspend_ctx must be doubleword aligned, this leaves us with an extra slot in cpu_suspend_ctx. We can use this to save/restore tpidr_el1 in the same way, which simplifies the code, avoiding pointer chasing on the restore path (as we no longer need to load thread_info::cpu followed by the relevant slot in __per_cpu_offset based on this). This patch stashes both registers in cpu_suspend_ctx. Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: James Morse Cc: Lorenzo Pieralisi Cc: Will Deacon Signed-off-by: Catalin Marinas This is a modification of Mark Rutland's original patch. The differences from the original patch are as follows :- - NR_CTX_REGS is set to 13 instead of 12 - x13 and x14 are used as temporary registers to hold sp_el0 and tpidr_el1 instead of x11 and x12. - The values are temporarily stashed at offset 88 and 96 of cpu_suspend_ctx instead of 80 and 88. The original patch would not apply cleanly and these changes were made to resolve this. Bug: 38331309 Change-Id: I4e72aebd51e99d3767487383c14a1ba784312bf1 (cherry picked from commit 623b476fc815464a0241ea7483da7b3580b7d8ac) Signed-off-by: Zubin Mithra --- arch/arm64/include/asm/suspend.h | 2 +- arch/arm64/kernel/sleep.S | 3 --- arch/arm64/kernel/suspend.c | 6 ------ arch/arm64/mm/proc.S | 10 ++++++++-- 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h index 59a5b0f1e81c..4d19a03d316e 100644 --- a/arch/arm64/include/asm/suspend.h +++ b/arch/arm64/include/asm/suspend.h @@ -1,7 +1,7 @@ #ifndef __ASM_SUSPEND_H #define __ASM_SUSPEND_H -#define NR_CTX_REGS 11 +#define NR_CTX_REGS 13 /* * struct cpu_suspend_ctx must be 16-byte aligned since it is allocated on diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index e33fe33876ab..f586f7c875e2 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -173,9 +173,6 @@ ENTRY(cpu_resume) /* load physical address of identity map page table in x1 */ adrp x1, idmap_pg_dir mov sp, x2 - /* save thread_info */ - and x2, x2, #~(THREAD_SIZE - 1) - msr sp_el0, x2 /* * cpu_do_resume expects x0 to contain context physical address * pointer and x1 to contain physical address of 1:1 page tables diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index f42b8b8f1d0a..e7a96462ca2d 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -95,12 +95,6 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) */ cpu_uninstall_idmap(); - /* - * Restore per-cpu offset before any kernel - * subsystem relying on it has a chance to run. 
- */ - set_my_cpu_offset(per_cpu_offset(smp_processor_id())); - /* * PSTATE was not saved over suspend/resume, re-enable any * detected features that might not have been set correctly. diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 85a542b21575..3b3a4710dcd6 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -71,12 +71,15 @@ ENTRY(cpu_do_suspend) mrs x10, mdscr_el1 mrs x11, oslsr_el1 mrs x12, sctlr_el1 + mrs x13, tpidr_el1 + mrs x14, sp_el0 stp x2, x3, [x0] stp x4, x5, [x0, #16] stp x6, x7, [x0, #32] stp x8, x9, [x0, #48] stp x10, x11, [x0, #64] - str x12, [x0, #80] + stp x12, x13, [x0, #80] + str x14, [x0, #96] ret ENDPROC(cpu_do_suspend) @@ -99,7 +102,8 @@ ENTRY(cpu_do_resume) ldp x6, x7, [x0, #32] ldp x8, x9, [x0, #48] ldp x10, x11, [x0, #64] - ldr x12, [x0, #80] + ldp x12, x13, [x0, #80] + ldr x14, [x0, #96] msr tpidr_el0, x2 msr tpidrro_el0, x3 msr contextidr_el1, x4 @@ -111,6 +115,8 @@ msr tcr_el1, x8 msr vbar_el1, x9 msr mdscr_el1, x10 + msr tpidr_el1, x13 + msr sp_el0, x14 /* * Restore oslsr_el1 by writing oslar_el1 */ From d8cd9de39f1d1c16354e3342f27366ab0203474f Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:10 +0000 Subject: [PATCH 0983/3220] UPSTREAM: arm64: smp: prepare for smp_processor_id() rework Subsequent patches will make smp_processor_id() use a percpu variable. This will make smp_processor_id() dependent on the percpu offset, and thus we cannot use smp_processor_id() to figure out what to initialise the offset to. Prepare for this by initialising the percpu offset based on current::cpu, which will work regardless of how smp_processor_id() is implemented. Also, make this relationship obvious by placing this code together at the start of secondary_start_kernel(). Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: I43304d06602216fbb5b968ff83e0face11e238f5 (cherry picked from commit 580efaa7ccfb8c0790dce4396434f0e5ac8d86ee) Signed-off-by: Zubin Mithra --- arch/arm64/kernel/smp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index a84623d91410..8526ff64e4b2 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -135,7 +135,10 @@ static void smp_store_cpu_info(unsigned int cpuid) asmlinkage void secondary_start_kernel(void) { struct mm_struct *mm = &init_mm; - unsigned int cpu = smp_processor_id(); + unsigned int cpu; + + cpu = task_cpu(current); + set_my_cpu_offset(per_cpu_offset(cpu)); /* * All kernel threads share the same mm context; grab a @@ -144,8 +147,6 @@ asmlinkage void secondary_start_kernel(void) atomic_inc(&mm->mm_count); current->active_mm = mm; - set_my_cpu_offset(per_cpu_offset(smp_processor_id())); - /* * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. From f7f69dfb0277528bfcad3c08558dc7d60bc906dd Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:11 +0000 Subject: [PATCH 0984/3220] UPSTREAM: arm64: make cpu number a percpu variable In the absence of CONFIG_THREAD_INFO_IN_TASK, core code maintains thread_info::cpu, and low-level architecture code can access this to build raw_smp_processor_id(). With CONFIG_THREAD_INFO_IN_TASK, core code maintains task_struct::cpu, which for reasons of the header soup is not accessible to low-level arch code. Instead, we can maintain a percpu variable containing the cpu number. 
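[For reference, generic code hides this config split behind task_cpu(); a simplified sketch of the accessor from <linux/sched.h> of this era:

static inline unsigned int task_cpu(const struct task_struct *p)
{
#ifdef CONFIG_THREAD_INFO_IN_TASK
	return p->cpu;			/* cpu now lives in task_struct */
#else
	return task_thread_info(p)->cpu;	/* classic on-stack thread_info */
#endif
}

Low-level arch headers cannot pull in <linux/sched.h> to use this, which is what forces the percpu variable introduced below.]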
For both the old and new implementation of raw_smp_processor_id(), we read a sysreg into a GPR, add an offset, and load the result. As the offset is now larger, it may not be folded into the load, but otherwise the assembly shouldn't change much. Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: James Morse Cc: Suzuki K Poulose Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: I154927b0f9fc0ebbbed88c9958408bbb19cf09de (cherry picked from commit 57c82954e77fa12c1023e87210d2ede77aaa0058) Signed-off-by: Zubin Mithra --- arch/arm64/include/asm/smp.h | 11 ++++++++++- arch/arm64/kernel/smp.c | 5 +++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 2013a4dc5124..d7e851c5bc42 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -16,11 +16,20 @@ #ifndef __ASM_SMP_H #define __ASM_SMP_H +#include <asm/percpu.h> + #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/thread_info.h> -#define raw_smp_processor_id() (current_thread_info()->cpu) +DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); + +/* + * We don't use this_cpu_read(cpu_number) as that has implicit writes to + * preempt_count, and associated (compiler) barriers, that we'd like to avoid + * the expense of. If we're preemptible, the value can be stale at use anyway. + */ +#define raw_smp_processor_id() (*this_cpu_ptr(&cpu_number)) struct seq_file; diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 8526ff64e4b2..0b66134345f7 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -57,6 +57,9 @@ #define CREATE_TRACE_POINTS #include <trace/events/ipi.h> +DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); + /* * as from 2.5, kernels no longer have an init_tasks structure * so we need some other way of telling a new secondary core @@ -608,6 +611,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus) if (max_cpus == 0) break; + per_cpu(cpu_number, cpu) = cpu; + if (cpu == smp_processor_id()) continue; From 1cdfc007f328200a950b65f8ddd69b41cd2fb8fc Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:12 +0000 Subject: [PATCH 0985/3220] UPSTREAM: arm64: assembler: introduce ldr_this_cpu Shortly we will want to load a percpu variable in the return from userspace path. We can save an instruction by folding the addition of the percpu offset into the load instruction, and this patch adds a new helper to do so. At the same time, we clean up this_cpu_ptr for consistency. As with {adr,ldr,str}_l, we change the template to take the destination register first, and name this dst. Secondly, we rename the macro to adr_this_cpu, following the scheme of adr_l, and matching the newly added ldr_this_cpu. 
Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: Ard Biesheuvel Cc: James Morse Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: Iaaf4ea9674ab89289badee216b5305204172895e (cherry picked from commit 1b7e2296a822dfd2349960addc42a139360ce769) Signed-off-by: Zubin Mithra --- arch/arm64/include/asm/assembler.h | 19 +++++++++++++++---- arch/arm64/kernel/entry.S | 2 +- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index d8855ca6068a..e450bb6d21bd 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -223,14 +223,25 @@ lr .req x30 // link register .endm /* + * @dst: Result of per_cpu(sym, smp_processor_id()) * @sym: The name of the per-cpu variable - * @reg: Result of per_cpu(sym, smp_processor_id()) * @tmp: scratch register */ - .macro this_cpu_ptr, sym, reg, tmp - adr_l \reg, \sym + .macro adr_this_cpu, dst, sym, tmp + adr_l \dst, \sym mrs \tmp, tpidr_el1 - add \reg, \reg, \tmp + add \dst, \dst, \tmp + .endm + + /* + * @dst: Result of READ_ONCE(per_cpu(sym, smp_processor_id())) + * @sym: The name of the per-cpu variable + * @tmp: scratch register + */ + .macro ldr_this_cpu dst, sym, tmp + adr_l \dst, \sym + mrs \tmp, tpidr_el1 + ldr \dst, [\dst, \tmp] .endm /* diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 3f9d78612e57..ed2df0570cc8 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -266,7 +266,7 @@ alternative_endif cmp x25, tsk b.ne 9998f - this_cpu_ptr irq_stack, x25, x26 + adr_this_cpu x25, irq_stack, x26 mov x26, #IRQ_STACK_START_SP add x26, x25, x26 From 4ca3c2cf00be2fa1cd9cc576dd53adef96640701 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:13 +0000 Subject: [PATCH 0986/3220] BACKPORT: arm64: split thread_info from task stack This patch moves arm64's struct thread_info from the task stack into task_struct. This protects thread_info from corruption in the case of stack overflows, and makes its address harder to determine if stack addresses are leaked, making a number of attacks more difficult. Precise detection and handling of overflow is left for subsequent patches. Largely, this involves changing code to store the task_struct in sp_el0, and acquire the thread_info from the task struct. Core code now implements current_thread_info(), and as noted in this relies on offsetof(task_struct, thread_info) == 0, enforced by core code. This change means that the 'tsk' register used in entry.S now points to a task_struct, rather than a thread_info as it used to. To make this clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets appropriately updated to account for the structural change. Userspace clobbers sp_el0, and we can no longer restore this from the stack. Instead, the current task is cached in a per-cpu variable that we can safely access from early assembly as interrupts are disabled (and we are thus not preemptible). Both secondary entry and idle are updated to stash the sp and task pointer separately. Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: AKASHI Takahiro Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: James Morse Cc: Kees Cook Cc: Suzuki K Poulose Cc: Will Deacon Signed-off-by: Catalin Marinas This is a modification of Mark Rutland's original patch. Guards to check if CONFIG_THREAD_INFO_IN_TASK is used has been inserted. get_current() for when CONFIG_THREAD_INFO_IN_TASK is not used has been added to arch/arm64/include/asm/current.h. 
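[One subtlety worth spelling out: after this change, core code's current_thread_info() is nothing more than a cast of the task pointer (see the <linux/thread_info.h> hunk earlier in this series: #define current_thread_info() ((struct thread_info *)current)). That cast is sound only because thread_info is the first member of task_struct, which core code enforces at build time. A sketch of such an assertion; the function name is hypothetical, and the real check lives in core kernel code:

static void __init thread_info_layout_check(void)
{
	/* current_thread_info() casts task_struct * to thread_info *,
	 * which is only valid while thread_info stays at offset 0. */
	BUILD_BUG_ON(offsetof(struct task_struct, thread_info) != 0);
}]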
Bug: 38331309 Change-Id: Ic5eae344a7c2baea0864f6ae16be1e9c60c0a74a (cherry picked from commit c02433dd6de32f042cf3ffe476746b1115b8c096) Signed-off-by: Zubin Mithra --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/current.h | 27 +++++++++++++ arch/arm64/include/asm/smp.h | 3 ++ arch/arm64/include/asm/thread_info.h | 16 +++++++- arch/arm64/kernel/asm-offsets.c | 14 +++++-- arch/arm64/kernel/entry.S | 58 ++++++++++++++++++++++++++-- arch/arm64/kernel/head.S | 18 ++++++++- arch/arm64/kernel/process.c | 22 +++++++++++ arch/arm64/kernel/smp.c | 6 +++ 9 files changed, 156 insertions(+), 9 deletions(-) create mode 100644 arch/arm64/include/asm/current.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 35be8566140e..da007c26de52 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -98,6 +98,7 @@ config ARM64 select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE select HAVE_CONTEXT_TRACKING + select THREAD_INFO_IN_TASK help ARM 64-bit (AArch64) Linux support. diff --git a/arch/arm64/include/asm/current.h b/arch/arm64/include/asm/current.h new file mode 100644 index 000000000000..2e61d21294ba --- /dev/null +++ b/arch/arm64/include/asm/current.h @@ -0,0 +1,27 @@ +#ifndef __ASM_CURRENT_H +#define __ASM_CURRENT_H + +#include + +#include + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_THREAD_INFO_IN_TASK +struct task_struct; + +static __always_inline struct task_struct *get_current(void) +{ + return (struct task_struct *)read_sysreg(sp_el0); +} +#define current get_current() +#else +#include +#define get_current() (current_thread_info()->task) +#define current get_current() +#endif + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_CURRENT_H */ + diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index d7e851c5bc42..a05033beb2a2 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -66,6 +66,9 @@ asmlinkage void secondary_start_kernel(void); */ struct secondary_data { void *stack; +#ifdef CONFIG_THREAD_INFO_IN_TASK + struct task_struct *task; +#endif }; extern struct secondary_data secondary_data; extern void secondary_entry(void); diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 41e4fd78e345..ec4f8c04aeda 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -47,14 +47,25 @@ typedef unsigned long mm_segment_t; struct thread_info { unsigned long flags; /* low level flags */ mm_segment_t addr_limit; /* address limit */ +#ifndef CONFIG_THREAD_INFO_IN_TASK struct task_struct *task; /* main task structure */ +#endif #ifdef CONFIG_ARM64_SW_TTBR0_PAN u64 ttbr0; /* saved TTBR0_EL1 */ #endif int preempt_count; /* 0 => preemptable, <0 => bug */ +#ifndef CONFIG_THREAD_INFO_IN_TASK int cpu; /* cpu */ +#endif }; +#ifdef CONFIG_THREAD_INFO_IN_TASK +#define INIT_THREAD_INFO(tsk) \ +{ \ + .preempt_count = INIT_PREEMPT_COUNT, \ + .addr_limit = KERNEL_DS, \ +} +#else #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ @@ -63,8 +74,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_stack (init_thread_union.stack) - /* * how to get the thread information struct from C */ @@ -81,6 +90,9 @@ static inline struct thread_info *current_thread_info(void) return (struct thread_info *)sp_el0; } +#endif + +#define init_stack (init_thread_union.stack) #define thread_saved_pc(tsk) \ ((unsigned long)(tsk->thread.cpu_context.pc)) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 66357a43c097..24e65f0897ee 100644 --- 
a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -33,12 +33,15 @@ int main(void) { DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); BLANK(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); + DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); + DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit)); + DEFINE(TSK_STACK, offsetof(struct task_struct, stack)); +#else DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); -#ifndef CONFIG_THREAD_INFO_IN_TASK - DEFINE(TI_TASK, offsetof(struct thread_info, task)); - DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); #endif #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct thread_info, ttbr0)); @@ -113,6 +116,11 @@ int main(void) DEFINE(TZ_MINWEST, offsetof(struct timezone, tz_minuteswest)); DEFINE(TZ_DSTTIME, offsetof(struct timezone, tz_dsttime)); BLANK(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack)); + DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); + BLANK(); +#endif #ifdef CONFIG_KVM_ARM_HOST DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); DEFINE(CPU_GP_REGS, offsetof(struct kvm_cpu_context, gp_regs)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index ed2df0570cc8..dba3aceaed2f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -93,9 +93,14 @@ .if \el == 0 mrs x21, sp_el0 +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr_this_cpu tsk, __entry_task, x20 // Ensure MDSCR_EL1.SS is clear, + ldr x19, [tsk, #TSK_TI_FLAGS] // since we can unmask debug +#else mov tsk, sp and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear, ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug +#endif disable_step_tsk x19, x20 // exceptions when scheduling. mov x29, xzr // fp pointed to user-space @@ -103,10 +108,18 @@ add x21, sp, #S_FRAME_SIZE get_thread_info tsk /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */ +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x20, [tsk, #TSK_TI_ADDR_LIMIT] +#else ldr x20, [tsk, #TI_ADDR_LIMIT] +#endif str x20, [sp, #S_ORIG_ADDR_LIMIT] mov x20, #TASK_SIZE_64 +#ifdef CONFIG_THREAD_INFO_IN_TASK + str x20, [tsk, #TSK_TI_ADDR_LIMIT] +#else str x20, [tsk, #TI_ADDR_LIMIT] +#endif ALTERNATIVE(nop, SET_PSTATE_UAO(0), ARM64_HAS_UAO, CONFIG_ARM64_UAO) .endif /* \el == 0 */ mrs x22, elr_el1 @@ -168,7 +181,11 @@ alternative_else_nop_endif .if \el != 0 /* Restore the task's original addr_limit. */ ldr x20, [sp, #S_ORIG_ADDR_LIMIT] +#ifdef CONFIG_THREAD_INFO_IN_TASK + str x20, [tsk, #TSK_TI_ADDR_LIMIT] +#else str x20, [tsk, #TI_ADDR_LIMIT] +#endif /* No need to restore UAO, it will be restored from SPSR_EL1 */ .endif @@ -258,13 +275,20 @@ alternative_endif mov x19, sp // preserve the original sp /* - * Compare sp with the current thread_info, if the top - * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and - * should switch to the irq stack. + * Compare sp with the base of the task stack. + * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack, + * and should switch to the irq stack. 
*/ +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x25, [tsk, TSK_STACK] + eor x25, x25, x19 + and x25, x25, #~(THREAD_SIZE - 1) + cbnz x25, 9998f +#else and x25, x19, #~(THREAD_SIZE - 1) cmp x25, tsk b.ne 9998f +#endif adr_this_cpu x25, irq_stack, x26 mov x26, #IRQ_STACK_START_SP @@ -493,9 +517,17 @@ el1_irq: irq_handler #ifdef CONFIG_PREEMPT +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr w24, [tsk, #TSK_TI_PREEMPT] // get preempt count +#else ldr w24, [tsk, #TI_PREEMPT] // get preempt count +#endif cbnz w24, 1f // preempt count != 0 +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x0, [tsk, #TSK_TI_FLAGS] // get flags +#else ldr x0, [tsk, #TI_FLAGS] // get flags +#endif tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling? bl el1_preempt 1: @@ -510,7 +542,11 @@ ENDPROC(el1_irq) el1_preempt: mov x24, lr 1: bl preempt_schedule_irq // irq en/disable is done inside +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x0, [tsk, #TSK_TI_FLAGS] // get new tasks TI_FLAGS +#else ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS +#endif tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling? ret x24 #endif @@ -730,8 +766,12 @@ ENTRY(cpu_switch_to) ldp x29, x9, [x8], #16 ldr lr, [x8] mov sp, x9 +#ifdef CONFIG_THREAD_INFO_IN_TASK + msr sp_el0, x1 +#else and x9, x9, #~(THREAD_SIZE - 1) msr sp_el0, x9 +#endif ret ENDPROC(cpu_switch_to) @@ -742,7 +782,11 @@ ENDPROC(cpu_switch_to) ret_fast_syscall: disable_irq // disable interrupts str x0, [sp, #S_X0] // returned x0 +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing +#else ldr x1, [tsk, #TI_FLAGS] // re-check for syscall tracing +#endif and x2, x1, #_TIF_SYSCALL_WORK cbnz x2, ret_fast_syscall_trace and x2, x1, #_TIF_WORK_MASK @@ -774,7 +818,11 @@ work_resched: */ ret_to_user: disable_irq // disable interrupts +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x1, [tsk, #TSK_TI_FLAGS] +#else ldr x1, [tsk, #TI_FLAGS] +#endif and x2, x1, #_TIF_WORK_MASK cbnz x2, work_pending enable_step_tsk x1, x2 @@ -806,7 +854,11 @@ el0_svc_naked: // compat entry point enable_dbg_and_irq ct_user_exit 1 +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x16, [tsk, #TSK_TI_FLAGS] // check for syscall hooks +#else ldr x16, [tsk, #TI_FLAGS] // check for syscall hooks +#endif tst x16, #_TIF_SYSCALL_WORK b.ne __sys_trace cmp scno, sc_nr // check upper syscall limit diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 99710399aa38..16d0820fff3a 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -424,6 +424,7 @@ kernel_img_size: .set initial_sp, init_thread_union + THREAD_START_SP __mmap_switched: mov x28, lr // preserve LR + adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address isb @@ -474,10 +475,18 @@ __mmap_switched: dsb sy // with MMU off #endif +#ifdef CONFIG_THREAD_INFO_IN_TASK + adrp x4, init_thread_union + add sp, x4, #THREAD_SIZE + adr_l x5, init_task + msr sp_el0, x5 // Save thread_info +#else adr_l sp, initial_sp, x4 mov x4, sp and x4, x4, #~(THREAD_SIZE - 1) msr sp_el0, x4 // Save thread_info +#endif + str_l x21, __fdt_pointer, x5 // Save FDT pointer ldr_l x4, kimage_vaddr // Save the offset between @@ -689,11 +698,18 @@ ENTRY(__secondary_switched) adr_l x5, vectors msr vbar_el1, x5 isb - +#ifdef CONFIG_THREAD_INFO_IN_TASK + adr_l x0, secondary_data + ldr x1, [x0, #CPU_BOOT_STACK] // get secondary_data.stack + mov sp, x1 + ldr x2, [x0, #CPU_BOOT_TASK] + msr sp_el0, x2 +#else ldr_l x0, secondary_data // get secondary_data.stack mov sp, x0 and x0, x0, #~(THREAD_SIZE - 1) msr sp_el0, x0 // save 
thread_info +#endif mov x29, #0 b secondary_start_kernel ENDPROC(__secondary_switched) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 75dac2c0d437..e34bcf3f2c35 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -45,6 +45,9 @@ #include #include #include +#ifdef CONFIG_THREAD_INFO_IN_TASK +#include +#endif #include #include @@ -390,6 +393,22 @@ void uao_thread_switch(struct task_struct *next) } } +#ifdef CONFIG_THREAD_INFO_IN_TASK +/* + * We store our current task in sp_el0, which is clobbered by userspace. Keep a + * shadow copy so that we can restore this upon entry from userspace. + * + * This is *only* for exception entry from EL0, and is not valid until we + * __switch_to() a user task. + */ +DEFINE_PER_CPU(struct task_struct *, __entry_task); + +static void entry_task_switch(struct task_struct *next) +{ + __this_cpu_write(__entry_task, next); +} +#endif + /* * Thread switching. */ @@ -402,6 +421,9 @@ struct task_struct *__switch_to(struct task_struct *prev, tls_thread_switch(next); hw_breakpoint_thread_switch(next); contextidr_thread_switch(next); +#ifdef CONFIG_THREAD_INFO_IN_TASK + entry_task_switch(next); +#endif uao_thread_switch(next); /* diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 0b66134345f7..ac899acec6eb 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -98,6 +98,9 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) * We need to tell the secondary core where to find its stack and the * page tables. */ +#ifdef CONFIG_THREAD_INFO_IN_TASK + secondary_data.task = idle; +#endif secondary_data.stack = task_stack_page(idle) + THREAD_START_SP; __flush_dcache_area(&secondary_data, sizeof(secondary_data)); @@ -121,6 +124,9 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) pr_err("CPU%u: failed to boot: %d\n", cpu, ret); } +#ifdef CONFIG_THREAD_INFO_IN_TASK + secondary_data.task = NULL; +#endif secondary_data.stack = NULL; return ret; From b1c79e32b7a8b42487f4f4fe970e5bc7d5dd48a9 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Thu, 10 Aug 2017 10:13:39 -0700 Subject: [PATCH 0987/3220] android: configs: move quota-related configs to recommended Bug: 64468882 Change-Id: Ifdd59c83ca52ecaca00ddcea6a003a2611bf8694 Signed-off-by: Steve Muckle --- android/configs/android-base.cfg | 5 ----- android/configs/android-recommended.cfg | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 28dce6c0516d..48b2cdbe8d49 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -139,11 +139,6 @@ CONFIG_PPP_DEFLATE=y CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y CONFIG_PROFILING=y -CONFIG_QFMT_V2=y -CONFIG_QUOTA=y -CONFIG_QUOTACTL=y -CONFIG_QUOTA_NETLINK_INTERFACE=y -CONFIG_QUOTA_TREE=y CONFIG_RANDOMIZE_BASE=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index eecf8d80453a..3d7e5e168940 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -109,6 +109,11 @@ CONFIG_POWER_SUPPLY=y CONFIG_PSTORE=y CONFIG_PSTORE_CONSOLE=y CONFIG_PSTORE_RAM=y +CONFIG_QFMT_V2=y +CONFIG_QUOTA=y +CONFIG_QUOTACTL=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +CONFIG_QUOTA_TREE=y CONFIG_SCHEDSTATS=y CONFIG_SMARTJOYPLUS_FF=y CONFIG_SND=y From 4d666b500c2de3edc65e2ab5d5fe36780c230604 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Thu, 10 Aug 2017 12:32:00 +0200 Subject: 
[PATCH 0988/3220] ANDROID: binder: call poll_wait() unconditionally. Because we're not guaranteed that subsequent calls to poll() will have a poll_table_struct parameter with _qproc set. When _qproc is not set, poll_wait() is a noop, and we won't be woken up correctly. Bug: 64552728 Change-Id: I5b904c9886b6b0994d1631a636f5c5e5f6327950 Test: binderLibTest stops hanging with new test Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 4cf8e05c7a03..ea718f2995d5 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3802,12 +3802,6 @@ static void binder_stat_br(struct binder_proc *proc, } } -static int binder_has_thread_work(struct binder_thread *thread) -{ - return !binder_worklist_empty(thread->proc, &thread->todo) || - thread->looper_need_return; -} - static int binder_put_node_cmd(struct binder_proc *proc, struct binder_thread *thread, void __user **ptrp, @@ -4438,12 +4432,9 @@ static unsigned int binder_poll(struct file *filp, binder_inner_proc_unlock(thread->proc); - if (binder_has_work(thread, wait_for_proc_work)) - return POLLIN; - poll_wait(filp, &thread->wait, wait); - if (binder_has_thread_work(thread)) + if (binder_has_work(thread, wait_for_proc_work)) return POLLIN; return 0; From c825eca5ec45a467e78e66425fadfb001a8199df Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Thu, 10 Aug 2017 13:50:52 +0200 Subject: [PATCH 0989/3220] ANDROID: binder: don't enqueue death notifications to thread todo. This allows userspace to request death notifications without having to worry about getting an immediate callback on the same thread; one scenario where this would be problematic is if the death recipient handler grabs a lock that was already taken earlier (eg as part of a nested transaction). Bug: 23525545 Test: binderLibTest.DeathNotificationThread passes Change-Id: I955e16306fe3110dacb9a391ffff1bf869249495 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index ea718f2995d5..061a7258779e 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3670,22 +3670,12 @@ static int binder_thread_write(struct binder_proc *proc, ref->death = death; if (ref->node->proc == NULL) { ref->death->work.type = BINDER_WORK_DEAD_BINDER; - if (thread->looper & - (BINDER_LOOPER_STATE_REGISTERED | - BINDER_LOOPER_STATE_ENTERED)) - binder_enqueue_work( - proc, - &ref->death->work, - &thread->todo); - else { - binder_inner_proc_lock(proc); - binder_enqueue_work_ilocked( - &ref->death->work, - &proc->todo); - binder_wakeup_proc_ilocked( - proc); - binder_inner_proc_unlock(proc); - } + + binder_inner_proc_lock(proc); + binder_enqueue_work_ilocked( + &ref->death->work, &proc->todo); + binder_wakeup_proc_ilocked(proc); + binder_inner_proc_unlock(proc); } } else { if (ref->death == NULL) { From 6f227409a1797b448402a3ebf7523a229b8b6cbd Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Thu, 10 Aug 2017 13:56:16 +0200 Subject: [PATCH 0990/3220] ANDROID: binder: don't queue async transactions to thread. 
This can cause issues with processes using the poll() interface: 1) client sends two oneway transactions 2) the second one gets queued on async_todo (because the server didn't handle the first one yet) 3) server returns from poll(), picks up the first transaction and does transaction work 4) server is done with the transaction, sends BC_FREE_BUFFER, and the second transaction gets moved to thread->todo 5) libbinder's handlePolledCommands() only handles the commands in the current data buffer, so doesn't see the new transaction 6) the server continues running and issues a new outgoing transaction. Now, it suddenly finds the incoming oneway transaction on its thread todo, and returns that to userspace. 7) userspace does not expect this to happen; it may be holding a lock while making the outgoing transaction, and if handling the incoming transaction requires taking the same lock, userspace will deadlock. By queueing the async transaction to the proc workqueue, we make sure it's only picked up when a thread is ready for proc work. Bug: 38201220 Bug: 63075553 Bug: 63079216 Change-Id: I84268cc112f735d7e3173793873dfdb4b268468b Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 061a7258779e..bfdd52ea0d1c 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3522,11 +3522,13 @@ static int binder_thread_write(struct binder_proc *proc, BUG_ON(buf_node->proc != proc); w = binder_dequeue_work_head_ilocked( &buf_node->async_todo); - if (!w) + if (!w) { buf_node->has_async_transaction = 0; - else + } else { binder_enqueue_work_ilocked( - w, &thread->todo); + w, &proc->todo); + binder_wakeup_proc_ilocked(proc); + } binder_node_inner_unlock(buf_node); } trace_binder_transaction_buffer_release(buffer); From eac37ad2df7ecf81b8dba0b64e9ed52351f73d63 Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Wed, 26 Jul 2017 12:14:41 -0700 Subject: [PATCH 0991/3220] ANDROID: keychord: Fix a slab out-of-bounds read. Fix a slab out of bounds read in keychord_write(), detected by KASAN. Signed-off-by: Mohan Srinivasan Bug: 63962952 Change-Id: Iafef48b5d7283750ac0f39f5aaa767b1c3bf2004 (cherry picked from commit 913d980e07d84a843f5323acc55d185212a2abec) --- drivers/input/misc/keychord.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/drivers/input/misc/keychord.c b/drivers/input/misc/keychord.c index a5ea27ad0e16..f148b937b4e5 100644 --- a/drivers/input/misc/keychord.c +++ b/drivers/input/misc/keychord.c @@ -232,9 +232,11 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, { struct keychord_device *kdev = file->private_data; struct input_keychord *keychords = 0; - struct input_keychord *keychord, *next, *end; + struct input_keychord *keychord; int ret, i, key; unsigned long flags; + size_t resid = count; + size_t key_bytes; if (count < sizeof(struct input_keychord)) return -EINVAL; @@ -265,15 +267,29 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, kdev->head = kdev->tail = 0; keychord = keychords; - end = (struct input_keychord *)((char *)keychord + count); - while (keychord < end) { - next = NEXT_KEYCHORD(keychord); - if (keychord->count <= 0 || next > end) { + while (resid > 0) { + /* Is the entire keychord entry header present ? 
*/ + if (resid < sizeof(struct input_keychord)) { + pr_err("keychord: Insufficient bytes present for header %lu\n", + resid); + goto err_unlock_return; + } + resid -= sizeof(struct input_keychord); + if (keychord->count <= 0) { pr_err("keychord: invalid keycode count %d\n", keychord->count); goto err_unlock_return; } + key_bytes = keychord->count * sizeof(keychord->keycodes[0]); + /* Do we have all the expected keycodes ? */ + if (resid < key_bytes) { + pr_err("keychord: Insufficient bytes present for keycount %lu\n", + resid); + goto err_unlock_return; + } + resid -= key_bytes; + if (keychord->version != KEYCHORD_VERSION) { pr_err("keychord: unsupported version %d\n", keychord->version); @@ -292,7 +308,7 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, } kdev->keychord_count++; - keychord = next; + keychord = NEXT_KEYCHORD(keychord); } kdev->keychords = keychords; From dd5826152c53c1ea77986c1f3d99050d538e65e3 Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Thu, 27 Jul 2017 11:30:32 -0700 Subject: [PATCH 0992/3220] Use %zu to print resid (size_t). Print resid (size_t) portably. Signed-off-by: Mohan Srinivasan Change-Id: Ic5c9dc498bfeef2be21594ec5efd45a98a3c4b4d (cherry picked from commit a1e4c795e1b6de6b34b8cbc75499d1675608c36b) --- drivers/input/misc/keychord.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/input/misc/keychord.c b/drivers/input/misc/keychord.c index f148b937b4e5..c5ab3ddda456 100644 --- a/drivers/input/misc/keychord.c +++ b/drivers/input/misc/keychord.c @@ -271,7 +271,7 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, while (resid > 0) { /* Is the entire keychord entry header present ? */ if (resid < sizeof(struct input_keychord)) { - pr_err("keychord: Insufficient bytes present for header %lu\n", + pr_err("keychord: Insufficient bytes present for header %zu\n", resid); goto err_unlock_return; } @@ -284,7 +284,7 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, key_bytes = keychord->count * sizeof(keychord->keycodes[0]); /* Do we have all the expected keycodes ? */ if (resid < key_bytes) { - pr_err("keychord: Insufficient bytes present for keycount %lu\n", + pr_err("keychord: Insufficient bytes present for keycount %zu\n", resid); goto err_unlock_return; } From 462acca2816ee795fde81bdd99ca6e10bfe00d27 Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Wed, 9 Aug 2017 12:16:56 -0700 Subject: [PATCH 0993/3220] ANDROID: keychord: Fix races in keychord_write. There are multiple bugs caused by threads racing in keychord_write. 1) Threads racing through this function can cause the same element to be added to a linked list twice (multiple calls to input_register_handler() for the same input_handler struct). And the races can also cause an element in a linked list that doesn't exist attempted to be removed (multiple calls to input_unregister_handler() with the same input_handler struct). 2) The races can also cause duplicate kfree's of the keychords struct. 
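[The fix below serializes writers with a KEYCHORD_BUSY bit guarded by the driver's existing spinlock plus a waitqueue; as the comment in the patch notes, mutex_lock_interruptible() would cover this use case equally well. A sketch of that alternative shape; this is hypothetical, not what was merged, and do_keychord_write() stands in for the existing function body:

/* Hypothetical alternative: one mutex serializing keychord_write(). */
static DEFINE_MUTEX(keychord_write_mutex);

static ssize_t keychord_write(struct file *file, const char __user *buffer,
			      size_t count, loff_t *ppos)
{
	ssize_t ret;

	/* Sleeps until the previous writer finishes, but can still be
	 * interrupted by a signal, like the merged busy-bit lock. */
	if (mutex_lock_interruptible(&keychord_write_mutex))
		return -ERESTARTSYS;
	ret = do_keychord_write(file, buffer, count);
	mutex_unlock(&keychord_write_mutex);
	return ret;
}

Either shape closes the same window: only one thread at a time may tear down and re-register the input handler or swap kdev->keychords.]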
Bug: 64133562 Bug: 63974334 Change-Id: I6329a4d58c665fab5d3e96ef96391e07b4941e80 Signed-off-by: Mohan Srinivasan (cherry picked from commit 59584701f1e2ce8ce024570576b206bea6ac69cf) --- drivers/input/misc/keychord.c | 61 ++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/drivers/input/misc/keychord.c b/drivers/input/misc/keychord.c index c5ab3ddda456..1673e4239403 100644 --- a/drivers/input/misc/keychord.c +++ b/drivers/input/misc/keychord.c @@ -60,6 +60,10 @@ struct keychord_device { unsigned char head; unsigned char tail; __u16 buff[BUFFER_SIZE]; + /* Bit to serialize writes to this device */ +#define KEYCHORD_BUSY 0x01 + unsigned long flags; + wait_queue_head_t write_waitq; }; static int check_keychord(struct keychord_device *kdev, @@ -172,7 +176,6 @@ static int keychord_connect(struct input_handler *handler, goto err_input_open_device; pr_info("keychord: using input dev %s for fevent\n", dev->name); - return 0; err_input_open_device: @@ -224,6 +227,41 @@ static ssize_t keychord_read(struct file *file, char __user *buffer, return count; } +/* + * serializes writes on a device. can use mutex_lock_interruptible() + * for this particular use case as well - a matter of preference. + */ +static int +keychord_write_lock(struct keychord_device *kdev) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&kdev->lock, flags); + while (kdev->flags & KEYCHORD_BUSY) { + spin_unlock_irqrestore(&kdev->lock, flags); + ret = wait_event_interruptible(kdev->write_waitq, + ((kdev->flags & KEYCHORD_BUSY) == 0)); + if (ret) + return ret; + spin_lock_irqsave(&kdev->lock, flags); + } + kdev->flags |= KEYCHORD_BUSY; + spin_unlock_irqrestore(&kdev->lock, flags); + return 0; +} + +static void +keychord_write_unlock(struct keychord_device *kdev) +{ + unsigned long flags; + + spin_lock_irqsave(&kdev->lock, flags); + kdev->flags &= ~KEYCHORD_BUSY; + spin_unlock_irqrestore(&kdev->lock, flags); + wake_up_interruptible(&kdev->write_waitq); +} + /* * keychord_write is used to configure the driver */ @@ -250,6 +288,22 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, return -EFAULT; } + /* + * Serialize writes to this device to prevent various races. + * 1) writers racing here could do duplicate input_unregister_handler() + * calls, resulting in attempting to unlink a node from a list that + * does not exist. + * 2) writers racing here could do duplicate input_register_handler() calls + * below, resulting in a duplicate insertion of a node into the list. + * 3) a double kfree of keychords can occur (in the event that + * input_register_handler() fails below. 
+ */ + ret = keychord_write_lock(kdev); + if (ret) { + kfree(keychords); + return ret; + } + /* unregister handler before changing configuration */ if (kdev->registered) { input_unregister_handler(&kdev->input_handler); @@ -318,15 +372,19 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, if (ret) { kfree(keychords); kdev->keychords = 0; + keychord_write_unlock(kdev); return ret; } kdev->registered = 1; + keychord_write_unlock(kdev); + return count; err_unlock_return: spin_unlock_irqrestore(&kdev->lock, flags); kfree(keychords); + keychord_write_unlock(kdev); return -EINVAL; } @@ -352,6 +410,7 @@ static int keychord_open(struct inode *inode, struct file *file) spin_lock_init(&kdev->lock); init_waitqueue_head(&kdev->waitq); + init_waitqueue_head(&kdev->write_waitq); kdev->input_handler.event = keychord_event; kdev->input_handler.connect = keychord_connect; From 60366263e6aeac89f9c92978d075529ef3303c0a Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Wed, 9 Aug 2017 12:36:33 -0700 Subject: [PATCH 0994/3220] ANDROID: keychord: Fix for a memory leak in keychord. Fixes a steady memory leak in the keychord release code. A close of the keychord device will leak 1 keychord structure. Easily reproducible by a simple program that does an open()->write()->close() of the keychord device. Bug: 64483974 Change-Id: I1fa402c666cffb00b8cfd6379d9fe47a0989152c Signed-off-by: Mohan Srinivasan (cherry picked from commit 72a8dae2c25d0277e48672ee85b70236268add01) --- drivers/input/misc/keychord.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/misc/keychord.c b/drivers/input/misc/keychord.c index 1673e4239403..fdcc14653b64 100644 --- a/drivers/input/misc/keychord.c +++ b/drivers/input/misc/keychord.c @@ -432,6 +432,7 @@ static int keychord_release(struct inode *inode, struct file *file) if (kdev->registered) input_unregister_handler(&kdev->input_handler); + kfree(kdev->keychords); kfree(kdev); return 0; From c932c1b7730408e592a7f46229dcb325da28d4c4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jul 2016 15:48:20 -0700 Subject: [PATCH 0995/3220] UPSTREAM: kdb: use task_cpu() instead of task_thread_info()->cpu commit e558af65be65713ef2e8b2aa637c6263caeed172 upstream. We'll need this cleanup to make the cpu field in thread_info be optional. Link: http://lkml.kernel.org/r/da298328dc77ea494576c2f20a934218e758a6fa.1468523549.git.luto@kernel.org Signed-off-by: Andy Lutomirski Cc: Jason Wessel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Change-Id: I0cd616f086f0eb54ed997ea153382fbf6188dba9 Signed-off-by: Amit Pundir --- include/linux/kdb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/kdb.h b/include/linux/kdb.h index a19bcf9e762e..410decacff8f 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -177,7 +177,7 @@ extern int kdb_get_kbd_char(void); static inline int kdb_process_cpu(const struct task_struct *p) { - unsigned int cpu = task_thread_info(p)->cpu; + unsigned int cpu = task_cpu(p); if (cpu > num_possible_cpus()) cpu = 0; return cpu; From e991aa38ea3856f3fb8f66b30fa44c4e85b502cd Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Fri, 11 Aug 2017 14:21:53 +0530 Subject: [PATCH 0996/3220] ANDROID: arm64: fix undeclared 'init_thread_info' error init_thread_info is deprecated in favour of THREAD_INFO_IN_TASK related changes, see Change-Id: Ia4769ddcc6fc556e9eb6193d64fc99fe2d9e39ab ("UPSTREAM: arm64: thread_info remove stale items"). 
Use init_task.thread_info instead, to fix following build error: arch/arm64/kernel/setup.c: In function 'setup_arch': arch/arm64/kernel/setup.c:356:2: error: 'init_thread_info' undeclared (first use in this function) init_thread_info.ttbr0 = virt_to_phys(empty_zero_page); ^ Change-Id: I13bf03211f0d918d388d1436099d286c10a23e5d Fixes: Change-Id: I85a49f70e13b153b9903851edf56f6531c14e6de ("BACKPORT: arm64: Disable TTBR0_EL1 during normal kernel execution") Fixes: Change-Id: Ia4769ddcc6fc556e9eb6193d64fc99fe2d9e39ab ("UPSTREAM: arm64: thread_info remove stale items") Signed-off-by: Amit Pundir --- arch/arm64/include/asm/thread_info.h | 2 ++ arch/arm64/kernel/setup.c | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index ec4f8c04aeda..67dd228c3f17 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -90,6 +90,8 @@ static inline struct thread_info *current_thread_info(void) return (struct thread_info *)sp_el0; } + +#define init_thread_info (init_thread_union.thread_info) #endif #define init_stack (init_thread_union.stack) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 6591bf23422b..ede6cc373f43 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -349,11 +349,15 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* - * Make sure init_thread_info.ttbr0 always generates translation + * Make sure thread_info.ttbr0 always generates translation * faults in case uaccess_enable() is inadvertently called by the init * thread. */ - init_thread_info.ttbr0 = virt_to_phys(empty_zero_page); +#ifdef CONFIG_THREAD_INFO_IN_TASK + init_task.thread_info.ttbr0 = virt_to_phys(empty_zero_page); +#else + init_thread_info.ttbr0 = (init_thread_union.thread_info); +#endif #endif #ifdef CONFIG_VT From 286f536cbfe03c0a10ddac2fc214d805c494e4c0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 23 Jun 2016 12:11:17 -0700 Subject: [PATCH 0997/3220] UPSTREAM: locking: avoid passing around 'thread_info' in mutex debugging code commit 6720a305df74ca30bcc10fc316881641b6ff0c80 upstream. None of the code actually wants a thread_info, it all wants a task_struct, and it's just converting back and forth between the two ("ti->task" to get the task_struct from the thread_info, and "task_thread_info(task)" to go the other way). No semantic change. 
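[The round trip being deleted is just this pair of conversions; the helper names below are illustrative, as the kernel open-codes both directions:

/* thread_info -> task_struct: via the back-pointer thread_info carries */
static inline struct task_struct *ti_to_task(struct thread_info *ti)
{
	return ti->task;
}

/* task_struct -> thread_info: via the stack-based lookup */
static inline struct thread_info *task_to_ti(struct task_struct *task)
{
	return task_thread_info(task);
}

Callers of debug_mutex_add_waiter() were doing task_thread_info(task) only so the callee could immediately do ti->task again; passing the task_struct directly removes both steps, and stops the debug code from assuming thread_info even has a task back-pointer.]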
Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds Change-Id: Idd7d0fe0b2b57ece9a969d178576a12f3ae90701 Signed-off-by: Amit Pundir --- kernel/locking/mutex-debug.c | 12 ++++++------ kernel/locking/mutex-debug.h | 4 ++-- kernel/locking/mutex.c | 6 +++--- kernel/locking/mutex.h | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 3ef3736002d8..9c951fade415 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter) } void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); /* Mark the current thread as blocked on the lock: */ - ti->task->blocked_on = waiter; + task->blocked_on = waiter; } void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); - DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); - DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); - ti->task->blocked_on = NULL; + DEBUG_LOCKS_WARN_ON(waiter->task != task); + DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); + task->blocked_on = NULL; list_del_init(&waiter->list); waiter->task = NULL; diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..d06ae3bb46c5 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -20,9 +20,9 @@ extern void debug_mutex_wake_waiter(struct mutex *lock, extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); extern void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + struct task_struct *task); extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + struct task_struct *task); extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 89350f924c85..f42f83a36506 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -537,7 +537,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto skip_wait; debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); + debug_mutex_add_waiter(lock, &waiter, task); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); @@ -584,7 +584,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } __set_task_state(task, TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, current_thread_info()); + mutex_remove_waiter(lock, &waiter, task); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -605,7 +605,7 @@ skip_wait: return 0; err: - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); + mutex_remove_waiter(lock, &waiter, task); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 5cda397607f2..a68bae5e852a 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -13,7 +13,7 @@ do { spin_lock(lock); (void)(flags); } while (0) #define 
spin_unlock_mutex(lock, flags) \ do { spin_unlock(lock); (void)(flags); } while (0) -#define mutex_remove_waiter(lock, waiter, ti) \ +#define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER From 107c1d57f89d1c758723eab704ca9cd44b86ebfe Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 15 Aug 2017 15:55:46 +0530 Subject: [PATCH 0998/3220] ANDROID: arm64: Fix a copy-paste error in prior init_thread_info build fix Fix a really embarrassing copy-paste error introduced in Change-Id: I13bf03211f0d918d388d1436099d286c10a23e5d to fix init_thread_info build error. Fixes: Change-Id: I13bf03211f0d918d388d1436099d286c10a23e5d ("ANDROID: arm64: fix undeclared 'init_thread_info' error") Signed-off-by: Amit Pundir --- arch/arm64/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index ede6cc373f43..4a24719026a9 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -356,7 +356,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_THREAD_INFO_IN_TASK init_task.thread_info.ttbr0 = virt_to_phys(empty_zero_page); #else - init_thread_info.ttbr0 = (init_thread_union.thread_info); + init_thread_info.ttbr0 = virt_to_phys(empty_zero_page); #endif #endif From b32ed10511d469f0e05b0ada753379f4834c2169 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 3 Jan 2017 18:27:01 +0000 Subject: [PATCH 0999/3220] UPSTREAM: arm64: restore get_current() optimisation commit 9d84fb27fa135c99c9fe3de33628774a336a70a8 upstream. Commit c02433dd6de32f04 ("arm64: split thread_info from task stack") inverted the relationship between get_current() and current_thread_info(), with sp_el0 now holding the current task_struct rather than the current thread_info. The new implementation of get_current() prevents the compiler from being able to optimize repeated calls to either, resulting in a noticeable penalty in some microbenchmarks. This patch restores the previous optimisation by implementing get_current() in the same way as our old current_thread_info(), using a non-volatile asm statement. Acked-by: Will Deacon Signed-off-by: Mark Rutland Reported-by: Davidlohr Bueso Signed-off-by: Catalin Marinas Signed-off-by: Amit Pundir --- arch/arm64/include/asm/current.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/current.h b/arch/arm64/include/asm/current.h index 2e61d21294ba..483a6c9d3e10 100644 --- a/arch/arm64/include/asm/current.h +++ b/arch/arm64/include/asm/current.h @@ -10,9 +10,17 @@ #ifdef CONFIG_THREAD_INFO_IN_TASK struct task_struct; +/* + * We don't use read_sysreg() as we want the compiler to cache the value where + * possible. + */ static __always_inline struct task_struct *get_current(void) { - return (struct task_struct *)read_sysreg(sp_el0); + unsigned long sp_el0; + + asm ("mrs %0, sp_el0" : "=r" (sp_el0)); + + return (struct task_struct *)sp_el0; } #define current get_current() #else From 39a15ad2d7b90450f2defb57518dc8518fe43e66 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 1 Dec 2016 15:55:13 +0000 Subject: [PATCH 1000/3220] UPSTREAM: arm64: smp: Prevent raw_smp_processor_id() recursion Under CONFIG_DEBUG_PREEMPT=y, this_cpu_ptr() ends up calling back into raw_smp_processor_id(), resulting in some hilariously catastrophic infinite recursion. In the normal case, we have: #define this_cpu_ptr(ptr) raw_cpu_ptr(ptr) and everything is dandy.
However for CONFIG_DEBUG_PREEMPT, this_cpu_ptr() is defined in terms of my_cpu_offset, wherein the fun begins: #define my_cpu_offset per_cpu_offset(smp_processor_id()) ... #define smp_processor_id() debug_smp_processor_id() ... notrace unsigned int debug_smp_processor_id(void) { return check_preemption_disabled("smp_processor_id", ""); ... notrace static unsigned int check_preemption_disabled(const char *what1, const char *what2) { int this_cpu = raw_smp_processor_id(); and bang. Use raw_cpu_ptr() directly to avoid that. Fixes: 57c82954e77f ("arm64: make cpu number a percpu variable") Reported-by: Marek Szyprowski Acked-by: Will Deacon Signed-off-by: Robin Murphy Tested-by: Marek Szyprowski Signed-off-by: Catalin Marinas (cherry picked from commit 34a6980c82fb1342e7064844c95aa4cf933e5ecc) Signed-off-by: John Stultz --- arch/arm64/include/asm/smp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index a05033beb2a2..4325b3622a92 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -28,8 +28,10 @@ DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); * We don't use this_cpu_read(cpu_number) as that has implicit writes to * preempt_count, and associated (compiler) barriers, that we'd like to avoid * the expense of. If we're preemptible, the value can be stale at use anyway. + * And we can't use this_cpu_ptr() either, as that winds up recursing back + * here under CONFIG_DEBUG_PREEMPT=y. */ -#define raw_smp_processor_id() (*this_cpu_ptr(&cpu_number)) +#define raw_smp_processor_id() (*raw_cpu_ptr(&cpu_number)) struct seq_file; From 623f33f213deb39994decbdc7d3b8cea82c4d558 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Thu, 20 Apr 2017 18:54:13 -0700 Subject: [PATCH 1001/3220] ANDROID: Use sk_uid to replace uid get from socket file Retrieve the socket uid from the sk_uid field added to struct sk instead of reading it from sk->socket->file. This prevents the packet from being dropped when the socket file doesn't exist. Bug: 37524657 Signed-off-by: Chenbo Feng Change-Id: I10545a308d61a2124077cb6f05fb0f5d9b4559ad --- net/netfilter/xt_qtaguid.c | 50 ++++++++++++----------------- net/netfilter/xt_qtaguid_internal.h | 4 +-- 2 files changed, 23 insertions(+), 31 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index f95aba44e649..fbe4e1ecce6a 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1717,18 +1717,9 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) } MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d fam=%d proto=%d\n", par->hooknum, sk, got_sock, par->family, ipx_proto(skb, par)); - if (sk != NULL) { - set_sk_callback_lock = true; - read_lock_bh(&sk->sk_callback_lock); - MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", - par->hooknum, sk, sk->sk_socket, - sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); - filp = sk->sk_socket ? sk->sk_socket->file : NULL; - MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", - par->hooknum, filp ? from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); - } - if (sk == NULL || sk->sk_socket == NULL) { + + if (sk == NULL) { /* * Here, the qtaguid_find_sk() using connection tracking * couldn't find the owner, so for now we just count them @@ -1741,9 +1732,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) */ if (!(info->match & XT_QTAGUID_UID)) account_for_uid(skb, sk, 0, par); - MT_DEBUG("qtaguid[%d]: leaving (sk?sk->sk_socket)=%p\n", - par->hooknum, - sk ?
sk->sk_socket : NULL); + MT_DEBUG("qtaguid[%d]: leaving (sk=NULL)\n", par->hooknum); res = (info->match ^ info->invert) == 0; atomic64_inc(&qtu_events.match_no_sk); goto put_sock_ret_res; @@ -1751,16 +1740,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) res = false; goto put_sock_ret_res; } - filp = sk->sk_socket->file; - if (filp == NULL) { - MT_DEBUG("qtaguid[%d]: leaving filp=NULL\n", par->hooknum); - account_for_uid(skb, sk, 0, par); - res = ((info->match ^ info->invert) & - (XT_QTAGUID_UID | XT_QTAGUID_GID)) == 0; - atomic64_inc(&qtu_events.match_no_sk_file); - goto put_sock_ret_res; - } - sock_uid = filp->f_cred->fsuid; + sock_uid = sk->sk_uid; /* * TODO: unhack how to force just accounting. * For now we only do iface stats when the uid-owner is not requested @@ -1778,8 +1758,8 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min); kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max); - if ((uid_gte(filp->f_cred->fsuid, uid_min) && - uid_lte(filp->f_cred->fsuid, uid_max)) ^ + if ((uid_gte(sk->sk_uid, uid_min) && + uid_lte(sk->sk_uid, uid_max)) ^ !(info->invert & XT_QTAGUID_UID)) { MT_DEBUG("qtaguid[%d]: leaving uid not matching\n", par->hooknum); @@ -1790,7 +1770,19 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) if (info->match & XT_QTAGUID_GID) { kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min); kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max); - + set_sk_callback_lock = true; + read_lock_bh(&sk->sk_callback_lock); + MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", + par->hooknum, sk, sk->sk_socket, + sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); + filp = sk->sk_socket ? sk->sk_socket->file : NULL; + if (!filp) { + res = ((info->match ^ info->invert) & XT_QTAGUID_GID) == 0; + atomic64_inc(&qtu_events.match_no_sk_gid); + goto put_sock_ret_res; + } + MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", + par->hooknum, filp ? from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); if ((gid_gte(filp->f_cred->fsgid, gid_min) && gid_lte(filp->f_cred->fsgid, gid_max)) ^ !(info->invert & XT_QTAGUID_GID)) { @@ -1962,7 +1954,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) "match_found_sk_in_ct=%llu " "match_found_no_sk_in_ct=%llu " "match_no_sk=%llu " - "match_no_sk_file=%llu\n", + "match_no_sk_gid=%llu\n", (u64)atomic64_read(&qtu_events.sockets_tagged), (u64)atomic64_read(&qtu_events.sockets_untagged), (u64)atomic64_read(&qtu_events.counter_set_changes), @@ -1974,7 +1966,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) (u64)atomic64_read(&qtu_events.match_found_sk_in_ct), (u64)atomic64_read(&qtu_events.match_found_no_sk_in_ct), (u64)atomic64_read(&qtu_events.match_no_sk), - (u64)atomic64_read(&qtu_events.match_no_sk_file)); + (u64)atomic64_read(&qtu_events.match_no_sk_gid)); /* Count the following as part of the last item_index. No need * to lock the sock_tag_list here since it is already locked when diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h index 8178fbdfb036..c7052707a6a4 100644 --- a/net/netfilter/xt_qtaguid_internal.h +++ b/net/netfilter/xt_qtaguid_internal.h @@ -289,10 +289,10 @@ struct qtaguid_event_counts { */ atomic64_t match_no_sk; /* - * The file ptr in the sk_socket wasn't there. + * The file ptr in the sk_socket wasn't there and we couldn't get GID. * This might happen for traffic while the socket is being closed. 
*/ - atomic64_t match_no_sk_file; + atomic64_t match_no_sk_gid; }; /* Track the set active_set for the given tag. */ From c5c3d46ccc5dbce050a070cd6c70dd36c67a15e5 Mon Sep 17 00:00:00 2001 From: Yang Jin Date: Wed, 26 Jul 2017 12:52:22 -0700 Subject: [PATCH 1002/3220] uid_sys_stats: log task io with a debug flag Add a hashmap inside each uid_entry to keep track of task name and io. Task full name is a combination of thread and process name. Bug: 63739275 Change-Id: I30083b757eaef8c61e55a213a883ce8d0c9cf2b1 Signed-off-by: Yang Jin --- drivers/misc/Kconfig | 7 + drivers/misc/uid_sys_stats.c | 303 ++++++++++++++++++++++++++++++----- 2 files changed, 266 insertions(+), 44 deletions(-) diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 9e649992c177..88056d1e8feb 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -533,6 +533,13 @@ config UID_SYS_STATS Per UID based io statistics exported to /proc/uid_io Per UID based procstat control in /proc/uid_procstat +config UID_SYS_STATS_DEBUG + bool "Per-TASK statistics" + depends on UID_SYS_STATS + default n + help + Per TASK based io statistics exported to /proc/uid_io + config MEMORY_STATE_TIME tristate "Memory freq/bandwidth time statistics" depends on PROFILING diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 3c9d311106cd..2e4364b96b9a 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -52,6 +52,15 @@ struct io_stats { #define UID_STATE_DEAD_TASKS 4 #define UID_STATE_SIZE 5 +#define MAX_TASK_COMM_LEN 256 + +struct task_entry { + char comm[MAX_TASK_COMM_LEN]; + pid_t pid; + struct io_stats io[UID_STATE_SIZE]; + struct hlist_node hash; +}; + struct uid_entry { uid_t uid; cputime_t utime; @@ -61,8 +70,231 @@ struct uid_entry { int state; struct io_stats io[UID_STATE_SIZE]; struct hlist_node hash; +#ifdef CONFIG_UID_SYS_STATS_DEBUG + DECLARE_HASHTABLE(task_entries, UID_HASH_BITS); +#endif }; +static u64 compute_write_bytes(struct task_struct *task) +{ + if (task->ioac.write_bytes <= task->ioac.cancelled_write_bytes) + return 0; + + return task->ioac.write_bytes - task->ioac.cancelled_write_bytes; +} + +static void compute_io_bucket_stats(struct io_stats *io_bucket, + struct io_stats *io_curr, + struct io_stats *io_last, + struct io_stats *io_dead) +{ + /* tasks could switch to another uid group, but its io_last in the + * previous uid group could still be positive. + * therefore before each update, do an overflow check first + */ + int64_t delta; + + delta = io_curr->read_bytes + io_dead->read_bytes - + io_last->read_bytes; + io_bucket->read_bytes += delta > 0 ? delta : 0; + delta = io_curr->write_bytes + io_dead->write_bytes - + io_last->write_bytes; + io_bucket->write_bytes += delta > 0 ? delta : 0; + delta = io_curr->rchar + io_dead->rchar - io_last->rchar; + io_bucket->rchar += delta > 0 ? delta : 0; + delta = io_curr->wchar + io_dead->wchar - io_last->wchar; + io_bucket->wchar += delta > 0 ? delta : 0; + delta = io_curr->fsync + io_dead->fsync - io_last->fsync; + io_bucket->fsync += delta > 0 ? 
delta : 0; + + io_last->read_bytes = io_curr->read_bytes; + io_last->write_bytes = io_curr->write_bytes; + io_last->rchar = io_curr->rchar; + io_last->wchar = io_curr->wchar; + io_last->fsync = io_curr->fsync; + + memset(io_dead, 0, sizeof(struct io_stats)); +} + +#ifdef CONFIG_UID_SYS_STATS_DEBUG +static void get_full_task_comm(struct task_entry *task_entry, + struct task_struct *task) +{ + int i = 0, offset = 0, len = 0; + /* save one byte for terminating null character */ + int unused_len = MAX_TASK_COMM_LEN - TASK_COMM_LEN - 1; + char buf[unused_len]; + struct mm_struct *mm = task->mm; + + /* fill the first TASK_COMM_LEN bytes with thread name */ + get_task_comm(task_entry->comm, task); + i = strlen(task_entry->comm); + while (i < TASK_COMM_LEN) + task_entry->comm[i++] = ' '; + + /* next the executable file name */ + if (mm) { + down_read(&mm->mmap_sem); + if (mm->exe_file) { + char *pathname = d_path(&mm->exe_file->f_path, buf, + unused_len); + + if (!IS_ERR(pathname)) { + len = strlcpy(task_entry->comm + i, pathname, + unused_len); + i += len; + task_entry->comm[i++] = ' '; + unused_len--; + } + } + up_read(&mm->mmap_sem); + } + unused_len -= len; + + /* fill the rest with command line argument + * replace each null or new line character + * between args in argv with whitespace */ + len = get_cmdline(task, buf, unused_len); + while (offset < len) { + if (buf[offset] != '\0' && buf[offset] != '\n') + task_entry->comm[i++] = buf[offset]; + else + task_entry->comm[i++] = ' '; + offset++; + } + + /* get rid of trailing whitespaces in case when arg is memset to + * zero before being reset in userspace + */ + while (task_entry->comm[i-1] == ' ') + i--; + task_entry->comm[i] = '\0'; +} + +static struct task_entry *find_task_entry(struct uid_entry *uid_entry, + struct task_struct *task) +{ + struct task_entry *task_entry; + + hash_for_each_possible(uid_entry->task_entries, task_entry, hash, + task->pid) { + if (task->pid == task_entry->pid) { + /* if thread name changed, update the entire command */ + int len = strnchr(task_entry->comm, ' ', TASK_COMM_LEN) + - task_entry->comm; + + if (strncmp(task_entry->comm, task->comm, len)) + get_full_task_comm(task_entry, task); + return task_entry; + } + } + return NULL; +} + +static struct task_entry *find_or_register_task(struct uid_entry *uid_entry, + struct task_struct *task) +{ + struct task_entry *task_entry; + pid_t pid = task->pid; + + task_entry = find_task_entry(uid_entry, task); + if (task_entry) + return task_entry; + + task_entry = kzalloc(sizeof(struct task_entry), GFP_ATOMIC); + if (!task_entry) + return NULL; + + get_full_task_comm(task_entry, task); + + task_entry->pid = pid; + hash_add(uid_entry->task_entries, &task_entry->hash, (unsigned int)pid); + + return task_entry; +} + +static void remove_uid_tasks(struct uid_entry *uid_entry) +{ + struct task_entry *task_entry; + unsigned long bkt_task; + struct hlist_node *tmp_task; + + hash_for_each_safe(uid_entry->task_entries, bkt_task, + tmp_task, task_entry, hash) { + hash_del(&task_entry->hash); + kfree(task_entry); + } +} + +static void set_io_uid_tasks_zero(struct uid_entry *uid_entry) +{ + struct task_entry *task_entry; + unsigned long bkt_task; + + hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) { + memset(&task_entry->io[UID_STATE_TOTAL_CURR], 0, + sizeof(struct io_stats)); + } +} + +static void add_uid_tasks_io_stats(struct uid_entry *uid_entry, + struct task_struct *task, int slot) +{ + struct task_entry *task_entry = find_or_register_task(uid_entry, task); + 
struct io_stats *task_io_slot = &task_entry->io[slot]; + + task_io_slot->read_bytes += task->ioac.read_bytes; + task_io_slot->write_bytes += compute_write_bytes(task); + task_io_slot->rchar += task->ioac.rchar; + task_io_slot->wchar += task->ioac.wchar; + task_io_slot->fsync += task->ioac.syscfs; +} + +static void compute_io_uid_tasks(struct uid_entry *uid_entry) +{ + struct task_entry *task_entry; + unsigned long bkt_task; + + hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) { + compute_io_bucket_stats(&task_entry->io[uid_entry->state], + &task_entry->io[UID_STATE_TOTAL_CURR], + &task_entry->io[UID_STATE_TOTAL_LAST], + &task_entry->io[UID_STATE_DEAD_TASKS]); + } +} + +static void show_io_uid_tasks(struct seq_file *m, struct uid_entry *uid_entry) +{ + struct task_entry *task_entry; + unsigned long bkt_task; + + hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) { + /* Separated by comma because space exists in task comm */ + seq_printf(m, "task,%s,%lu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n", + task_entry->comm, + (unsigned long)task_entry->pid, + task_entry->io[UID_STATE_FOREGROUND].rchar, + task_entry->io[UID_STATE_FOREGROUND].wchar, + task_entry->io[UID_STATE_FOREGROUND].read_bytes, + task_entry->io[UID_STATE_FOREGROUND].write_bytes, + task_entry->io[UID_STATE_BACKGROUND].rchar, + task_entry->io[UID_STATE_BACKGROUND].wchar, + task_entry->io[UID_STATE_BACKGROUND].read_bytes, + task_entry->io[UID_STATE_BACKGROUND].write_bytes, + task_entry->io[UID_STATE_FOREGROUND].fsync, + task_entry->io[UID_STATE_BACKGROUND].fsync); + } +} +#else +static void remove_uid_tasks(struct uid_entry *uid_entry) {}; +static void set_io_uid_tasks_zero(struct uid_entry *uid_entry) {}; +static void add_uid_tasks_io_stats(struct uid_entry *uid_entry, + struct task_struct *task, int slot) {}; +static void compute_io_uid_tasks(struct uid_entry *uid_entry) {}; +static void show_io_uid_tasks(struct seq_file *m, + struct uid_entry *uid_entry) {} +#endif + static struct uid_entry *find_uid_entry(uid_t uid) { struct uid_entry *uid_entry; @@ -86,7 +318,9 @@ static struct uid_entry *find_or_register_uid(uid_t uid) return NULL; uid_entry->uid = uid; - +#ifdef CONFIG_UID_SYS_STATS_DEBUG + hash_init(uid_entry->task_entries); +#endif hash_add(hash_table, &uid_entry->hash, uid); return uid_entry; @@ -192,6 +426,7 @@ static ssize_t uid_remove_write(struct file *file, hash_for_each_possible_safe(hash_table, uid_entry, tmp, hash, (uid_t)uid_start) { if (uid_start == uid_entry->uid) { + remove_uid_tasks(uid_entry); hash_del(&uid_entry->hash); kfree(uid_entry); } @@ -208,13 +443,6 @@ static const struct file_operations uid_remove_fops = { .write = uid_remove_write, }; -static u64 compute_write_bytes(struct task_struct *task) -{ - if (task->ioac.write_bytes <= task->ioac.cancelled_write_bytes) - return 0; - - return task->ioac.write_bytes - task->ioac.cancelled_write_bytes; -} static void add_uid_io_stats(struct uid_entry *uid_entry, struct task_struct *task, int slot) @@ -226,28 +454,8 @@ static void add_uid_io_stats(struct uid_entry *uid_entry, io_slot->rchar += task->ioac.rchar; io_slot->wchar += task->ioac.wchar; io_slot->fsync += task->ioac.syscfs; -} -static void compute_uid_io_bucket_stats(struct io_stats *io_bucket, - struct io_stats *io_curr, - struct io_stats *io_last, - struct io_stats *io_dead) -{ - io_bucket->read_bytes += io_curr->read_bytes + io_dead->read_bytes - - io_last->read_bytes; - io_bucket->write_bytes += io_curr->write_bytes + io_dead->write_bytes - - 
io_last->write_bytes; - io_bucket->rchar += io_curr->rchar + io_dead->rchar - io_last->rchar; - io_bucket->wchar += io_curr->wchar + io_dead->wchar - io_last->wchar; - io_bucket->fsync += io_curr->fsync + io_dead->fsync - io_last->fsync; - - io_last->read_bytes = io_curr->read_bytes; - io_last->write_bytes = io_curr->write_bytes; - io_last->rchar = io_curr->rchar; - io_last->wchar = io_curr->wchar; - io_last->fsync = io_curr->fsync; - - memset(io_dead, 0, sizeof(struct io_stats)); + add_uid_tasks_io_stats(uid_entry, task, slot); } static void update_io_stats_all_locked(void) @@ -258,9 +466,11 @@ static void update_io_stats_all_locked(void) unsigned long bkt; uid_t uid; - hash_for_each(hash_table, bkt, uid_entry, hash) + hash_for_each(hash_table, bkt, uid_entry, hash) { memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, sizeof(struct io_stats)); + set_io_uid_tasks_zero(uid_entry); + } rcu_read_lock(); do_each_thread(temp, task) { @@ -274,10 +484,11 @@ static void update_io_stats_all_locked(void) rcu_read_unlock(); hash_for_each(hash_table, bkt, uid_entry, hash) { - compute_uid_io_bucket_stats(&uid_entry->io[uid_entry->state], + compute_io_bucket_stats(&uid_entry->io[uid_entry->state], &uid_entry->io[UID_STATE_TOTAL_CURR], &uid_entry->io[UID_STATE_TOTAL_LAST], &uid_entry->io[UID_STATE_DEAD_TASKS]); + compute_io_uid_tasks(uid_entry); } } @@ -288,6 +499,7 @@ static void update_io_stats_uid_locked(struct uid_entry *uid_entry) memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, sizeof(struct io_stats)); + set_io_uid_tasks_zero(uid_entry); rcu_read_lock(); do_each_thread(temp, task) { @@ -297,12 +509,14 @@ static void update_io_stats_uid_locked(struct uid_entry *uid_entry) } while_each_thread(temp, task); rcu_read_unlock(); - compute_uid_io_bucket_stats(&uid_entry->io[uid_entry->state], + compute_io_bucket_stats(&uid_entry->io[uid_entry->state], &uid_entry->io[UID_STATE_TOTAL_CURR], &uid_entry->io[UID_STATE_TOTAL_LAST], &uid_entry->io[UID_STATE_DEAD_TASKS]); + compute_io_uid_tasks(uid_entry); } + static int uid_io_show(struct seq_file *m, void *v) { struct uid_entry *uid_entry; @@ -314,21 +528,22 @@ static int uid_io_show(struct seq_file *m, void *v) hash_for_each(hash_table, bkt, uid_entry, hash) { seq_printf(m, "%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", - uid_entry->uid, - uid_entry->io[UID_STATE_FOREGROUND].rchar, - uid_entry->io[UID_STATE_FOREGROUND].wchar, - uid_entry->io[UID_STATE_FOREGROUND].read_bytes, - uid_entry->io[UID_STATE_FOREGROUND].write_bytes, - uid_entry->io[UID_STATE_BACKGROUND].rchar, - uid_entry->io[UID_STATE_BACKGROUND].wchar, - uid_entry->io[UID_STATE_BACKGROUND].read_bytes, - uid_entry->io[UID_STATE_BACKGROUND].write_bytes, - uid_entry->io[UID_STATE_FOREGROUND].fsync, - uid_entry->io[UID_STATE_BACKGROUND].fsync); + uid_entry->uid, + uid_entry->io[UID_STATE_FOREGROUND].rchar, + uid_entry->io[UID_STATE_FOREGROUND].wchar, + uid_entry->io[UID_STATE_FOREGROUND].read_bytes, + uid_entry->io[UID_STATE_FOREGROUND].write_bytes, + uid_entry->io[UID_STATE_BACKGROUND].rchar, + uid_entry->io[UID_STATE_BACKGROUND].wchar, + uid_entry->io[UID_STATE_BACKGROUND].read_bytes, + uid_entry->io[UID_STATE_BACKGROUND].write_bytes, + uid_entry->io[UID_STATE_FOREGROUND].fsync, + uid_entry->io[UID_STATE_BACKGROUND].fsync); + + show_io_uid_tasks(m, uid_entry); } rt_mutex_unlock(&uid_lock); - return 0; } From ad0af9b183c73b40f0448b587e32bf42227b5d08 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 17 Aug 2017 18:52:04 +0530 Subject: [PATCH 1003/3220] ANDROID: uid_sys_stats: Fix implicit 
declaration of get_cmdline() Include linux/mm.h for the get_cmdline() declaration. Change-Id: Icad6ef7deef4d93d92d423c96bfa61fb5d66d0b7 Fixes: Change-Id: I30083b757eaef8c61e55a213a883ce8d0c9cf2b1 ("uid_sys_stats: log task io with a debug flag") Signed-off-by: Amit Pundir --- drivers/misc/uid_sys_stats.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 2e4364b96b9a..031320e51522 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -19,6 +19,7 @@ #include #include #include +#include <linux/mm.h> #include #include #include From 1f3f566d9b41ceffb15e73da5e32c0e249acd535 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Mon, 14 Aug 2017 12:45:38 +0200 Subject: [PATCH 1004/3220] ANDROID: usb: gadget: configfs: fix null ptr in android_disconnect There's a race between usb_gadget_udc_stop(), which is likely to set the gadget driver to NULL in the udc driver, and this driver's gadget disconnect fn, which likely checks whether the gadget driver is a null ptr. It happens that unbind (doing set_gadget_data(NULL)) is called before the gadget driver is set to NULL, and the udc driver calls the disconnect fn, which results in cdev being a null ptr. As a workaround we check cdev in android_disconnect() to prevent the following panic: Unable to handle kernel NULL pointer dereference at virtual address 000000a8 pgd = ffffff800940a000 [000000a8] *pgd=00000000be1fe003, *pud=00000000be1fe003, *pmd=0000000000000000 Internal error: Oops: 96000046 [#1] PREEMPT SMP CPU: 7 PID: 1134 Comm: kworker/u16:3 Tainted: G S 4.9.41-g75cd2a0231ea-dirty #4 Hardware name: HiKey960 (DT) Workqueue: events_power_efficient event_work task: ffffffc0b5f4f000 task.stack: ffffffc0b5b94000 PC is at android_disconnect+0x54/0xa4 LR is at android_disconnect+0x54/0xa4 pc : [] lr : [] pstate: 80000185 sp : ffffffc0b5b97bf0 x29: ffffffc0b5b97bf0 x28: 0000000000000003 x27: ffffffc0b5181c54 x26: ffffffc0b5181c68 x25: ffffff8008dc1000 x24: ffffffc0b5181d70 x23: ffffff8008dc18a0 x22: ffffffc0b5f5a018 x21: ffffffc0b5894ad8 x20: 0000000000000000 x19: ffffff8008ddaec8 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 00000000007c9ccd x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000001 x10: 0000000000000001 x9 : ffffff800930f1a8 x8 : ffffff800932a133 x7 : 0000000000000000 x6 : 0000000000000000 x5 : ffffffc0b5b97a50 x4 : ffffffc0be19f090 x3 : 0000000000000000 x2 : ffffff80091ca000 x1 : 000000000000002f x0 : 000000000000002f This happened on a hikey960 with the following backtrace: [] android_disconnect+0x54/0xa4 [] dwc3_disconnect_gadget.part.19+0x114.888119] [] dwc3_gadget_suspend+0x6c/0x70 [] dwc3_suspend_device+0x58/0xa0 [] dwc3_otg_work+0x214/0x474 [] event_work+0x3bc/0x5ac [] process_one_work+0x14c/0x43c [] worker_thread+0x5c/0x438 [] kthread+0xec/0x100 [] ret_from_fork+0x10/0x50 dwc3_otg_work tries to handle a switch from host to device mode and therefore calls disconnect on the gadget driver. To reproduce the issue it is enough to enable tethering (rndis gadget), then unplug and plug the usb connector back in, which causes the change from device to host and back to device mode.
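The workaround amounts to a defensive NULL check before the first dereference. A standalone sketch of the pattern, with hypothetical types and fprintf standing in for WARN:

    #include <stdio.h>

    struct composite_dev { int dummy; };
    struct gadget { void *data; };

    static struct composite_dev *get_gadget_data(struct gadget *g)
    {
        return g->data;
    }

    static void disconnect(struct gadget *g)
    {
        struct composite_dev *cdev = get_gadget_data(g);

        /* unbind may already have run set_gadget_data(NULL) */
        if (cdev == NULL) {
            fprintf(stderr, "disconnect: gadget already unbound\n");
            return;  /* bail out instead of dereferencing */
        }
        /* ... normal disconnect handling using cdev ... */
    }

    int main(void)
    {
        struct gadget g = { .data = NULL };  /* simulate losing the race */
        disconnect(&g);                      /* must not crash */
        return 0;
    }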
Signed-off-by: Danilo Krummrich --- drivers/usb/gadget/configfs.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index 54849fe9cb01..a0bd3c542d06 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -1524,6 +1524,18 @@ static void android_disconnect(struct usb_gadget *gadget) struct usb_composite_dev *cdev = get_gadget_data(gadget); struct gadget_info *gi = container_of(cdev, struct gadget_info, cdev); + /* FIXME: There's a race between usb_gadget_udc_stop() which is likely + * to set the gadget driver to NULL in the udc driver and this drivers + * gadget disconnect fn which likely checks for the gadget driver to + * be a null ptr. It happens that unbind (doing set_gadget_data(NULL)) + * is called before the gadget driver is set to NULL and the udc driver + * calls disconnect fn which results in cdev being a null ptr. + */ + if (cdev == NULL) { + WARN(1, "%s: gadget driver already disconnected\n", __func__); + return; + } + /* accessory HID support can be active while the accessory function is not actually enabled, so we need to inform it when we are disconnected. From 967ca302456550859bfea3cda9971a3035b6889f Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 21 Jul 2017 14:58:16 -0700 Subject: [PATCH 1005/3220] ANDROID: usb: gadget: assign no-op request complete callbacks The req->complete field seems to persist the callback pointer for the control requests. This causes the accessory's serial string to be overridden when an accessory-function-specific OUT control request is issued right after the ACCESSORY_SEND_STRING control request. Therefore, assign a no-op req complete function when nothing needs to be done once the request is completed.
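The hazard being fixed is a function pointer that persists from one control request to the next. A standalone sketch with a hypothetical request structure:

    #include <stdio.h>

    struct request {
        void (*complete)(struct request *req);
    };

    static void store_string(struct request *req)
    {
        (void)req;
        puts("copying string into dev->serial");
    }

    static void noop_complete(struct request *req)
    {
        (void)req;  /* nothing to do for this request */
    }

    static void finish(struct request *req)
    {
        if (req->complete)
            req->complete(req);
    }

    int main(void)
    {
        struct request req = { 0 };

        req.complete = store_string;   /* e.g. ACCESSORY_SEND_STRING */
        finish(&req);

        /* Without resetting it, the next request would re-run
         * store_string and clobber the serial; the fix installs a
         * no-op for requests that need no completion work. */
        req.complete = noop_complete;  /* e.g. ACCESSORY_SET_AUDIO_MODE */
        finish(&req);
        return 0;
    }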
Signed-off-by: Badhri Jagan Sridharan Bug: 63867169 Change-Id: I78c1602e9a044b8718b270b8a068cf5afc83f984 --- drivers/usb/gadget/function/f_accessory.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index 76b8ae08a551..d31c0809046f 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -812,6 +812,14 @@ static struct hid_driver acc_hid_driver = { .probe = acc_hid_probe, }; +static void acc_complete_setup_noop(struct usb_ep *ep, struct usb_request *req) +{ + /* + * Default no-op function when nothing needs to be done for the + * setup request + */ +} + int acc_ctrlrequest(struct usb_composite_dev *cdev, const struct usb_ctrlrequest *ctrl) { @@ -839,6 +847,7 @@ int acc_ctrlrequest(struct usb_composite_dev *cdev, schedule_delayed_work( &dev->start_work, msecs_to_jiffies(10)); value = 0; + cdev->req->complete = acc_complete_setup_noop; } else if (b_request == ACCESSORY_SEND_STRING) { dev->string_index = w_index; cdev->gadget->ep0->driver_data = dev; @@ -847,10 +856,13 @@ int acc_ctrlrequest(struct usb_composite_dev *cdev, } else if (b_request == ACCESSORY_SET_AUDIO_MODE && w_index == 0 && w_length == 0) { dev->audio_mode = w_value; + cdev->req->complete = acc_complete_setup_noop; value = 0; } else if (b_request == ACCESSORY_REGISTER_HID) { + cdev->req->complete = acc_complete_setup_noop; value = acc_register_hid(dev, w_value, w_index); } else if (b_request == ACCESSORY_UNREGISTER_HID) { + cdev->req->complete = acc_complete_setup_noop; value = acc_unregister_hid(dev, w_value); } else if (b_request == ACCESSORY_SET_HID_REPORT_DESC) { spin_lock_irqsave(&dev->lock, flags); @@ -885,7 +897,7 @@ int acc_ctrlrequest(struct usb_composite_dev *cdev, if (b_request == ACCESSORY_GET_PROTOCOL) { *((u16 *)cdev->req->buf) = PROTOCOL_VERSION; value = sizeof(u16); - + cdev->req->complete = acc_complete_setup_noop; /* clear any string left over from a previous session */ memset(dev->manufacturer, 0, sizeof(dev->manufacturer)); memset(dev->model, 0, sizeof(dev->model)); From 9e6afd4e3684b5a6664349fc87c6b9fb48c375ae Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Wed, 2 Dec 2015 10:06:45 -0600 Subject: [PATCH 1006/3220] BACKPORT: usb: dwc3: gadget: handle request->zero So far, dwc3 has always missed request->zero handling for every endpoint. Let's implement that so we can handle cases where transfer must be finished with a ZLP. Note that dwc3 is a little special. Even though we're dealing with a ZLP, we still need a buffer of wMaxPacketSize bytes; to hide that detail from every gadget driver, we have a preallocated buffer of 1024 bytes (biggest bulk size) to use (and share) among all endpoints. 
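When an extra ZLP is needed reduces to a small predicate on the request; a userspace sketch (the names are illustrative, not the driver's):

    #include <assert.h>
    #include <stdbool.h>

    /* A transfer needs an explicit ZLP terminator only when the caller
     * asked for one (zero flag) and the length is an exact multiple of
     * the endpoint's wMaxPacketSize, i.e. no short packet ends it. */
    static bool needs_zlp(unsigned len, unsigned maxpacket, bool zero)
    {
        return zero && (len % maxpacket == 0);
    }

    int main(void)
    {
        assert( needs_zlp(1024, 512, true));  /* exact multiple: queue ZLP */
        assert(!needs_zlp(1000, 512, true));  /* short packet terminates */
        assert(!needs_zlp(1024, 512, false)); /* caller didn't request one */
        return 0;
    }

Note that the follow-up patch below tightens this further: a request whose length is already zero goes out as a ZLP by itself and must not get a second one.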
(cherry-picked from commit 04c03d10e507052cfce6910ddf34091196e79e1c) Reported-by: Ravi B Signed-off-by: Felipe Balbi Signed-off-by: Badhri Jagan Sridharan Bug: 63867169 Change-Id: Ied4093e92dd080f2d9dd1fd9aefb36406f66bb67 --- drivers/usb/dwc3/core.h | 3 +++ drivers/usb/dwc3/gadget.c | 49 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index 68d11d7d4028..d5b604a2c325 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -37,6 +37,7 @@ #define DWC3_MSG_MAX 500 /* Global constants */ +#define DWC3_ZLP_BUF_SIZE 1024 /* size of a superspeed bulk */ #define DWC3_EP0_BOUNCE_SIZE 512 #define DWC3_ENDPOINTS_NUM 32 #define DWC3_XHCI_RESOURCES_NUM 2 @@ -645,6 +646,7 @@ struct dwc3_scratchpad_array { * @ctrl_req: usb control request which is used for ep0 * @ep0_trb: trb which is used for the ctrl_req * @ep0_bounce: bounce buffer for ep0 + * @zlp_buf: used when request->zero is set * @setup_buf: used while precessing STD USB requests * @ctrl_req_addr: dma address of ctrl_req * @ep0_trb: dma address of ep0_trb @@ -732,6 +734,7 @@ struct dwc3 { struct usb_ctrlrequest *ctrl_req; struct dwc3_trb *ep0_trb; void *ep0_bounce; + void *zlp_buf; void *scratchbuf; u8 *setup_buf; dma_addr_t ctrl_req_addr; diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index d3bd1afd6302..c0d21367d107 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -1200,6 +1200,32 @@ out: return ret; } +static void __dwc3_gadget_ep_zlp_complete(struct usb_ep *ep, + struct usb_request *request) +{ + dwc3_gadget_ep_free_request(ep, request); +} + +static int __dwc3_gadget_ep_queue_zlp(struct dwc3 *dwc, struct dwc3_ep *dep) +{ + struct dwc3_request *req; + struct usb_request *request; + struct usb_ep *ep = &dep->endpoint; + + dwc3_trace(trace_dwc3_gadget, "queueing ZLP\n"); + request = dwc3_gadget_ep_alloc_request(ep, GFP_ATOMIC); + if (!request) + return -ENOMEM; + + request->length = 0; + request->buf = dwc->zlp_buf; + request->complete = __dwc3_gadget_ep_zlp_complete; + + req = to_dwc3_request(request); + + return __dwc3_gadget_ep_queue(dep, req); +} + static int dwc3_gadget_ep_queue(struct usb_ep *ep, struct usb_request *request, gfp_t gfp_flags) { @@ -1227,6 +1253,15 @@ static int dwc3_gadget_ep_queue(struct usb_ep *ep, struct usb_request *request, ret = __dwc3_gadget_ep_queue(dep, req); + /* + * Okay, here's the thing, if gadget driver has requested for a ZLP by + * setting request->zero, instead of doing magic, we will just queue an + * extra usb_request ourselves so that it gets handled the same way as + * any other request. 
+ */ + if (ret == 0 && request->zero && (request->length % ep->maxpacket == 0)) + ret = __dwc3_gadget_ep_queue_zlp(dwc, dep); + out: spin_unlock_irqrestore(&dwc->lock, flags); @@ -2798,6 +2833,12 @@ int dwc3_gadget_init(struct dwc3 *dwc) goto err3; } + dwc->zlp_buf = kzalloc(DWC3_ZLP_BUF_SIZE, GFP_KERNEL); + if (!dwc->zlp_buf) { + ret = -ENOMEM; + goto err4; + } + dwc->gadget.ops = &dwc3_gadget_ops; dwc->gadget.speed = USB_SPEED_UNKNOWN; dwc->gadget.sg_supported = true; @@ -2839,16 +2880,19 @@ int dwc3_gadget_init(struct dwc3 *dwc) ret = dwc3_gadget_init_endpoints(dwc); if (ret) - goto err4; + goto err5; ret = usb_add_gadget_udc(dwc->dev, &dwc->gadget); if (ret) { dev_err(dwc->dev, "failed to register udc\n"); - goto err4; + goto err5; } return 0; +err5: + kfree(dwc->zlp_buf); + err4: dwc3_gadget_free_endpoints(dwc); dma_free_coherent(dwc->dev, DWC3_EP0_BOUNCE_SIZE, @@ -2881,6 +2925,7 @@ void dwc3_gadget_exit(struct dwc3 *dwc) dwc->ep0_bounce, dwc->ep0_bounce_addr); kfree(dwc->setup_buf); + kfree(dwc->zlp_buf); dma_free_coherent(dwc->dev, sizeof(*dwc->ep0_trb) * 2, dwc->ep0_trb, dwc->ep0_trb_addr); From 8e7cf0b11e316ba11c7fc3b9bc3ba331f589cc98 Mon Sep 17 00:00:00 2001 From: John Youn Date: Tue, 22 Dec 2015 12:23:20 -0800 Subject: [PATCH 1007/3220] UPSTREAM: usb: dwc3: gadget: don't send extra ZLP If the request->length is zero, a ZLP should already be sent due to that and another ZLP is not needed to terminate the transfer. (cherry-picked from commit d9261898a4b2c143c28568dc686a1becfc637a99) Fixes: 04c03d10e507 ("usb: dwc3: gadget: handle request->zero") Signed-off-by: John Youn Signed-off-by: Felipe Balbi Signed-off-by: Badhri Jagan Sridharan Bug: 63867169 Change-Id: I97a1801c57dc169b6e9371d5aec599a14f316aff --- drivers/usb/dwc3/gadget.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index c0d21367d107..3368c3a88a57 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -1259,7 +1259,8 @@ static int dwc3_gadget_ep_queue(struct usb_ep *ep, struct usb_request *request, * extra usb_request ourselves so that it gets handled the same way as * any other request. */ - if (ret == 0 && request->zero && (request->length % ep->maxpacket == 0)) + if (ret == 0 && request->zero && request->length && + (request->length % ep->maxpacket == 0)) ret = __dwc3_gadget_ep_queue_zlp(dwc, dep); out: From eb4610b979435c98821eefd716b6117a7ca0478d Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 17 Aug 2017 10:43:14 -0700 Subject: [PATCH 1008/3220] ANDROID: NFC: st21nfca: Fix out of bounds kernel access when handling ATR_REQ Out of bounds kernel accesses in st21nfca's NFC HCI layer might happen when handling ATR_REQ events if user-specified atr_req->length is bigger than the buffer size. In that case memcpy() inside st21nfca_tm_send_atr_res() will read extra bytes resulting in OOB read from the kernel heap. 
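The essential fix is validating a packet-supplied length against both the minimum structure size and the data actually received, before the memcpy(). A minimal sketch with hypothetical sizes:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    #define ATR_REQ_MIN 16  /* stand-in for sizeof(struct ..._atr_req) */

    static int parse_atr_req(const unsigned char *data, size_t skb_len,
                             unsigned char *out, size_t out_cap)
    {
        size_t length = data[0];  /* attacker-controlled length byte */

        /* Reject both truncated and over-long claims before copying. */
        if (length < ATR_REQ_MIN || length > skb_len || length > out_cap)
            return -EPROTO;

        memcpy(out, data, length);
        return 0;
    }

    int main(void)
    {
        unsigned char pkt[8] = { 200 };  /* claims 200 bytes */
        unsigned char buf[64];

        /* Without the 'length > skb_len' check this would read 192
         * bytes past the end of the 8-byte packet. */
        printf("%d\n", parse_atr_req(pkt, sizeof(pkt), buf, sizeof(buf)));
        return 0;
    }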
Bug: 62679012 Signed-off-by: Suren Baghdasaryan --- drivers/nfc/st21nfca/dep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nfc/st21nfca/dep.c b/drivers/nfc/st21nfca/dep.c index 798a32bbac5d..206285210ab5 100644 --- a/drivers/nfc/st21nfca/dep.c +++ b/drivers/nfc/st21nfca/dep.c @@ -217,7 +217,8 @@ static int st21nfca_tm_recv_atr_req(struct nfc_hci_dev *hdev, atr_req = (struct st21nfca_atr_req *)skb->data; - if (atr_req->length < sizeof(struct st21nfca_atr_req)) { + if (atr_req->length < sizeof(struct st21nfca_atr_req) || + atr_req->length > skb->len) { r = -EPROTO; goto exit; } From 723c3e0fec28b9bdd15bbc968a14cb584fdd014b Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 17 Aug 2017 16:30:47 -0700 Subject: [PATCH 1009/3220] ANDROID: nfc: fdp: Fix possible buffer overflow in WCS4000 NFC driver Possible buffer overflow when reading next_read_size bytes into tmp buffer after next_read_size was extracted from a previous packet. Bug: 62678828 Signed-off-by: Suren Baghdasaryan --- drivers/nfc/fdp/i2c.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/nfc/fdp/i2c.c b/drivers/nfc/fdp/i2c.c index a5d7332dfce5..2d043415f326 100644 --- a/drivers/nfc/fdp/i2c.c +++ b/drivers/nfc/fdp/i2c.c @@ -177,6 +177,16 @@ static int fdp_nci_i2c_read(struct fdp_i2c_phy *phy, struct sk_buff **skb) /* Packet that contains a length */ if (tmp[0] == 0 && tmp[1] == 0) { phy->next_read_size = (tmp[2] << 8) + tmp[3] + 3; + /* + * Ensure next_read_size does not exceed sizeof(tmp) + * for reading that many bytes during next iteration + */ + if (phy->next_read_size > FDP_NCI_I2C_MAX_PAYLOAD) { + dev_dbg(&client->dev, "%s: corrupted packet\n", + __func__); + phy->next_read_size = 5; + goto flush; + } } else { phy->next_read_size = FDP_NCI_I2C_MIN_PAYLOAD; From 3f4427d43b3b684ec867be2ca2ae3287ba6dde46 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 17 Aug 2017 15:50:54 -0700 Subject: [PATCH 1010/3220] ANDROID: NFC: Fix possible memory corruption when handling SHDLC I-Frame commands When handling SHDLC I-Frame commands "pipe" field used for indexing into an array should be checked before usage. If left unchecked it might access memory outside of the array of size NFC_HCI_MAX_PIPES(127). 
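The fix is the classic validate-before-index pattern; a standalone sketch assuming the 127-entry pipe table described above:

    #include <stdio.h>

    #define MAX_PIPES 127

    struct pipe_entry { int gate; int dest_host; };
    static struct pipe_entry pipes[MAX_PIPES];

    /* Returns 0 on success, -1 (ANY_E_NOK in the driver) on a bad index. */
    static int delete_pipe(unsigned int pipe_id)
    {
        if (pipe_id >= MAX_PIPES)  /* reject before the array access */
            return -1;

        pipes[pipe_id].gate = -1;
        pipes[pipe_id].dest_host = -1;
        return 0;
    }

    int main(void)
    {
        printf("%d\n", delete_pipe(3));    /* 0: valid index */
        printf("%d\n", delete_pipe(200));  /* -1: would be out of bounds */
        return 0;
    }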
Bug: 62679701 Signed-off-by: Suren Baghdasaryan --- net/nfc/hci/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 2b0f0ac498d2..5a58f9f38095 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -209,6 +209,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, } create_info = (struct hci_create_pipe_resp *)skb->data; + if (create_info->pipe >= NFC_HCI_MAX_PIPES) { + status = NFC_HCI_ANY_E_NOK; + goto exit; + } + /* Save the new created pipe and bind with local gate, * the description for skb->data[3] is destination gate id * but since we received this cmd from host controller, we @@ -232,6 +237,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, } delete_info = (struct hci_delete_pipe_noti *)skb->data; + if (delete_info->pipe >= NFC_HCI_MAX_PIPES) { + status = NFC_HCI_ANY_E_NOK; + goto exit; + } + hdev->pipes[delete_info->pipe].gate = NFC_HCI_INVALID_GATE; hdev->pipes[delete_info->pipe].dest_host = NFC_HCI_INVALID_HOST; break; From 14a42657da62150b5ef6358d5cbb63250fd12c76 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 15 Aug 2017 15:12:24 -0700 Subject: [PATCH 1011/3220] ANDROID: check dir value of xfrm_userpolicy_id Check user provided dir value to prevent out-of-bound access which may occur if dir is not less than XFRM_POLICY_MAX. (url: http://seclists.org/bugtraq/2017/Jul/30) Bug: 64257838 Signed-off-by: Suren Baghdasaryan Change-Id: I5bbdf95e14a61bdf5207977d9a5a4465bc848da0 --- net/xfrm/xfrm_user.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 7a5a64e70b4d..4c696d4d5ce3 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1691,6 +1691,10 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, struct sk_buff *skb; int err; + err = verify_policy_dir(dir); + if (err) + return ERR_PTR(err); + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); @@ -2216,6 +2220,10 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, int n = 0; struct net *net = sock_net(skb->sk); + err = verify_policy_dir(pi->dir); + if (err) + return err; + if (attrs[XFRMA_MIGRATE] == NULL) return -EINVAL; @@ -2331,6 +2339,11 @@ static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, { struct net *net = &init_net; struct sk_buff *skb; + int err; + + err = verify_policy_dir(dir); + if (err) + return err; skb = nlmsg_new(xfrm_migrate_msgsize(num_migrate, !!k), GFP_ATOMIC); if (skb == NULL) @@ -2985,6 +2998,11 @@ out_free_skb: static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) { + int err; + + err = verify_policy_dir(dir); + if (err) + return err; switch (c->event) { case XFRM_MSG_NEWPOLICY: From efc949fedd30023a30ea34586b485554932c878f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 17 Aug 2017 13:58:40 -0700 Subject: [PATCH 1012/3220] ANDROID: NFC: st21nfca: Fix memory OOB and leak issues in connectivity events handler Overflow on memcpy is possible in kernel driver for st21nfca's NFC HCI layer when handling connectivity events if aid_len or params_len are bigger than the buffer size. Memory leak is possible when parameter tag is invalid. 
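Both halves of the fix follow from careful TLV parsing: check every length field against the remaining buffer before copying, and release allocations on every early error path. A compressed sketch, not the driver's exact layout:

    #include <errno.h>
    #include <stdlib.h>
    #include <string.h>

    #define TAG_AID    0x81
    #define TAG_PARAMS 0x82

    static int parse_transaction(const unsigned char *d, size_t len)
    {
        unsigned char *buf;
        size_t aid_len, params_len;

        if (len < 4 || d[0] != TAG_AID)  /* need tag+len, twice */
            return -EPROTO;
        aid_len = d[1];
        if (len < aid_len + 4)           /* bounds-check before use */
            return -EPROTO;

        buf = malloc(len);
        if (!buf)
            return -ENOMEM;
        memcpy(buf, &d[2], aid_len);

        params_len = d[aid_len + 3];
        if (d[aid_len + 2] != TAG_PARAMS ||
            len < aid_len + params_len + 4) {
            free(buf);                   /* don't leak on the error path */
            return -EPROTO;
        }
        memcpy(buf + aid_len, &d[aid_len + 4], params_len);
        free(buf);
        return 0;
    }

    int main(void)
    {
        unsigned char ok[8] = { TAG_AID, 2, 0xA0, 0x00, TAG_PARAMS, 2, 1, 2 };
        return parse_transaction(ok, sizeof(ok)) ? 1 : 0;
    }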
Bug: 62679581 Signed-off-by: Suren Baghdasaryan --- drivers/nfc/st21nfca/se.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/nfc/st21nfca/se.c b/drivers/nfc/st21nfca/se.c index c79d99b24c96..2d4b6e910b87 100644 --- a/drivers/nfc/st21nfca/se.c +++ b/drivers/nfc/st21nfca/se.c @@ -321,23 +321,33 @@ int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host, * AID 81 5 to 16 * PARAMETERS 82 0 to 255 */ - if (skb->len < NFC_MIN_AID_LENGTH + 2 && + if (skb->len < NFC_MIN_AID_LENGTH + 2 || skb->data[0] != NFC_EVT_TRANSACTION_AID_TAG) return -EPROTO; + /* + * Buffer should have enough space for at least + * two tag fields + two length fields + aid_len (skb->data[1]) + */ + if (skb->len < skb->data[1] + 4) + return -EPROTO; + transaction = (struct nfc_evt_transaction *)devm_kzalloc(dev, skb->len - 2, GFP_KERNEL); transaction->aid_len = skb->data[1]; memcpy(transaction->aid, &skb->data[2], transaction->aid_len); - - /* Check next byte is PARAMETERS tag (82) */ - if (skb->data[transaction->aid_len + 2] != - NFC_EVT_TRANSACTION_PARAMS_TAG) - return -EPROTO; - transaction->params_len = skb->data[transaction->aid_len + 3]; + + /* Check next byte is PARAMETERS tag (82) and the length field */ + if (skb->data[transaction->aid_len + 2] != + NFC_EVT_TRANSACTION_PARAMS_TAG || + skb->len < transaction->aid_len + transaction->params_len + 4) { + devm_kfree(dev, transaction); + return -EPROTO; + } + memcpy(transaction->params, skb->data + transaction->aid_len + 4, transaction->params_len); From 1ed33cf95476666656d510fab15d87ee04e2327d Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Aug 2017 09:56:27 -0700 Subject: [PATCH 1013/3220] UPSTREAM: cpufreq: schedutil: Make iowait boost more energy efficient Currently the iowait_boost feature in schedutil makes the frequency go to max on iowait wakeups. This feature was added to handle a case that Peter described where the throughput of operations involving continuous I/O requests [1] is reduced by running at a lower frequency; the lower throughput in turn keeps utilization low and hence frequency low, so it is "stuck". Instead of going to max, it's also possible to achieve the same effect by ramping up to max if repeated in_iowait wakeups are happening. This patch is an attempt to do that. We start from a lower frequency (policy->min) and double the boost for every consecutive iowait update until we reach the maximum iowait boost frequency (iowait_boost_max). I ran a synthetic test (continuous O_DIRECT writes in a loop) on an x86 machine with intel_pstate in passive mode using schedutil. In this test the iowait_boost value ramped from 800MHz to 4GHz in 60ms. The patch achieves the same improved throughput as the existing behavior. [1] https://patchwork.kernel.org/patch/9735885/ Change-Id: I4a018434a50f4ca29ec15b03465f6dc212e54423 Cc: Srinivas Pandruvada Cc: Len Brown Cc: Rafael J.
Wysocki Cc: Viresh Kumar Cc: Ingo Molnar Cc: Peter Zijlstra Suggested-by: Peter Zijlstra Suggested-by: Viresh Kumar Signed-off-by: Joel Fernandes --- kernel/sched/cpufreq_schedutil.c | 38 +++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e12309c1b07b..f6b95ca15023 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -64,6 +64,7 @@ struct sugov_cpu { struct update_util_data update_util; struct sugov_policy *sg_policy; + bool iowait_boost_pending; unsigned long iowait_boost; unsigned long iowait_boost_max; u64 last_update; @@ -224,30 +225,54 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) { if (flags & SCHED_CPUFREQ_IOWAIT) { - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + if (sg_cpu->iowait_boost_pending) + return; + + sg_cpu->iowait_boost_pending = true; + + if (sg_cpu->iowait_boost) { + sg_cpu->iowait_boost <<= 1; + if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + } else { + sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; + } } else if (sg_cpu->iowait_boost) { s64 delta_ns = time - sg_cpu->last_update; /* Clear iowait_boost if the CPU apprears to have been idle. */ - if (delta_ns > TICK_NSEC) + if (delta_ns > TICK_NSEC) { sg_cpu->iowait_boost = 0; + sg_cpu->iowait_boost_pending = false; + } } } static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, unsigned long *max) { - unsigned long boost_util = sg_cpu->iowait_boost; - unsigned long boost_max = sg_cpu->iowait_boost_max; + unsigned long boost_util, boost_max; - if (!boost_util) + if (!sg_cpu->iowait_boost) return; + if (sg_cpu->iowait_boost_pending) { + sg_cpu->iowait_boost_pending = false; + } else { + sg_cpu->iowait_boost >>= 1; + if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { + sg_cpu->iowait_boost = 0; + return; + } + } + + boost_util = sg_cpu->iowait_boost; + boost_max = sg_cpu->iowait_boost_max; + if (*util * boost_max < *max * boost_util) { *util = boost_util; *max = boost_max; } - sg_cpu->iowait_boost >>= 1; } #ifdef CONFIG_NO_HZ_COMMON @@ -320,6 +345,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) delta_ns = last_freq_update_time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; + j_sg_cpu->iowait_boost_pending = false; continue; } if (j_sg_cpu->flags & SCHED_CPUFREQ_DL) From 7842de4545c71363ef599173b66ab5e1933c43e1 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sun, 23 Jul 2017 08:54:26 -0700 Subject: [PATCH 1014/3220] UPSTREAM: cpufreq: schedutil: Use unsigned int for iowait boost Make iowait_boost and iowait_boost_max as unsigned int since its unit is kHz and this is consistent with struct cpufreq_policy. Also change the local variables in sugov_iowait_boost to match this. Change-Id: I6c67ed94c57c4bdb24bada4b97045593fcb95d2e Cc: Srinivas Pandruvada Cc: Len Brown Cc: Rafael J. 
Wysocki Cc: Viresh Kumar Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Joel Fernandes --- kernel/sched/cpufreq_schedutil.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index f6b95ca15023..a0bcf488fb81 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -65,8 +65,8 @@ struct sugov_cpu { struct sugov_policy *sg_policy; bool iowait_boost_pending; - unsigned long iowait_boost; - unsigned long iowait_boost_max; + unsigned int iowait_boost; + unsigned int iowait_boost_max; u64 last_update; /* The fields below are only needed when sharing a policy. */ @@ -251,7 +251,7 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, unsigned long *max) { - unsigned long boost_util, boost_max; + unsigned int boost_util, boost_max; if (!sg_cpu->iowait_boost) return; From 9c4d6ba9981838ba4c703cf432ec52b6ca36aaa2 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 29 Aug 2017 10:36:40 +0530 Subject: [PATCH 1015/3220] android: android-base.config: enable IP6_NF_MATCH_RPFILTER USB tethering has a hard dependency on ip6t_rpfilter module now. So enable IP6_NF_MATCH_RPFILTER config to make it work again, otherwise we run into following failures when we try to enable USB tethering on Hikey: W IptablesRestoreController: iptables-restore process 1893 terminated status=256 E IptablesRestoreController: iptables error: E IptablesRestoreController: ------- COMMAND ------- E IptablesRestoreController: *raw E IptablesRestoreController: -A natctrl_raw_PREROUTING -i usb0 -m rpfilter --invert ! -s fe80::/64 -j DROP E IptablesRestoreController: COMMIT E IptablesRestoreController: E IptablesRestoreController: ------- ERROR ------- E IptablesRestoreController: ip6tables-restore: line 319 failed E IptablesRestoreController: ---------------------- E NatController: Error setting forward rules E Tethering: [usb0] ERROR Exception enabling NAT: java.lang.IllegalStateException: command '55 nat enable usb0 wlan0 1 192.168.42.0/24' failed with '400 55 Nat operation failed (No such device)' Change-Id: I4a4e192ee1c81a7a425eefee9a2a53dd41e1fa0e Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 48b2cdbe8d49..7199561cd42b 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -36,6 +36,7 @@ CONFIG_INET_XFRM_MODE_TUNNEL=y CONFIG_IP6_NF_FILTER=y CONFIG_IP6_NF_IPTABLES=y CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_MATCH_RPFILTER=y CONFIG_IP6_NF_RAW=y CONFIG_IP6_NF_TARGET_REJECT=y CONFIG_IPV6=y From 0e05bd2dc0e79d18a31bcfb2474c2ecaf7d54073 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Fri, 30 Jun 2017 10:22:23 -0700 Subject: [PATCH 1016/3220] FROMLIST: android: binder: Refactor prev and next buffer into a helper function (from https://patchwork.kernel.org/patch/9928605/) Use helper functions buffer_next and buffer_prev instead of list_entry to get the next and previous buffers. 
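The helpers simply name the container_of arithmetic that list_entry performs. The same pattern on a toy node, as a standalone sketch:

    #include <stddef.h>
    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    struct buffer {
        int id;
        struct list_head entry;
    };

    /* list_entry is container_of: back up from the embedded node
     * to the structure that contains it. */
    #define list_entry(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    static struct buffer *buffer_next(struct buffer *b)
    {
        return list_entry(b->entry.next, struct buffer, entry);
    }

    static struct buffer *buffer_prev(struct buffer *b)
    {
        return list_entry(b->entry.prev, struct buffer, entry);
    }

    int main(void)
    {
        struct buffer a = { .id = 1 }, b = { .id = 2 };

        /* Link the two nodes into a tiny circular list. */
        a.entry.next = &b.entry; a.entry.prev = &b.entry;
        b.entry.next = &a.entry; b.entry.prev = &a.entry;

        printf("%d %d\n", buffer_next(&a)->id, buffer_prev(&b)->id);
        return 0;
    }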
Bug: 36007193 Change-Id: I422dce84afde3d2138a6d976593b109a9cc49003 Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index aabfebac6e57..134c649e5a24 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -48,14 +48,23 @@ module_param_named(debug_mask, binder_alloc_debug_mask, pr_info(x); \ } while (0) +static struct binder_buffer *binder_buffer_next(struct binder_buffer *buffer) +{ + return list_entry(buffer->entry.next, struct binder_buffer, entry); +} + +static struct binder_buffer *binder_buffer_prev(struct binder_buffer *buffer) +{ + return list_entry(buffer->entry.prev, struct binder_buffer, entry); +} + static size_t binder_alloc_buffer_size(struct binder_alloc *alloc, struct binder_buffer *buffer) { if (list_is_last(&buffer->entry, &alloc->buffers)) return alloc->buffer + alloc->buffer_size - (void *)buffer->data; - return (size_t)list_entry(buffer->entry.next, - struct binder_buffer, entry) - (size_t)buffer->data; + return (size_t)binder_buffer_next(buffer) - (size_t)buffer->data; } static void binder_insert_free_buffer(struct binder_alloc *alloc, @@ -470,7 +479,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, int free_page_start = 1; BUG_ON(alloc->buffers.next == &buffer->entry); - prev = list_entry(buffer->entry.prev, struct binder_buffer, entry); + prev = binder_buffer_prev(buffer); BUG_ON(!prev->free); if (buffer_end_page(prev) == buffer_start_page(buffer)) { free_page_start = 0; @@ -482,8 +491,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, } if (!list_is_last(&buffer->entry, &alloc->buffers)) { - next = list_entry(buffer->entry.next, - struct binder_buffer, entry); + next = binder_buffer_next(buffer); if (buffer_start_page(next) == buffer_end_page(buffer)) { free_page_end = 0; if (buffer_start_page(next) == @@ -544,8 +552,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, rb_erase(&buffer->rb_node, &alloc->allocated_buffers); buffer->free = 1; if (!list_is_last(&buffer->entry, &alloc->buffers)) { - struct binder_buffer *next = list_entry(buffer->entry.next, - struct binder_buffer, entry); + struct binder_buffer *next = binder_buffer_next(buffer); if (next->free) { rb_erase(&next->rb_node, &alloc->free_buffers); @@ -553,8 +560,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, } } if (alloc->buffers.next != &buffer->entry) { - struct binder_buffer *prev = list_entry(buffer->entry.prev, - struct binder_buffer, entry); + struct binder_buffer *prev = binder_buffer_prev(buffer); if (prev->free) { binder_delete_free_buffer(alloc, buffer); From 3de14ff34cd4a21a7890e3d41f3532ab97424ae3 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Thu, 22 Jun 2017 14:37:45 -0700 Subject: [PATCH 1017/3220] FROMLIST: android: binder: Add allocator selftest (from https://patchwork.kernel.org/patch/9928609/) binder_alloc_selftest tests that alloc_new_buf handles page allocation and deallocation properly when allocating and freeing buffers. The test allocates 5 buffers of various sizes to cover all possible page alignment cases, and frees the buffers in every order from an exhaustive list of freeing orders. Test: boot the device with ANDROID_BINDER_IPC_SELFTEST config option enabled. Allocator selftest passes.
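The exhaustive-freeing-order part of the test amounts to enumerating every permutation of the buffer indices and freeing in each order. A sketch of that enumeration for a smaller N (not the driver's code):

    #include <stdio.h>

    #define N 3  /* the selftest uses 5 buffers; 3 keeps the output short */

    static void try_order(const int *order)
    {
        /* In the real test: allocate N buffers, free them in this
         * order, then verify the allocator's page accounting. */
        for (int i = 0; i < N; i++)
            printf("%d ", order[i]);
        printf("\n");
    }

    static void permute(int *order, int k)
    {
        if (k == N) {
            try_order(order);
            return;
        }
        for (int i = k; i < N; i++) {
            int t = order[k]; order[k] = order[i]; order[i] = t;
            permute(order, k + 1);
            t = order[k]; order[k] = order[i]; order[i] = t;  /* undo swap */
        }
    }

    int main(void)
    {
        int order[N] = { 0, 1, 2 };

        permute(order, 0);  /* prints all N! = 6 free orders */
        return 0;
    }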
Bug: 36007193 Change-Id: I2fe396232b7dfe4bbc50bdba99ca0de9be63cc37 Signed-off-by: Sherry Yang --- drivers/android/Kconfig | 10 + drivers/android/Makefile | 1 + drivers/android/binder.c | 2 + drivers/android/binder_alloc.h | 5 + drivers/android/binder_alloc_selftest.c | 271 ++++++++++++++++++++++++ 5 files changed, 289 insertions(+) create mode 100644 drivers/android/binder_alloc_selftest.c diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 4d4cdc1a6e25..01de42c8b74b 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -44,6 +44,16 @@ config ANDROID_BINDER_IPC_32BIT Note that enabling this will break newer Android user-space. +config ANDROID_BINDER_IPC_SELFTEST + bool "Android Binder IPC Driver Selftest" + depends on ANDROID_BINDER_IPC + ---help--- + This feature allows binder selftest to run. + + Binder selftest checks the allocation and free of binder buffers + exhaustively with combinations of various buffer sizes and + alignments. + endif # if ANDROID endmenu diff --git a/drivers/android/Makefile b/drivers/android/Makefile index 4b7c726bb560..a01254c43ee3 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -1,3 +1,4 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o +obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o diff --git a/drivers/android/binder.c b/drivers/android/binder.c index bfdd52ea0d1c..75cbbd2e986a 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -4580,6 +4580,8 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) /*pr_info("binder_ioctl: %d:%d %x %lx\n", proc->pid, current->pid, cmd, arg);*/ + binder_selftest_alloc(&proc->alloc); + trace_binder_ioctl(cmd, arg); ret = wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 088e4ffc6230..4f02cc084c15 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -102,6 +102,11 @@ struct binder_alloc { int pid; }; +#ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST +void binder_selftest_alloc(struct binder_alloc *alloc); +#else +static inline void binder_selftest_alloc(struct binder_alloc *alloc) {} +#endif extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, size_t data_size, size_t offsets_size, diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c new file mode 100644 index 000000000000..cc00ab6ee29d --- /dev/null +++ b/drivers/android/binder_alloc_selftest.c @@ -0,0 +1,271 @@ +/* binder_alloc_selftest.c + * + * Android IPC Subsystem + * + * Copyright (C) 2017 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include "binder_alloc.h" + +#define BUFFER_NUM 5 +#define BUFFER_MIN_SIZE (PAGE_SIZE / 8) + +static bool binder_selftest_run = true; +static int binder_selftest_failures; +static DEFINE_MUTEX(binder_selftest_lock); + +/** + * enum buf_end_align_type - Page alignment of a buffer + * end with regard to the end of the previous buffer. + * + * In the pictures below, buf2 refers to the buffer we + * are aligning. buf1 refers to previous buffer by addr. + * Symbol [ means the start of a buffer, ] means the end + * of a buffer, and | means page boundaries. + */ +enum buf_end_align_type { + /** + * @SAME_PAGE_UNALIGNED: The end of this buffer is on + * the same page as the end of the previous buffer and + * is not page aligned. Examples: + * buf1 ][ buf2 ][ ... + * buf1 ]|[ buf2 ][ ... + */ + SAME_PAGE_UNALIGNED = 0, + /** + * @SAME_PAGE_ALIGNED: When the end of the previous buffer + * is not page aligned, the end of this buffer is on the + * same page as the end of the previous buffer and is page + * aligned. When the previous buffer is page aligned, the + * end of this buffer is aligned to the next page boundary. + * Examples: + * buf1 ][ buf2 ]| ... + * buf1 ]|[ buf2 ]| ... + */ + SAME_PAGE_ALIGNED, + /** + * @NEXT_PAGE_UNALIGNED: The end of this buffer is on + * the page next to the end of the previous buffer and + * is not page aligned. Examples: + * buf1 ][ buf2 | buf2 ][ ... + * buf1 ]|[ buf2 | buf2 ][ ... + */ + NEXT_PAGE_UNALIGNED, + /** + * @NEXT_PAGE_ALIGNED: The end of this buffer is on + * the page next to the end of the previous buffer and + * is page aligned. Examples: + * buf1 ][ buf2 | buf2 ]| ... + * buf1 ]|[ buf2 | buf2 ]| ... + */ + NEXT_PAGE_ALIGNED, + /** + * @NEXT_NEXT_UNALIGNED: The end of this buffer is on + * the page that follows the page after the end of the + * previous buffer and is not page aligned. Examples: + * buf1 ][ buf2 | buf2 | buf2 ][ ... + * buf1 ]|[ buf2 | buf2 | buf2 ][ ... 
+ */ + NEXT_NEXT_UNALIGNED, + LOOP_END, +}; + +static void pr_err_size_seq(size_t *sizes, int *seq) +{ + int i; + + pr_err("alloc sizes: "); + for (i = 0; i < BUFFER_NUM; i++) + pr_cont("[%zu]", sizes[i]); + pr_cont("\n"); + pr_err("free seq: "); + for (i = 0; i < BUFFER_NUM; i++) + pr_cont("[%d]", seq[i]); + pr_cont("\n"); +} + +static bool check_buffer_pages_allocated(struct binder_alloc *alloc, + struct binder_buffer *buffer, + size_t size) +{ + void *page_addr, *end; + int page_index; + + end = (void *)PAGE_ALIGN((uintptr_t)buffer + size); + for (page_addr = buffer; page_addr < end; page_addr += PAGE_SIZE) { + page_index = (page_addr - alloc->buffer) / PAGE_SIZE; + if (!alloc->pages[page_index]) { + pr_err("incorrect alloc state at page index %d\n", + page_index); + return false; + } + } + return true; +} + +static void binder_selftest_alloc_buf(struct binder_alloc *alloc, + struct binder_buffer *buffers[], + size_t *sizes, int *seq) +{ + int i; + + for (i = 0; i < BUFFER_NUM; i++) { + buffers[i] = binder_alloc_new_buf(alloc, sizes[i], 0, 0, 0); + if (IS_ERR(buffers[i]) || + !check_buffer_pages_allocated(alloc, buffers[i], + sizes[i])) { + pr_err_size_seq(sizes, seq); + binder_selftest_failures++; + } + } +} + +static void binder_selftest_free_buf(struct binder_alloc *alloc, + struct binder_buffer *buffers[], + size_t *sizes, int *seq) +{ + int i; + + for (i = 0; i < BUFFER_NUM; i++) + binder_alloc_free_buf(alloc, buffers[seq[i]]); + + for (i = 0; i < (alloc->buffer_size / PAGE_SIZE); i++) { + if ((!alloc->pages[i]) == (i == 0)) { + pr_err("incorrect free state at page index %d\n", i); + binder_selftest_failures++; + } + } +} + +static void binder_selftest_alloc_free(struct binder_alloc *alloc, + size_t *sizes, int *seq) +{ + struct binder_buffer *buffers[BUFFER_NUM]; + + binder_selftest_alloc_buf(alloc, buffers, sizes, seq); + binder_selftest_free_buf(alloc, buffers, sizes, seq); +} + +static bool is_dup(int *seq, int index, int val) +{ + int i; + + for (i = 0; i < index; i++) { + if (seq[i] == val) + return true; + } + return false; +} + +/* Generate BUFFER_NUM factorial free orders. */ +static void binder_selftest_free_seq(struct binder_alloc *alloc, + size_t *sizes, int *seq, int index) +{ + int i; + + if (index == BUFFER_NUM) { + binder_selftest_alloc_free(alloc, sizes, seq); + return; + } + for (i = 0; i < BUFFER_NUM; i++) { + if (is_dup(seq, index, i)) + continue; + seq[index] = i; + binder_selftest_free_seq(alloc, sizes, seq, index + 1); + } +} + +static void binder_selftest_alloc_size(struct binder_alloc *alloc, + size_t *end_offset) +{ + int i; + int seq[BUFFER_NUM] = {0}; + size_t front_sizes[BUFFER_NUM]; + size_t back_sizes[BUFFER_NUM]; + size_t last_offset, offset = 0; + + for (i = 0; i < BUFFER_NUM; i++) { + last_offset = offset; + offset = end_offset[i]; + front_sizes[i] = offset - last_offset; + back_sizes[BUFFER_NUM - i - 1] = front_sizes[i]; + } + /* + * Buffers share the first or last few pages. + * Only BUFFER_NUM - 1 buffer sizes are adjustable since + * we need one giant buffer before getting to the last page. 
+ */ + back_sizes[0] += alloc->buffer_size - end_offset[BUFFER_NUM - 1] + - sizeof(struct binder_buffer) * BUFFER_NUM; + binder_selftest_free_seq(alloc, front_sizes, seq, 0); + binder_selftest_free_seq(alloc, back_sizes, seq, 0); +} + +static void binder_selftest_alloc_offset(struct binder_alloc *alloc, + size_t *end_offset, int index) +{ + int align; + size_t end, prev; + + if (index == BUFFER_NUM) { + binder_selftest_alloc_size(alloc, end_offset); + return; + } + prev = index == 0 ? 0 : end_offset[index - 1]; + end = prev; + + BUILD_BUG_ON((BUFFER_MIN_SIZE + sizeof(struct binder_buffer)) + * BUFFER_NUM >= PAGE_SIZE); + + for (align = SAME_PAGE_UNALIGNED; align < LOOP_END; align++) { + if (align % 2) + end = ALIGN(end, PAGE_SIZE); + else + end += BUFFER_MIN_SIZE; + end_offset[index] = end; + binder_selftest_alloc_offset(alloc, end_offset, index + 1); + } +} + +/** + * binder_selftest_alloc() - Test alloc and free of buffer pages. + * @alloc: Pointer to alloc struct. + * + * Allocate BUFFER_NUM buffers to cover all page alignment cases, + * then free them in all orders possible. Check that pages are + * allocated after buffer alloc and freed after freeing buffer. + */ +void binder_selftest_alloc(struct binder_alloc *alloc) +{ + size_t end_offset[BUFFER_NUM]; + + if (!binder_selftest_run) + return; + mutex_lock(&binder_selftest_lock); + if (!binder_selftest_run || !alloc->vma) + goto done; + pr_info("STARTED\n"); + binder_selftest_alloc_offset(alloc, end_offset, 0); + binder_selftest_run = false; + if (binder_selftest_failures > 0) + pr_info("%d tests FAILED\n", binder_selftest_failures); + else + pr_info("PASSED\n"); + +done: + mutex_unlock(&binder_selftest_lock); +} From 7a6d4b157e1c046f4c4de9f2f1f1f7abfa6abb21 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Thu, 3 Aug 2017 11:33:53 -0700 Subject: [PATCH 1018/3220] FROMLIST: android: binder: Move buffer out of area shared with user space (from https://patchwork.kernel.org/patch/9928607/) The binder driver allocates buffer metadata in a region that is mapped in user space, and this metadata contains kernel pointers. This patch allocates the buffer metadata on the kernel heap, which is not mapped in user space, and uses a pointer to refer to the mapped data. Also move the alloc->buffers initialization from mmap to init, since the list is now used even when mmap failed or was not called.
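A schematic of the layout change, as a sketch only (field lists are abbreviated, the _old/_new struct names are invented for illustration, and the kernel's <linux/list.h> and <linux/rbtree.h> types are assumed):

	#include <linux/list.h>
	#include <linux/rbtree.h>
	#include <linux/types.h>

	/* Before: metadata inlined at the head of the user-mappable buffer. */
	struct binder_buffer_old {
		struct list_head entry;		/* kernel list pointers ... */
		struct rb_node rb_node;		/* ... visible in the mapped pages */
		size_t data_size;
		uint8_t data[0];		/* payload follows in the same pages */
	};

	/* After: metadata kzalloc'd on the kernel heap, payload by pointer. */
	struct binder_buffer_new {
		struct list_head entry;
		struct rb_node rb_node;
		size_t data_size;
		void *data;			/* points into the mmap'd area */
	};

With the old layout, kernel pointer values were exposed to user space through the mapping; with the new one, user space only ever sees payload bytes.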
Bug: 36007193 Change-Id: Id5136048bdb7b796f59de066de7ea7df410498f5 Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 146 ++++++++++++++---------- drivers/android/binder_alloc.h | 2 +- drivers/android/binder_alloc_selftest.c | 11 +- 3 files changed, 91 insertions(+), 68 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 134c649e5a24..2ca2b02f82bb 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -62,9 +62,9 @@ static size_t binder_alloc_buffer_size(struct binder_alloc *alloc, struct binder_buffer *buffer) { if (list_is_last(&buffer->entry, &alloc->buffers)) - return alloc->buffer + - alloc->buffer_size - (void *)buffer->data; - return (size_t)binder_buffer_next(buffer) - (size_t)buffer->data; + return (u8 *)alloc->buffer + + alloc->buffer_size - (u8 *)buffer->data; + return (u8 *)binder_buffer_next(buffer)->data - (u8 *)buffer->data; } static void binder_insert_free_buffer(struct binder_alloc *alloc, @@ -114,9 +114,9 @@ static void binder_insert_allocated_buffer_locked( buffer = rb_entry(parent, struct binder_buffer, rb_node); BUG_ON(buffer->free); - if (new_buffer < buffer) + if (new_buffer->data < buffer->data) p = &parent->rb_left; - else if (new_buffer > buffer) + else if (new_buffer->data > buffer->data) p = &parent->rb_right; else BUG(); @@ -131,18 +131,17 @@ static struct binder_buffer *binder_alloc_prepare_to_free_locked( { struct rb_node *n = alloc->allocated_buffers.rb_node; struct binder_buffer *buffer; - struct binder_buffer *kern_ptr; + void *kern_ptr; - kern_ptr = (struct binder_buffer *)(user_ptr - alloc->user_buffer_offset - - offsetof(struct binder_buffer, data)); + kern_ptr = (void *)(user_ptr - alloc->user_buffer_offset); while (n) { buffer = rb_entry(n, struct binder_buffer, rb_node); BUG_ON(buffer->free); - if (kern_ptr < buffer) + if (kern_ptr < buffer->data) n = n->rb_left; - else if (kern_ptr > buffer) + else if (kern_ptr > buffer->data) n = n->rb_right; else { /* @@ -330,6 +329,9 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, return ERR_PTR(-ENOSPC); } + /* Pad 0-size buffers so they get assigned unique addresses */ + size = max(size, sizeof(void *)); + while (n) { buffer = rb_entry(n, struct binder_buffer, rb_node); BUG_ON(!buffer->free); @@ -389,14 +391,9 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, has_page_addr = (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK); - if (n == NULL) { - if (size + sizeof(struct binder_buffer) + 4 >= buffer_size) - buffer_size = size; /* no room for other buffers */ - else - buffer_size = size + sizeof(struct binder_buffer); - } + WARN_ON(n && buffer_size != size); end_page_addr = - (void *)PAGE_ALIGN((uintptr_t)buffer->data + buffer_size); + (void *)PAGE_ALIGN((uintptr_t)buffer->data + size); if (end_page_addr > has_page_addr) end_page_addr = has_page_addr; ret = binder_update_page_range(alloc, 1, @@ -404,17 +401,25 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, if (ret) return ERR_PTR(ret); - rb_erase(best_fit, &alloc->free_buffers); - buffer->free = 0; - buffer->free_in_progress = 0; - binder_insert_allocated_buffer_locked(alloc, buffer); if (buffer_size != size) { - struct binder_buffer *new_buffer = (void *)buffer->data + size; + struct binder_buffer *new_buffer; + new_buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); + if (!new_buffer) { + pr_err("%s: %d failed to alloc new buffer struct\n", + __func__, alloc->pid); + goto 
err_alloc_buf_struct_failed; + } + new_buffer->data = (u8 *)buffer->data + size; list_add(&new_buffer->entry, &buffer->entry); new_buffer->free = 1; binder_insert_free_buffer(alloc, new_buffer); } + + rb_erase(best_fit, &alloc->free_buffers); + buffer->free = 0; + buffer->free_in_progress = 0; + binder_insert_allocated_buffer_locked(alloc, buffer); binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: binder_alloc_buf size %zd got %pK\n", alloc->pid, size, buffer); @@ -429,6 +434,12 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, alloc->pid, size, alloc->free_async_space); } return buffer; + +err_alloc_buf_struct_failed: + binder_update_page_range(alloc, 0, + (void *)PAGE_ALIGN((uintptr_t)buffer->data), + end_page_addr, NULL); + return ERR_PTR(-ENOMEM); } /** @@ -463,56 +474,59 @@ struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, static void *buffer_start_page(struct binder_buffer *buffer) { - return (void *)((uintptr_t)buffer & PAGE_MASK); + return (void *)((uintptr_t)buffer->data & PAGE_MASK); } -static void *buffer_end_page(struct binder_buffer *buffer) +static void *prev_buffer_end_page(struct binder_buffer *buffer) { - return (void *)(((uintptr_t)(buffer + 1) - 1) & PAGE_MASK); + return (void *)(((uintptr_t)(buffer->data) - 1) & PAGE_MASK); } static void binder_delete_free_buffer(struct binder_alloc *alloc, struct binder_buffer *buffer) { struct binder_buffer *prev, *next = NULL; - int free_page_end = 1; - int free_page_start = 1; - + bool to_free = true; BUG_ON(alloc->buffers.next == &buffer->entry); prev = binder_buffer_prev(buffer); BUG_ON(!prev->free); - if (buffer_end_page(prev) == buffer_start_page(buffer)) { - free_page_start = 0; - if (buffer_end_page(prev) == buffer_end_page(buffer)) - free_page_end = 0; + if (prev_buffer_end_page(prev) == buffer_start_page(buffer)) { + to_free = false; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %pK share page with %pK\n", - alloc->pid, buffer, prev); + "%d: merge free, buffer %pK share page with %pK\n", + alloc->pid, buffer->data, prev->data); } if (!list_is_last(&buffer->entry, &alloc->buffers)) { next = binder_buffer_next(buffer); - if (buffer_start_page(next) == buffer_end_page(buffer)) { - free_page_end = 0; - if (buffer_start_page(next) == - buffer_start_page(buffer)) - free_page_start = 0; + if (buffer_start_page(next) == buffer_start_page(buffer)) { + to_free = false; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %pK share page with %pK\n", - alloc->pid, buffer, prev); + "%d: merge free, buffer %pK share page with %pK\n", + alloc->pid, + buffer->data, + next->data); } } - list_del(&buffer->entry); - if (free_page_start || free_page_end) { + + if (PAGE_ALIGNED(buffer->data)) { binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %pK do not share page%s%s with %pK or %pK\n", - alloc->pid, buffer, free_page_start ? "" : " end", - free_page_end ? "" : " start", prev, next); - binder_update_page_range(alloc, 0, free_page_start ? - buffer_start_page(buffer) : buffer_end_page(buffer), - (free_page_end ? 
buffer_end_page(buffer) : - buffer_start_page(buffer)) + PAGE_SIZE, NULL); + "%d: merge free, buffer start %pK is page aligned\n", + alloc->pid, buffer->data); + to_free = false; } + + if (to_free) { + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %pK do not share page with %pK or %pK\n", + alloc->pid, buffer->data, + prev->data, next->data); + binder_update_page_range(alloc, 0, buffer_start_page(buffer), + buffer_start_page(buffer) + PAGE_SIZE, + NULL); + } + list_del(&buffer->entry); + kfree(buffer); } static void binder_free_buf_locked(struct binder_alloc *alloc, @@ -533,8 +547,8 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, BUG_ON(buffer->free); BUG_ON(size > buffer_size); BUG_ON(buffer->transaction != NULL); - BUG_ON((void *)buffer < alloc->buffer); - BUG_ON((void *)buffer > alloc->buffer + alloc->buffer_size); + BUG_ON(buffer->data < alloc->buffer); + BUG_ON(buffer->data > alloc->buffer + alloc->buffer_size); if (buffer->async_transaction) { alloc->free_async_space += size + sizeof(struct binder_buffer); @@ -646,14 +660,14 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, } alloc->buffer_size = vma->vm_end - vma->vm_start; - if (binder_update_page_range(alloc, 1, alloc->buffer, - alloc->buffer + PAGE_SIZE, vma)) { + buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); + if (!buffer) { ret = -ENOMEM; - failure_string = "alloc small buf"; - goto err_alloc_small_buf_failed; + failure_string = "alloc buffer struct"; + goto err_alloc_buf_struct_failed; } - buffer = alloc->buffer; - INIT_LIST_HEAD(&alloc->buffers); + + buffer->data = alloc->buffer; list_add(&buffer->entry, &alloc->buffers); buffer->free = 1; binder_insert_free_buffer(alloc, buffer); @@ -664,7 +678,7 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, return 0; -err_alloc_small_buf_failed: +err_alloc_buf_struct_failed: kfree(alloc->pages); alloc->pages = NULL; err_alloc_pages_failed: @@ -684,14 +698,13 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) { struct rb_node *n; int buffers, page_count; + struct binder_buffer *buffer; BUG_ON(alloc->vma); buffers = 0; mutex_lock(&alloc->mutex); while ((n = rb_first(&alloc->allocated_buffers))) { - struct binder_buffer *buffer; - buffer = rb_entry(n, struct binder_buffer, rb_node); /* Transaction should already have been freed */ @@ -701,6 +714,16 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) buffers++; } + while (!list_empty(&alloc->buffers)) { + buffer = list_first_entry(&alloc->buffers, + struct binder_buffer, entry); + WARN_ON(!buffer->free); + + list_del(&buffer->entry); + WARN_ON_ONCE(!list_empty(&alloc->buffers)); + kfree(buffer); + } + page_count = 0; if (alloc->pages) { int i; @@ -804,5 +827,6 @@ void binder_alloc_init(struct binder_alloc *alloc) alloc->tsk = current->group_leader; alloc->pid = current->group_leader->pid; mutex_init(&alloc->mutex); + INIT_LIST_HEAD(&alloc->buffers); } diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 4f02cc084c15..dd5649bf6469 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -57,7 +57,7 @@ struct binder_buffer { size_t data_size; size_t offsets_size; size_t extra_buffers_size; - uint8_t data[0]; + void *data; }; /** diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c index cc00ab6ee29d..0bf72079a9da 100644 --- a/drivers/android/binder_alloc_selftest.c +++ b/drivers/android/binder_alloc_selftest.c @@ -105,8 +105,9 @@ static bool 
check_buffer_pages_allocated(struct binder_alloc *alloc, void *page_addr, *end; int page_index; - end = (void *)PAGE_ALIGN((uintptr_t)buffer + size); - for (page_addr = buffer; page_addr < end; page_addr += PAGE_SIZE) { + end = (void *)PAGE_ALIGN((uintptr_t)buffer->data + size); + page_addr = buffer->data; + for (; page_addr < end; page_addr += PAGE_SIZE) { page_index = (page_addr - alloc->buffer) / PAGE_SIZE; if (!alloc->pages[page_index]) { pr_err("incorrect alloc state at page index %d\n", @@ -209,8 +210,7 @@ static void binder_selftest_alloc_size(struct binder_alloc *alloc, * Only BUFFER_NUM - 1 buffer sizes are adjustable since * we need one giant buffer before getting to the last page. */ - back_sizes[0] += alloc->buffer_size - end_offset[BUFFER_NUM - 1] - - sizeof(struct binder_buffer) * BUFFER_NUM; + back_sizes[0] += alloc->buffer_size - end_offset[BUFFER_NUM - 1]; binder_selftest_free_seq(alloc, front_sizes, seq, 0); binder_selftest_free_seq(alloc, back_sizes, seq, 0); } @@ -228,8 +228,7 @@ static void binder_selftest_alloc_offset(struct binder_alloc *alloc, prev = index == 0 ? 0 : end_offset[index - 1]; end = prev; - BUILD_BUG_ON((BUFFER_MIN_SIZE + sizeof(struct binder_buffer)) - * BUFFER_NUM >= PAGE_SIZE); + BUILD_BUG_ON(BUFFER_MIN_SIZE * BUFFER_NUM >= PAGE_SIZE); for (align = SAME_PAGE_UNALIGNED; align < LOOP_END; align++) { if (align % 2) From f73e8e762516ed18aa84084c1c2e7ddde3c2213a Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Sat, 29 Jul 2017 13:24:11 -0700 Subject: [PATCH 1019/3220] FROMLIST: android: binder: Add global lru shrinker to binder (from https://patchwork.kernel.org/patch/9928615/) Hold on to the pages allocated and mapped for transaction buffers until the system is under memory pressure. When that happens, use the Linux shrinker to free pages. Without the shrinker, the patch "android: binder: Move buffer out of area shared with user space" would cause a significant slowdown for small transactions that fit into the first page, because the free-list buffer header used to be inlined with the buffer data. In addition to preventing the performance regression for small transactions, this patch improves the performance of transactions that take up more than one page. Modify the alloc selftest to work with the shrinker change. Test: Run memory intensive applications (Chrome and Camera) to trigger shrinker callbacks. Binder frees memory as expected. Test: Run binderThroughputTest with high memory pressure option enabled.
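The wiring follows the kernel's standard two-callback shrinker pattern paired with a list_lru. Below is a distilled sketch under the assumption of this kernel era's APIs, with hypothetical my_* names and the locking simplified away; the real isolate callback, binder_alloc_free_page(), appears in the diff that follows:

	#include <linux/list_lru.h>
	#include <linux/shrinker.h>

	static struct list_lru my_lru;	/* holds objects that can be reclaimed */

	/* Hypothetical isolate callback: free the object backing @item and
	 * drop it from the lru (the real driver also takes trylocks here). */
	static enum lru_status my_free_one(struct list_head *item,
					   struct list_lru_one *lru,
					   spinlock_t *lock, void *cb_arg)
	{
		list_lru_isolate(lru, item);
		return LRU_REMOVED;
	}

	/* Tell the VM how many objects could be reclaimed right now. */
	static unsigned long my_count(struct shrinker *s,
				      struct shrink_control *sc)
	{
		return list_lru_count(&my_lru);
	}

	/* Under memory pressure, free up to sc->nr_to_scan objects. */
	static unsigned long my_scan(struct shrinker *s,
				     struct shrink_control *sc)
	{
		return list_lru_walk(&my_lru, my_free_one, NULL, sc->nr_to_scan);
	}

	static struct shrinker my_shrinker = {
		.count_objects = my_count,
		.scan_objects = my_scan,
		.seeks = DEFAULT_SEEKS,
	};

	/* At init: list_lru_init(&my_lru); register_shrinker(&my_shrinker); */

register_shrinker() hooks the callbacks into the VM's reclaim path, so pages parked on the lru cost nothing until memory actually becomes tight.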
Bug: 63926541 Change-Id: I3abfc43b405e7e0a6228da37e0689a4b944f0e00 Signed-off-by: Sherry Yang --- drivers/android/binder.c | 2 + drivers/android/binder_alloc.c | 173 ++++++++++++++++++++---- drivers/android/binder_alloc.h | 23 +++- drivers/android/binder_alloc_selftest.c | 68 ++++++++-- 4 files changed, 226 insertions(+), 40 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 75cbbd2e986a..3ce96db8e5d4 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -5623,6 +5623,8 @@ static int __init binder_init(void) struct binder_device *device; struct hlist_node *tmp; + binder_alloc_shrinker_init(); + atomic_set(&binder_transaction_log.cur, ~0U); atomic_set(&binder_transaction_log_failed.cur, ~0U); binder_deferred_workqueue = create_singlethread_workqueue("binder"); diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 2ca2b02f82bb..bcdaf684a658 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -27,9 +27,12 @@ #include #include #include +#include #include "binder_alloc.h" #include "binder_trace.h" +struct list_lru binder_alloc_lru; + static DEFINE_MUTEX(binder_alloc_mmap_lock); enum { @@ -188,8 +191,9 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, { void *page_addr; unsigned long user_page_addr; - struct page **page; - struct mm_struct *mm; + struct binder_lru_page *page; + struct mm_struct *mm = NULL; + bool need_mm = false; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: %s pages %pK-%pK\n", alloc->pid, @@ -200,9 +204,18 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, trace_binder_update_page_range(alloc, allocate, start, end); - if (vma) - mm = NULL; - else + if (allocate == 0) + goto free_range; + + for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { + page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; + if (!page->page_ptr) { + need_mm = true; + break; + } + } + + if (!vma && need_mm) mm = get_task_mm(alloc->tsk); if (mm) { @@ -215,10 +228,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (allocate == 0) - goto free_range; - - if (vma == NULL) { + if (!vma && need_mm) { pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n", alloc->pid); goto err_no_vma; @@ -226,18 +236,33 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { int ret; + bool on_lru; page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; - BUG_ON(*page); - *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); - if (*page == NULL) { + if (page->page_ptr) { + on_lru = list_lru_del(&binder_alloc_lru, &page->lru); + WARN_ON(!on_lru); + continue; + } + + if (WARN_ON(!vma)) + goto err_page_ptr_cleared; + + page->page_ptr = alloc_page(GFP_KERNEL | + __GFP_HIGHMEM | + __GFP_ZERO); + if (!page->page_ptr) { pr_err("%d: binder_alloc_buf failed for page at %pK\n", alloc->pid, page_addr); goto err_alloc_page_failed; } + page->alloc = alloc; + INIT_LIST_HEAD(&page->lru); + ret = map_kernel_range_noflush((unsigned long)page_addr, - PAGE_SIZE, PAGE_KERNEL, page); + PAGE_SIZE, PAGE_KERNEL, + &page->page_ptr); flush_cache_vmap((unsigned long)page_addr, (unsigned long)page_addr + PAGE_SIZE); if (ret != 1) { @@ -247,7 +272,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } user_page_addr = (uintptr_t)page_addr + alloc->user_buffer_offset; 
- ret = vm_insert_page(vma, user_page_addr, page[0]); + ret = vm_insert_page(vma, user_page_addr, page[0].page_ptr); if (ret) { pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n", alloc->pid, user_page_addr); @@ -264,16 +289,21 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, free_range: for (page_addr = end - PAGE_SIZE; page_addr >= start; page_addr -= PAGE_SIZE) { + bool ret; + page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; - if (vma) - zap_page_range(vma, (uintptr_t)page_addr + - alloc->user_buffer_offset, PAGE_SIZE, NULL); + + ret = list_lru_add(&binder_alloc_lru, &page->lru); + WARN_ON(!ret); + continue; + err_vm_insert_page_failed: unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); err_map_kernel_failed: - __free_page(*page); - *page = NULL; + __free_page(page->page_ptr); + page->page_ptr = NULL; err_alloc_page_failed: +err_page_ptr_cleared: ; } err_no_vma: @@ -730,16 +760,20 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { void *page_addr; + bool on_lru; - if (!alloc->pages[i]) + if (!alloc->pages[i].page_ptr) continue; + on_lru = list_lru_del(&binder_alloc_lru, + &alloc->pages[i].lru); page_addr = alloc->buffer + i * PAGE_SIZE; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%s: %d: page %d at %pK not freed\n", - __func__, alloc->pid, i, page_addr); + "%s: %d: page %d at %pK %s\n", + __func__, alloc->pid, i, page_addr, + on_lru ? "on lru" : "active"); unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); - __free_page(alloc->pages[i]); + __free_page(alloc->pages[i].page_ptr); page_count++; } kfree(alloc->pages); @@ -815,6 +849,94 @@ void binder_alloc_vma_close(struct binder_alloc *alloc) WRITE_ONCE(alloc->vma_vm_mm, NULL); } +/** + * binder_alloc_free_page() - shrinker callback to free pages + * @item: item to free + * @lock: lock protecting the item + * @cb_arg: callback argument + * + * Called from list_lru_walk() in binder_shrink_scan() to free + * up pages when the system is under memory pressure. 
+ */ +enum lru_status binder_alloc_free_page(struct list_head *item, + struct list_lru_one *lru, + spinlock_t *lock, + void *cb_arg) +{ + struct mm_struct *mm = NULL; + struct binder_lru_page *page = container_of(item, + struct binder_lru_page, + lru); + struct binder_alloc *alloc; + uintptr_t page_addr; + size_t index; + + alloc = page->alloc; + if (!mutex_trylock(&alloc->mutex)) + goto err_get_alloc_mutex_failed; + + if (!page->page_ptr) + goto err_page_already_freed; + + index = page - alloc->pages; + page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; + if (alloc->vma) { + mm = get_task_mm(alloc->tsk); + if (!mm) + goto err_get_task_mm_failed; + if (!down_write_trylock(&mm->mmap_sem)) + goto err_down_write_mmap_sem_failed; + + zap_page_range(alloc->vma, + page_addr + + alloc->user_buffer_offset, + PAGE_SIZE, NULL); + + up_write(&mm->mmap_sem); + mmput(mm); + } + + unmap_kernel_range(page_addr, PAGE_SIZE); + __free_page(page->page_ptr); + page->page_ptr = NULL; + + list_lru_isolate(lru, item); + + mutex_unlock(&alloc->mutex); + return LRU_REMOVED; + +err_down_write_mmap_sem_failed: + mmput(mm); +err_get_task_mm_failed: +err_page_already_freed: + mutex_unlock(&alloc->mutex); +err_get_alloc_mutex_failed: + return LRU_SKIP; +} + +static unsigned long +binder_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long ret = list_lru_count(&binder_alloc_lru); + return ret; +} + +static unsigned long +binder_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long ret; + + ret = list_lru_walk(&binder_alloc_lru, binder_alloc_free_page, + NULL, sc->nr_to_scan); + return ret; +} + +struct shrinker binder_shrinker = { + .count_objects = binder_shrink_count, + .scan_objects = binder_shrink_scan, + .seeks = DEFAULT_SEEKS, +}; + /** * binder_alloc_init() - called by binder_open() for per-proc initialization * @alloc: binder_alloc for this proc @@ -830,3 +952,8 @@ void binder_alloc_init(struct binder_alloc *alloc) INIT_LIST_HEAD(&alloc->buffers); } +void binder_alloc_shrinker_init(void) +{ + list_lru_init(&binder_alloc_lru); + register_shrinker(&binder_shrinker); +} diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index dd5649bf6469..fa707cc63393 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -21,7 +21,9 @@ #include #include #include +#include +extern struct list_lru binder_alloc_lru; struct binder_transaction; /** @@ -60,6 +62,18 @@ struct binder_buffer { void *data; }; +/** + * struct binder_lru_page - page object used for binder shrinker + * @page_ptr: pointer to physical page in mmap'd space + * @lru: entry in binder_alloc_lru + * @alloc: binder_alloc for a proc + */ +struct binder_lru_page { + struct list_head lru; + struct page *page_ptr; + struct binder_alloc *alloc; +}; + /** * struct binder_alloc - per-binder proc state for binder allocator * @vma: vm_area_struct passed to mmap_handler @@ -75,8 +89,7 @@ struct binder_buffer { * @allocated_buffers: rb tree of allocated buffers sorted by address * @free_async_space: VA space available for async buffers. 
This is * initialized at mmap time to 1/2 the full VA space - * @pages: array of physical page addresses for each - * page of mmap'd space + * @pages: array of binder_lru_page * @buffer_size: size of address space specified via mmap * @pid: pid for associated binder_proc (invariant after init) * @@ -96,7 +109,7 @@ struct binder_alloc { struct rb_root free_buffers; struct rb_root allocated_buffers; size_t free_async_space; - struct page **pages; + struct binder_lru_page *pages; size_t buffer_size; uint32_t buffer_free; int pid; @@ -107,12 +120,16 @@ void binder_selftest_alloc(struct binder_alloc *alloc); #else static inline void binder_selftest_alloc(struct binder_alloc *alloc) {} #endif +enum lru_status binder_alloc_free_page(struct list_head *item, + struct list_lru_one *lru, + spinlock_t *lock, void *cb_arg); extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, size_t data_size, size_t offsets_size, size_t extra_buffers_size, int is_async); extern void binder_alloc_init(struct binder_alloc *alloc); +void binder_alloc_shrinker_init(void); extern void binder_alloc_vma_close(struct binder_alloc *alloc); extern struct binder_buffer * binder_alloc_prepare_to_free(struct binder_alloc *alloc, diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c index 0bf72079a9da..8bd7bcef967d 100644 --- a/drivers/android/binder_alloc_selftest.c +++ b/drivers/android/binder_alloc_selftest.c @@ -109,9 +109,11 @@ static bool check_buffer_pages_allocated(struct binder_alloc *alloc, page_addr = buffer->data; for (; page_addr < end; page_addr += PAGE_SIZE) { page_index = (page_addr - alloc->buffer) / PAGE_SIZE; - if (!alloc->pages[page_index]) { - pr_err("incorrect alloc state at page index %d\n", - page_index); + if (!alloc->pages[page_index].page_ptr || + !list_empty(&alloc->pages[page_index].lru)) { + pr_err("expect alloc but is %s at page index %d\n", + alloc->pages[page_index].page_ptr ? + "lru" : "free", page_index); return false; } } @@ -137,28 +139,63 @@ static void binder_selftest_alloc_buf(struct binder_alloc *alloc, static void binder_selftest_free_buf(struct binder_alloc *alloc, struct binder_buffer *buffers[], - size_t *sizes, int *seq) + size_t *sizes, int *seq, size_t end) { int i; for (i = 0; i < BUFFER_NUM; i++) binder_alloc_free_buf(alloc, buffers[seq[i]]); + for (i = 0; i < end / PAGE_SIZE; i++) { + /** + * Error message on a free page can be false positive + * if binder shrinker ran during binder_alloc_free_buf + * calls above. + */ + if (list_empty(&alloc->pages[i].lru)) { + pr_err_size_seq(sizes, seq); + pr_err("expect lru but is %s at page index %d\n", + alloc->pages[i].page_ptr ? "alloc" : "free", i); + binder_selftest_failures++; + } + } +} + +static void binder_selftest_free_page(struct binder_alloc *alloc) +{ + int i; + unsigned long count; + + while ((count = list_lru_count(&binder_alloc_lru))) { + list_lru_walk(&binder_alloc_lru, binder_alloc_free_page, + NULL, count); + } + for (i = 0; i < (alloc->buffer_size / PAGE_SIZE); i++) { - if ((!alloc->pages[i]) == (i == 0)) { - pr_err("incorrect free state at page index %d\n", i); + if (alloc->pages[i].page_ptr) { + pr_err("expect free but is %s at page index %d\n", + list_empty(&alloc->pages[i].lru) ? 
+ "alloc" : "lru", i); binder_selftest_failures++; } } } static void binder_selftest_alloc_free(struct binder_alloc *alloc, - size_t *sizes, int *seq) + size_t *sizes, int *seq, size_t end) { struct binder_buffer *buffers[BUFFER_NUM]; binder_selftest_alloc_buf(alloc, buffers, sizes, seq); - binder_selftest_free_buf(alloc, buffers, sizes, seq); + binder_selftest_free_buf(alloc, buffers, sizes, seq, end); + + /* Allocate from lru. */ + binder_selftest_alloc_buf(alloc, buffers, sizes, seq); + if (list_lru_count(&binder_alloc_lru)) + pr_err("lru list should be empty but is not\n"); + + binder_selftest_free_buf(alloc, buffers, sizes, seq, end); + binder_selftest_free_page(alloc); } static bool is_dup(int *seq, int index, int val) @@ -174,19 +211,20 @@ static bool is_dup(int *seq, int index, int val) /* Generate BUFFER_NUM factorial free orders. */ static void binder_selftest_free_seq(struct binder_alloc *alloc, - size_t *sizes, int *seq, int index) + size_t *sizes, int *seq, + int index, size_t end) { int i; if (index == BUFFER_NUM) { - binder_selftest_alloc_free(alloc, sizes, seq); + binder_selftest_alloc_free(alloc, sizes, seq, end); return; } for (i = 0; i < BUFFER_NUM; i++) { if (is_dup(seq, index, i)) continue; seq[index] = i; - binder_selftest_free_seq(alloc, sizes, seq, index + 1); + binder_selftest_free_seq(alloc, sizes, seq, index + 1, end); } } @@ -211,8 +249,9 @@ static void binder_selftest_alloc_size(struct binder_alloc *alloc, * we need one giant buffer before getting to the last page. */ back_sizes[0] += alloc->buffer_size - end_offset[BUFFER_NUM - 1]; - binder_selftest_free_seq(alloc, front_sizes, seq, 0); - binder_selftest_free_seq(alloc, back_sizes, seq, 0); + binder_selftest_free_seq(alloc, front_sizes, seq, 0, + end_offset[BUFFER_NUM - 1]); + binder_selftest_free_seq(alloc, back_sizes, seq, 0, alloc->buffer_size); } static void binder_selftest_alloc_offset(struct binder_alloc *alloc, @@ -246,7 +285,8 @@ static void binder_selftest_alloc_offset(struct binder_alloc *alloc, * * Allocate BUFFER_NUM buffers to cover all page alignment cases, * then free them in all orders possible. Check that pages are - * allocated after buffer alloc and freed after freeing buffer. + * correctly allocated, put onto lru when buffers are freed, and + * are freed when binder_alloc_free_page is called. */ void binder_selftest_alloc(struct binder_alloc *alloc) { From 850d57dceae07291ec4bf6c55e7a6433bbe7bf30 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Wed, 2 Aug 2017 14:02:37 -0700 Subject: [PATCH 1020/3220] FROMLIST: android: binder: Add shrinker tracepoints (from https://patchwork.kernel.org/patch/9928613/) Add tracepoints in binder transaction allocator to record lru hits and alloc/free page. 
Bug: 63926541 Change-Id: I2e24fe8e7b6534349df4a87ff865a6843ac9a30b Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 27 +++++++++++++++-- drivers/android/binder_trace.h | 55 ++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index bcdaf684a658..28dc76473ff6 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -237,18 +237,25 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { int ret; bool on_lru; + size_t index; - page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; + index = (page_addr - alloc->buffer) / PAGE_SIZE; + page = &alloc->pages[index]; if (page->page_ptr) { + trace_binder_alloc_lru_start(alloc, index); + on_lru = list_lru_del(&binder_alloc_lru, &page->lru); WARN_ON(!on_lru); + + trace_binder_alloc_lru_end(alloc, index); continue; } if (WARN_ON(!vma)) goto err_page_ptr_cleared; + trace_binder_alloc_page_start(alloc, index); page->page_ptr = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); @@ -278,6 +285,8 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, alloc->pid, user_page_addr); goto err_vm_insert_page_failed; } + + trace_binder_alloc_page_end(alloc, index); /* vm_insert_page does not seem to increment the refcount */ } if (mm) { @@ -290,11 +299,17 @@ free_range: for (page_addr = end - PAGE_SIZE; page_addr >= start; page_addr -= PAGE_SIZE) { bool ret; + size_t index; - page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; + index = (page_addr - alloc->buffer) / PAGE_SIZE; + page = &alloc->pages[index]; + + trace_binder_free_lru_start(alloc, index); ret = list_lru_add(&binder_alloc_lru, &page->lru); WARN_ON(!ret); + + trace_binder_free_lru_end(alloc, index); continue; err_vm_insert_page_failed: @@ -887,19 +902,27 @@ enum lru_status binder_alloc_free_page(struct list_head *item, if (!down_write_trylock(&mm->mmap_sem)) goto err_down_write_mmap_sem_failed; + trace_binder_unmap_user_start(alloc, index); + zap_page_range(alloc->vma, page_addr + alloc->user_buffer_offset, PAGE_SIZE, NULL); + trace_binder_unmap_user_end(alloc, index); + up_write(&mm->mmap_sem); mmput(mm); } + trace_binder_unmap_kernel_start(alloc, index); + unmap_kernel_range(page_addr, PAGE_SIZE); __free_page(page->page_ptr); page->page_ptr = NULL; + trace_binder_unmap_kernel_end(alloc, index); + list_lru_isolate(lru, item); mutex_unlock(&alloc->mutex); diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h index 7967db16ba5a..76e3b9c8a8a2 100644 --- a/drivers/android/binder_trace.h +++ b/drivers/android/binder_trace.h @@ -291,6 +291,61 @@ TRACE_EVENT(binder_update_page_range, __entry->offset, __entry->size) ); +DECLARE_EVENT_CLASS(binder_lru_page_class, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index), + TP_STRUCT__entry( + __field(int, proc) + __field(size_t, page_index) + ), + TP_fast_assign( + __entry->proc = alloc->pid; + __entry->page_index = page_index; + ), + TP_printk("proc=%d page_index=%zu", + __entry->proc, __entry->page_index) +); + +DEFINE_EVENT(binder_lru_page_class, binder_alloc_lru_start, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_alloc_lru_end, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, 
page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_free_lru_start, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_free_lru_end, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_alloc_page_start, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_alloc_page_end, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_unmap_user_start, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_unmap_user_end, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_unmap_kernel_start, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_unmap_kernel_end, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + TRACE_EVENT(binder_command, TP_PROTO(uint32_t cmd), TP_ARGS(cmd), From 798dfdd839e4401bd7ab9a5e107f890e5c82867a Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Tue, 22 Aug 2017 17:26:57 -0700 Subject: [PATCH 1021/3220] FROMLIST: android: binder: Add page usage in binder stats (from https://patchwork.kernel.org/patch/9928611/) Add the number of active, lru, and free pages for each binder process in binder stats Bug: 63926541 Change-Id: I12618e4eb8ecc08f4f05fe4cba454a88830897f9 Signed-off-by: Sherry Yang --- drivers/android/binder.c | 2 ++ drivers/android/binder_alloc.c | 28 ++++++++++++++++++++++++++++ drivers/android/binder_alloc.h | 2 ++ 3 files changed, 32 insertions(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 3ce96db8e5d4..799de03d75a8 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -5427,6 +5427,8 @@ static void print_binder_proc_stats(struct seq_file *m, count = binder_alloc_get_allocated_count(&proc->alloc); seq_printf(m, " buffers: %d\n", count); + binder_alloc_print_pages(m, &proc->alloc); + count = 0; binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->todo, entry) { diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 28dc76473ff6..1c1a7d9c86ed 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -831,6 +831,34 @@ void binder_alloc_print_allocated(struct seq_file *m, mutex_unlock(&alloc->mutex); } +/** + * binder_alloc_print_pages() - print page usage + * @m: seq_file for output via seq_printf() + * @alloc: binder_alloc for this proc + */ +void binder_alloc_print_pages(struct seq_file *m, + struct binder_alloc *alloc) +{ + struct binder_lru_page *page; + int i; + int active = 0; + int lru = 0; + int free = 0; + + mutex_lock(&alloc->mutex); + for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { + page = &alloc->pages[i]; + if (!page->page_ptr) + free++; + else if (list_empty(&page->lru)) + active++; + else + lru++; + } + mutex_unlock(&alloc->mutex); + seq_printf(m, " pages: %d:%d:%d\n", active, lru, free); +} + /** * binder_alloc_get_allocated_count() - return count of buffers * @alloc: binder_alloc for this proc diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 
fa707cc63393..a3a3602c689c 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -142,6 +142,8 @@ extern void binder_alloc_deferred_release(struct binder_alloc *alloc); extern int binder_alloc_get_allocated_count(struct binder_alloc *alloc); extern void binder_alloc_print_allocated(struct seq_file *m, struct binder_alloc *alloc); +void binder_alloc_print_pages(struct seq_file *m, + struct binder_alloc *alloc); /** * binder_alloc_get_free_async_space() - get free space available for async From 26b37261ea25c6928c6a69e77a8f7d39ee3267c9 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Fri, 26 May 2017 11:11:47 -0700 Subject: [PATCH 1022/3220] sched: deadline: WALT: account cumulative runnable avg Account the cumulative runnable average for WALT CPU utilization accounting. Change-Id: I56934894e626dec183740eeaf89a57d2ef638143 Signed-off-by: Joonwoo Park --- kernel/sched/deadline.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9ec7f19b5020..2aae8b8b68e8 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -18,6 +18,8 @@ #include +#include "walt.h" + struct dl_bandwidth def_dl_bandwidth; static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) @@ -882,6 +884,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -896,6 +899,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); From 48f67ea85de468a9b3e47e723e7681cf7771dea6 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Fri, 26 May 2017 11:19:36 -0700 Subject: [PATCH 1023/3220] sched: WALT: fix broken cumulative runnable average accounting When a running task's ravg.demand changes, update_history() adjusts rq->cumulative_runnable_avg to reflect the change in CPU load. Currently this fixup is broken: it accumulates the task's new demand without subtracting the task's old demand. Fix the fixup logic to subtract the task's old demand. Change-Id: I61beb32a4850879ccb39b733f5564251e465bfeb Signed-off-by: Joonwoo Park --- kernel/sched/walt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 92c3aae8e056..166641ed1f39 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -111,8 +111,10 @@ walt_dec_cumulative_runnable_avg(struct rq *rq, static void fixup_cumulative_runnable_avg(struct rq *rq, - struct task_struct *p, s64 task_load_delta) + struct task_struct *p, u64 new_task_load) { + s64 task_load_delta = (s64)new_task_load - task_load(p); + rq->cumulative_runnable_avg += task_load_delta; if ((s64)rq->cumulative_runnable_avg < 0) panic("cra less than zero: tld: %lld, task_load(p) = %u\n", From ee4cebd75ed7b77132c39c0093923f9ff1bcafaa Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Thu, 8 Dec 2016 16:12:12 -0800 Subject: [PATCH 1024/3220] sched: EAS/WALT: use cr_avg instead of prev_runnable_sum WALT accounts for two major statistics: CPU load and cumulative tasks demand.
The CPU load, an account of each CPU's accumulated absolute execution time, is used for CPU frequency guidance, whereas the cumulative tasks demand, which reflects each CPU's instantaneous load at a given time, is used for task placement decisions. Use the cumulative tasks demand in cpu_util() for task placement and introduce cpu_util_freq() for frequency guidance. Change-Id: Id928f01dbc8cb2a617cdadc584c1f658022565c5 Signed-off-by: Joonwoo Park --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 4 ++-- kernel/sched/sched.h | 16 +++++++++++++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9307827cc7b1..4f97de8e0b18 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2992,7 +2992,7 @@ static void sched_freq_tick_pelt(int cpu) #ifdef CONFIG_SCHED_WALT static void sched_freq_tick_walt(int cpu) { - unsigned long cpu_utilization = cpu_util(cpu); + unsigned long cpu_utilization = cpu_util_freq(cpu); unsigned long capacity_curr = capacity_curr_of(cpu); if (walt_disabled || !sysctl_sched_use_walt_cpu_util) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c193e9b1c38f..3641dad3d4cc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4659,7 +4659,7 @@ static inline void hrtick_update(struct rq *rq) static bool cpu_overutilized(int cpu); unsigned long boosted_cpu_util(int cpu); #else -#define boosted_cpu_util(cpu) cpu_util(cpu) +#define boosted_cpu_util(cpu) cpu_util_freq(cpu) #endif #ifdef CONFIG_SMP @@ -5937,7 +5937,7 @@ schedtune_task_margin(struct task_struct *task) unsigned long boosted_cpu_util(int cpu) { - unsigned long util = cpu_util(cpu); + unsigned long util = cpu_util_freq(cpu); long margin = schedtune_cpu_margin(util, cpu); trace_sched_boost_cpu(cpu, util, margin); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 029cf2bbeda2..73077f535e95 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1592,7 +1592,7 @@ static inline unsigned long __cpu_util(int cpu, int delta) #ifdef CONFIG_SCHED_WALT if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { - util = cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT; + util = cpu_rq(cpu)->cumulative_runnable_avg << SCHED_LOAD_SHIFT; do_div(util, walt_ravg_window); } #endif @@ -1608,6 +1608,20 @@ static inline unsigned long cpu_util(int cpu) return __cpu_util(cpu, 0); } +static inline unsigned long cpu_util_freq(int cpu) +{ + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + util = cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT; + do_div(util, walt_ravg_window); + } +#endif + return (util >= capacity) ? capacity : util; +} + #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHED From 2d7da09705d6560c61676376be3ae05a5019600a Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Mon, 16 Jan 2017 18:09:20 -0800 Subject: [PATCH 1025/3220] sched: EAS: schedfreq: fix CPU util over estimation WALT CPU utilization reports the CPU load of all the scheduler classes. Adding the RT class's load on top of it therefore causes frequency overshoot. Fix this by not accounting the RT class load when requesting capacity.
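A worked illustration with hypothetical numbers: suppose WALT reports scr->cfs = 600, of which 200 is RT load already folded in.

	old request: scr->cfs + scr->rt = 600 + 200 = 800   (RT counted twice)
	new request: scr->cfs           = 600               (RT already included)

So the old formula asked for a third more capacity than the CPU was actually using, driving the frequency needlessly high.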
Change-Id: I29600d7af7ca8c00e0d2ff1e13872024ccaa72bf Signed-off-by: Joonwoo Park --- kernel/sched/cpufreq_sched.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index f10d9f7d6d07..6ffb23adbcef 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -235,6 +235,18 @@ out: cpufreq_cpu_put(policy); } +#ifdef CONFIG_SCHED_WALT +static inline unsigned long +requested_capacity(struct sched_capacity_reqs *scr) +{ + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + return scr->cfs; + return scr->cfs + scr->rt; +} +#else +#define requested_capacity(scr) (scr->cfs + scr->rt) +#endif + void update_cpu_capacity_request(int cpu, bool request) { unsigned long new_capacity; @@ -245,7 +257,7 @@ void update_cpu_capacity_request(int cpu, bool request) scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - new_capacity = scr->cfs + scr->rt; + new_capacity = requested_capacity(scr); new_capacity = new_capacity * capacity_margin / SCHED_CAPACITY_SCALE; new_capacity += scr->dl; From c8b8c92bbc8954ae18cba7eb0c1f3a06a244129c Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Fri, 20 Jan 2017 11:10:15 -0800 Subject: [PATCH 1026/3220] sched: WALT: fix potential overflow Task demand and CPU util are in u64. Change-Id: If7ec1623e723026d3346201122aab0303a6d2ba2 Signed-off-by: Joonwoo Park --- kernel/sched/sched.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 73077f535e95..d4613c5be81d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1591,10 +1591,9 @@ static inline unsigned long __cpu_util(int cpu, int delta) unsigned long capacity = capacity_orig_of(cpu); #ifdef CONFIG_SCHED_WALT - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { - util = cpu_rq(cpu)->cumulative_runnable_avg << SCHED_LOAD_SHIFT; - do_div(util, walt_ravg_window); - } + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + util = div64_u64(cpu_rq(cpu)->cumulative_runnable_avg, + walt_ravg_window >> SCHED_LOAD_SHIFT); #endif delta += util; if (delta < 0) @@ -1614,10 +1613,9 @@ static inline unsigned long cpu_util_freq(int cpu) unsigned long capacity = capacity_orig_of(cpu); #ifdef CONFIG_SCHED_WALT - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { - util = cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT; - do_div(util, walt_ravg_window); - } + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + util = div64_u64(cpu_rq(cpu)->prev_runnable_sum, + walt_ravg_window >> SCHED_LOAD_SHIFT); #endif return (util >= capacity) ? capacity : util; } From f94958ffa75d4f18d6d6838629d71edee41103c5 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Wed, 25 Jan 2017 17:07:19 -0800 Subject: [PATCH 1027/3220] cpufreq: sched: WALT: don't apply capacity margin twice With WALT, all the scheduler classes' load is accounted in scr->cfs, and update_cpu_capacity_request() adds the capacity margin. At present the scheduler also adds the capacity margin in the tick path, so the margin is applied twice. Fix this error by using the margin-applied CPU utilization only to check whether a frequency increase is needed.
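The compounding is easy to see with hypothetical figures, assuming the common default of capacity_margin = 1280 against SCHED_CAPACITY_SCALE = 1024 (a 25% headroom):

	one application:  util * 1280 / 1024                = 1.25   * util
	two applications: util * 1280 / 1024 * 1280 / 1024  = 1.5625 * util

	e.g. util = 800: one margin requests 1000, the double margin requests 1250.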
Change-Id: Id7d8cc73b2e4eec70b274ca66e09bb0b16bf6f09 Signed-off-by: Joonwoo Park (trivial rebase conflict) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4f97de8e0b18..0b14b4634b8c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2999,13 +2999,13 @@ static void sched_freq_tick_walt(int cpu) return sched_freq_tick_pelt(cpu); /* - * Add a margin to the WALT utilization. + * Add a margin to the WALT utilization to check if we will need to + * increase frequency. * NOTE: WALT tracks a single CPU signal for all the scheduling * classes, thus this margin is going to be added to the DL class as * well, which is something we do not do in sched_freq_tick_pelt case. */ - cpu_utilization = add_capacity_margin(cpu_utilization); - if (cpu_utilization <= capacity_curr) + if (add_capacity_margin(cpu_utilization) <= capacity_curr) return; /* From 94e5c965075b55a5dfd1c4cce580e2dfb0c7ffc3 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Wed, 25 Jan 2017 17:45:56 -0800 Subject: [PATCH 1028/3220] sched: EAS/WALT: take into account of waking task's load WALT's function cpu_util(cpu) reports the CPU's load without taking the waking task's load into account. Thus cpu_overutilized() currently underestimates the load on the waking task's previous CPU. Take the task's load into account when determining whether the previous CPU is overutilized, so we can bail out early without running the expensive energy_diff(). Change-Id: I30f146984a880ad2cc1b8a4ce35bd239a8c9a607 Signed-off-by: Joonwoo Park (minor rebase conflicts) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3641dad3d4cc..94bb5786d8a7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4656,6 +4656,7 @@ static inline void hrtick_update(struct rq *rq) #endif #ifdef CONFIG_SMP +static bool __cpu_overutilized(int cpu, int delta); static bool cpu_overutilized(int cpu); unsigned long boosted_cpu_util(int cpu); #else @@ -5856,9 +5857,14 @@ static inline bool task_fits_max(struct task_struct *p, int cpu) return __task_fits(p, cpu, 0); } +static bool __cpu_overutilized(int cpu, int delta) +{ + return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin); +} + static bool cpu_overutilized(int cpu) { - return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); + return __cpu_overutilized(cpu, 0); } #ifdef CONFIG_SCHED_TUNE @@ -6577,6 +6583,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync } if (target_cpu != prev_cpu) { + int delta = 0; struct energy_env eenv = { .util_delta = task_util(p), .src_cpu = prev_cpu, @@ -6584,8 +6591,13 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync .task = p, }; + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + delta = task_util(p); +#endif /* Not enough spare capacity on previous cpu */ - if (cpu_overutilized(prev_cpu)) { + if (__cpu_overutilized(prev_cpu, delta)) { schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap); schedstat_inc(this_rq(), eas_stats.secb_insuff_cap); goto unlock; From 11b618a0b2fc8598474a518add7037d4005d83e8 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Mon, 24 Jul 2017 13:23:05 +0100 Subject: [PATCH 1029/3220] sched: EAS: fix incorrect energy delta calculation due to rounding error In order to calculate the energy
difference, we currently iterate over the CPUs under the same sched
domain to accumulate the total energy cost and compare before and after:

  for_each_domain(cpu)
	total_energy_before += (cpu_util * power) >> SCHED_CAPACITY_SHIFT;

  for_each_domain(cpu)
	total_energy_after += (cpu_util * power) >> SCHED_CAPACITY_SHIFT;

Accumulating this way can incorrectly report abs(delta) > 0 when there
is actually no energy delta between before and after: the same
accumulated total cpu_util of all the CPUs can be distributed
differently before and after, which causes a different amount of
rounding error in each sum.

Fix this incorrectness by shifting just once, on the accumulated
total_energy.

Change-Id: I82f1e2e358367058960938b4ef81714f57e921cf
Signed-off-by: Joonwoo Park
(moved part to another commit)
Signed-off-by: Chris Redpath
---
 kernel/sched/fair.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 94bb5786d8a7..61c8952c7aaf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5473,10 +5473,8 @@ end:
  */
 static int sched_group_energy(struct energy_env *eenv)
 {
-	struct sched_domain *sd;
-	int cpu, total_energy = 0;
 	struct cpumask visit_cpus;
-	struct sched_group *sg;
+	u64 total_energy = 0;

 	WARN_ON(!eenv->sg_top->sge);

@@ -5484,8 +5482,8 @@ static int sched_group_energy(struct energy_env *eenv)

 	while (!cpumask_empty(&visit_cpus)) {
 		struct sched_group *sg_shared_cap = NULL;
-
-		cpu = cpumask_first(&visit_cpus);
+		int cpu = cpumask_first(&visit_cpus);
+		struct sched_domain *sd;

 		/*
 		 * Is the group utilization affected by cpus outside this
@@ -5497,7 +5495,7 @@ static int sched_group_energy(struct energy_env *eenv)
 			sg_shared_cap = sd->parent->groups;

 		for_each_domain(cpu, sd) {
-			sg = sd->groups;
+			struct sched_group *sg = sd->groups;

 			/* Has this sched_domain already been visited? */
 			if (sd->child && group_first_cpu(sg) != cpu)
@@ -5533,11 +5531,9 @@ static int sched_group_energy(struct energy_env *eenv)
 				idle_idx = group_idle_state(eenv, sg);
 				group_util = group_norm_util(eenv, sg);

-				sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
-								>> SCHED_CAPACITY_SHIFT;
+				sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power);
 				sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
-								* sg->sge->idle_states[idle_idx].power)
-								>> SCHED_CAPACITY_SHIFT;
+								* sg->sge->idle_states[idle_idx].power);

 				total_energy += sg_busy_energy + sg_idle_energy;

@@ -5562,7 +5558,7 @@ next_cpu:
 		continue;
 	}

-	eenv->energy = total_energy;
+	eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT;

 	return 0;
 }

From 3989a247e2744f437233a36a0183edf3b3dfc8f1 Mon Sep 17 00:00:00 2001
From: Joonwoo Park
Date: Tue, 7 Mar 2017 18:08:24 -0800
Subject: [PATCH 1030/3220] sched: EAS: kill incorrect nohz idle cpu kick

EAS does not allow the NOHZ idle balancer to run until a CPU becomes
overutilized, yet nohz_kick_needed() can still return true. This wakes
up an idle CPU for nothing.
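The rounding error fixed in PATCH 1029 above is easy to reproduce in
isolation. A standalone sketch (plain userspace C, not kernel code,
with SCHED_CAPACITY_SHIFT assumed to be 10): the same total utilization
of 300, split differently across two CPUs at the same power value,
yields a spurious energy delta when each term is shifted individually,
and no delta when the accumulated total is shifted once.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long power = 700;

		/* Shift per group, as before the fix: */
		unsigned long long one_cpu  = (300 * power) >> 10;	/* 205 */
		unsigned long long two_cpus = ((150 * power) >> 10)
					    + ((150 * power) >> 10);	/* 204 */

		/* Shift once on the accumulated total, as after the fix: */
		unsigned long long fixed = (150 * power + 150 * power) >> 10; /* 205 */

		printf("per-term: %llu vs %llu, single shift: %llu\n",
		       one_cpu, two_cpus, fixed);
		return 0;
	}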
Change-Id: I6e548442e29e4f85cda695e4c7101dd591b12fe6
Signed-off-by: Joonwoo Park
---
 kernel/sched/fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 61c8952c7aaf..0f43ece69e8c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9768,12 +9768,12 @@ static inline bool nohz_kick_needed(struct rq *rq)
 		return true;

 	/* Do idle load balance if there have misfit task */
-	if (energy_aware() && rq->misfit_task)
-		return true;
+	if (energy_aware())
+		return rq->misfit_task;

 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
-	if (sd && !energy_aware()) {
+	if (sd) {
 		sgc = sd->groups->sgc;
 		nr_busy = atomic_read(&sgc->nr_busy_cpus);

From 0caf1df0c520325c9352149795266709e989b222 Mon Sep 17 00:00:00 2001
From: Joonwoo Park
Date: Tue, 16 May 2017 11:13:00 -0700
Subject: [PATCH 1031/3220] sched: WALT: fix window mis-alignment

The initial window start needs to be close to ktime ns = 0 so that
windows stay aligned with the scheduler tick.

Change-Id: Ia91f74efce2f910106622a054a6fcd507e763ca5
Signed-off-by: Joonwoo Park
---
 kernel/sched/walt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
index 166641ed1f39..28e999554463 100644
--- a/kernel/sched/walt.c
+++ b/kernel/sched/walt.c
@@ -804,11 +804,11 @@ void walt_set_window_start(struct rq *rq)
 	int cpu = cpu_of(rq);
 	struct rq *sync_rq = cpu_rq(sync_cpu);

-	if (rq->window_start)
+	if (likely(rq->window_start))
 		return;

 	if (cpu == sync_cpu) {
-		rq->window_start = walt_ktime_clock();
+		rq->window_start = 1;
 	} else {
 		raw_spin_unlock(&rq->lock);
 		double_rq_lock(rq, sync_rq);

From ab10c4d8d6f5a42665e61acec78d64e021f4b68c Mon Sep 17 00:00:00 2001
From: Xu YiPing
Date: Mon, 22 May 2017 11:26:23 -0700
Subject: [PATCH 1032/3220] FROMLIST: binder: fix memory corruption in
 binder_transaction binder

(from https://patchwork.kernel.org/patch/9939405/)

commit 7a4408c6bd3e ("binder: make sure accesses to proc/thread are
safe") made a change to enqueue tcomplete to thread->todo before
enqueuing the transaction. However, in the err_dead_proc_or_thread
case, tcomplete is freed directly without being dequeued, which can
leave the thread->todo list corrupted. So, dequeue it before freeing.

Bug: 65333488
Change-Id: Id063a4db18deaa634f4d44aa6ebca47bea32537a
Signed-off-by: Xu YiPing
Signed-off-by: Todd Kjos
---
 drivers/android/binder.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 799de03d75a8..810ff70104c1 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -3253,6 +3253,7 @@ static void binder_transaction(struct binder_proc *proc,
 err_dead_proc_or_thread:
 	return_error = BR_DEAD_REPLY;
 	return_error_line = __LINE__;
+	binder_dequeue_work(proc, tcomplete);
 err_translate_failed:
 err_bad_object_type:
 err_bad_offset:

From b13d0fb3390670995a7fc90f96c17542e305ca59 Mon Sep 17 00:00:00 2001
From: Xu YiPing
Date: Tue, 5 Sep 2017 10:00:59 -0700
Subject: [PATCH 1033/3220] FROMLIST: binder: fix a ret value override

(from https://patchwork.kernel.org/patch/9939409/)

commit 372e3147df70 ("binder: guarantee txn complete / errors delivered
in-order") incorrectly defined a local ret value.
This ret value becomes invalid outside the if block, overriding the
function-level ret.

Change-Id: If7bd963ac7e67d135aa949133263aac27bf15d1a
Signed-off-by: Xu YiPing
Signed-off-by: Todd Kjos
---
 drivers/android/binder.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 810ff70104c1..7c9f9dde8397 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2481,7 +2481,6 @@ static int binder_translate_handle(struct flat_binder_object *fp,
 			  (u64)node->ptr);
 		binder_node_unlock(node);
 	} else {
-		int ret;
 		struct binder_ref_data dest_rdata;

 		binder_node_unlock(node);

From 3482bbea6b3e9ed685ce42a77927dfeaec0be122 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Tue, 11 Apr 2017 00:20:41 +0200
Subject: [PATCH 1034/3220] BACKPORT: cpufreq: schedutil: Use policy-dependent
 transition delays

Make the schedutil governor take the initial (default) value of the
rate_limit_us sysfs attribute from the (new) transition_delay_us policy
parameter (to be set by the scaling driver).

That will allow scaling drivers to make schedutil use smaller default
values of rate_limit_us and reduce the default average time interval
between consecutive frequency changes.

Make intel_pstate set transition_delay_us to 500.

BACKPORT: Modified to support the separate up_rate_limit_us and
down_rate_limit_us (upstream just has a single rate_limit_us). Also
dropped the changes for intel_pstate as there's a merge conflict.

Change-Id: I62a8543879a4d8582cdcb31ebd55607705d1c8b1
Signed-off-by: Rafael J. Wysocki
Acked-by: Viresh Kumar
(cherry picked from commit 1b72e7fd304639f1cd49d1e11955c4974936d88c)
Signed-off-by: Brendan Jackman
---
 include/linux/cpufreq.h          | 10 +++++++++-
 kernel/sched/cpufreq_schedutil.c | 20 +++++++++++++-------
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index b144e7445477..490d577aa144 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -119,7 +119,15 @@ struct cpufreq_policy {
 	bool			fast_switch_possible;
 	bool			fast_switch_enabled;

-	/* Cached frequency lookup from cpufreq_driver_resolve_freq. */
+	/*
+	 * Preferred average time interval between consecutive invocations of
+	 * the driver to set the frequency for this policy. To be set by the
+	 * scaling driver (0, which is the default, means no preference).
+	 */
+	unsigned int		up_transition_delay_us;
+	unsigned int		down_transition_delay_us;
+
+	/* Cached frequency lookup from cpufreq_driver_resolve_freq.
*/ unsigned int cached_target_freq; int cached_resolved_idx; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index a0bcf488fb81..b90f7434e13b 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -615,7 +615,6 @@ static int sugov_init(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy; struct sugov_tunables *tunables; - unsigned int lat; int ret = 0; /* State should be equivalent to EXIT */ @@ -654,12 +653,19 @@ static int sugov_init(struct cpufreq_policy *policy) goto stop_kthread; } - tunables->up_rate_limit_us = LATENCY_MULTIPLIER; - tunables->down_rate_limit_us = LATENCY_MULTIPLIER; - lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; - if (lat) { - tunables->up_rate_limit_us *= lat; - tunables->down_rate_limit_us *= lat; + if (policy->up_transition_delay_us && policy->down_transition_delay_us) { + tunables->up_rate_limit_us = policy->up_transition_delay_us; + tunables->down_rate_limit_us = policy->down_transition_delay_us; + } else { + unsigned int lat; + + tunables->up_rate_limit_us = LATENCY_MULTIPLIER; + tunables->down_rate_limit_us = LATENCY_MULTIPLIER; + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; + if (lat) { + tunables->up_rate_limit_us *= lat; + tunables->down_rate_limit_us *= lat; + } } policy->governor_data = sg_policy; From 1bab88a22447c96f832aca1115370cfdd75fb168 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 17 Aug 2017 14:38:21 +0100 Subject: [PATCH 1035/3220] ANDROID: cpufreq-dt: Set sane defaults for schedutil rate limits schedfreq used transition_latency as the default value for rate-limiting upward frequency transitions. schedutil instead uses a separate transition_delay_us field - if this is not set, it uses transition_latency * 1000, which is not sensible. The reason for the two separate values (transition_delay and transition_latency) is that they represent different quantities: "latency" reports how long it takes to execute a frequency transition. "delay" is a tunable whose ideal value is one that best balances the power cost of making a frequency transition with the benefit of more often selecting the correct frequency. "latency" is probably a lower bound on "delay". AFAIK we don't currently have a way to directly express the desired transition_delay_us field in the device tree, so for now let's just set it to the same as transition_latency, so we get similar default behaviour for schedutil as we got for schedfreq. This doesn't make any difference if userspace is setting the cpufreq tunables itself, it is just making the defaults saner. 
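To make the "not sensible" latency-based default concrete: with
LATENCY_MULTIPLIER at its usual value of 1000, a driver reporting a
500 us transition latency and no transition_delay_us ends up with a
default rate limit of 500 * 1000 us = 500 ms between frequency changes.
A minimal sketch of the selection logic (illustrative helper only, not
the kernel function):

	/* Sketch; assumes LATENCY_MULTIPLIER == 1000 as in cpufreq.h. */
	static unsigned int default_rate_limit_us(unsigned int delay_us,
						  unsigned int latency_ns)
	{
		unsigned int lat = latency_ns / 1000;	/* NSEC_PER_USEC */

		if (delay_us)		/* driver-provided preference wins */
			return delay_us;
		if (lat)		/* else scale the reported latency */
			return 1000 * lat;
		return 1000;		/* no latency info: bare multiplier */
	}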
Change-Id: Iec92d710c79d9c10d0bef1b617942185448fc214 Signed-off-by: Brendan Jackman --- drivers/cpufreq/cpufreq-dt.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 90d64081ddb3..7bd7a8a5a1f6 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -359,6 +359,13 @@ static int cpufreq_init(struct cpufreq_policy *policy) policy->cpuinfo.transition_latency = transition_latency; + /* + * Android: set default parameters for parity between schedutil and + * schedfreq + */ + policy->up_transition_delay_us = transition_latency / NSEC_PER_USEC; + policy->down_transition_delay_us = 50000; /* 50ms */ + of_node_put(np); return 0; From 3479acf11d421ba1bef80b6d613c0cdc23a01850 Mon Sep 17 00:00:00 2001 From: Greg Kaiser Date: Tue, 5 Sep 2017 15:55:41 -0700 Subject: [PATCH 1036/3220] ANDROID: fiq_debugger: Fix minor bug in code We fix a typo in the code which had us comparing a pointer instead of the value which was being pointed to. This turns out to be a relatively benign bug, as we'd incorrectly pass in the empty string instead of NULL to the function, and the function can handle both. But we fix it so the code is clearly doing what we intend. Signed-off-by: Greg Kaiser Change-Id: Ib059819775a3bebca357d4ce684be779853156e3 --- drivers/staging/android/fiq_debugger/fiq_debugger.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger.c b/drivers/staging/android/fiq_debugger/fiq_debugger.c index b132cff14f01..ef9fa483ac00 100644 --- a/drivers/staging/android/fiq_debugger/fiq_debugger.c +++ b/drivers/staging/android/fiq_debugger/fiq_debugger.c @@ -401,7 +401,7 @@ static void fiq_debugger_work(struct work_struct *work) cmd += 6; while (*cmd == ' ') cmd++; - if (cmd != '\0') + if (*cmd != '\0') kernel_restart(cmd); else kernel_restart(NULL); From f238ef77588e2723965a853243b0a70ce3067959 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Thu, 24 Aug 2017 16:57:20 -0700 Subject: [PATCH 1037/3220] Revert "ANDROID: Use sk_uid to replace uid get from socket file" This reverts commit 623f33f213deb39994decbdc7d3b8cea82c4d558. Bug: 37524657 Change-Id: I214dbf417e720c032d28d62b8a74ece5de9cd919 Signed-off-by: Chenbo Feng --- net/netfilter/xt_qtaguid.c | 50 +++++++++++++++++------------ net/netfilter/xt_qtaguid_internal.h | 4 +-- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index fbe4e1ecce6a..f95aba44e649 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1717,9 +1717,18 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) } MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d fam=%d proto=%d\n", par->hooknum, sk, got_sock, par->family, ipx_proto(skb, par)); + if (sk != NULL) { + set_sk_callback_lock = true; + read_lock_bh(&sk->sk_callback_lock); + MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", + par->hooknum, sk, sk->sk_socket, + sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); + filp = sk->sk_socket ? sk->sk_socket->file : NULL; + MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", + par->hooknum, filp ? 
from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); + } - - if (sk == NULL) { + if (sk == NULL || sk->sk_socket == NULL) { /* * Here, the qtaguid_find_sk() using connection tracking * couldn't find the owner, so for now we just count them @@ -1732,7 +1741,9 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) */ if (!(info->match & XT_QTAGUID_UID)) account_for_uid(skb, sk, 0, par); - MT_DEBUG("qtaguid[%d]: leaving (sk=NULL)\n", par->hooknum); + MT_DEBUG("qtaguid[%d]: leaving (sk?sk->sk_socket)=%p\n", + par->hooknum, + sk ? sk->sk_socket : NULL); res = (info->match ^ info->invert) == 0; atomic64_inc(&qtu_events.match_no_sk); goto put_sock_ret_res; @@ -1740,7 +1751,16 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) res = false; goto put_sock_ret_res; } - sock_uid = sk->sk_uid; + filp = sk->sk_socket->file; + if (filp == NULL) { + MT_DEBUG("qtaguid[%d]: leaving filp=NULL\n", par->hooknum); + account_for_uid(skb, sk, 0, par); + res = ((info->match ^ info->invert) & + (XT_QTAGUID_UID | XT_QTAGUID_GID)) == 0; + atomic64_inc(&qtu_events.match_no_sk_file); + goto put_sock_ret_res; + } + sock_uid = filp->f_cred->fsuid; /* * TODO: unhack how to force just accounting. * For now we only do iface stats when the uid-owner is not requested @@ -1758,8 +1778,8 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min); kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max); - if ((uid_gte(sk->sk_uid, uid_min) && - uid_lte(sk->sk_uid, uid_max)) ^ + if ((uid_gte(filp->f_cred->fsuid, uid_min) && + uid_lte(filp->f_cred->fsuid, uid_max)) ^ !(info->invert & XT_QTAGUID_UID)) { MT_DEBUG("qtaguid[%d]: leaving uid not matching\n", par->hooknum); @@ -1770,19 +1790,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) if (info->match & XT_QTAGUID_GID) { kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min); kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max); - set_sk_callback_lock = true; - read_lock_bh(&sk->sk_callback_lock); - MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", - par->hooknum, sk, sk->sk_socket, - sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); - filp = sk->sk_socket ? sk->sk_socket->file : NULL; - if (!filp) { - res = ((info->match ^ info->invert) & XT_QTAGUID_GID) == 0; - atomic64_inc(&qtu_events.match_no_sk_gid); - goto put_sock_ret_res; - } - MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", - par->hooknum, filp ? from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); + if ((gid_gte(filp->f_cred->fsgid, gid_min) && gid_lte(filp->f_cred->fsgid, gid_max)) ^ !(info->invert & XT_QTAGUID_GID)) { @@ -1954,7 +1962,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) "match_found_sk_in_ct=%llu " "match_found_no_sk_in_ct=%llu " "match_no_sk=%llu " - "match_no_sk_gid=%llu\n", + "match_no_sk_file=%llu\n", (u64)atomic64_read(&qtu_events.sockets_tagged), (u64)atomic64_read(&qtu_events.sockets_untagged), (u64)atomic64_read(&qtu_events.counter_set_changes), @@ -1966,7 +1974,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) (u64)atomic64_read(&qtu_events.match_found_sk_in_ct), (u64)atomic64_read(&qtu_events.match_found_no_sk_in_ct), (u64)atomic64_read(&qtu_events.match_no_sk), - (u64)atomic64_read(&qtu_events.match_no_sk_gid)); + (u64)atomic64_read(&qtu_events.match_no_sk_file)); /* Count the following as part of the last item_index. 
No need * to lock the sock_tag_list here since it is already locked when diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h index c7052707a6a4..8178fbdfb036 100644 --- a/net/netfilter/xt_qtaguid_internal.h +++ b/net/netfilter/xt_qtaguid_internal.h @@ -289,10 +289,10 @@ struct qtaguid_event_counts { */ atomic64_t match_no_sk; /* - * The file ptr in the sk_socket wasn't there and we couldn't get GID. + * The file ptr in the sk_socket wasn't there. * This might happen for traffic while the socket is being closed. */ - atomic64_t match_no_sk_gid; + atomic64_t match_no_sk_file; }; /* Track the set active_set for the given tag. */ From 5ce41fd1a024dc0f3beff98952d67e514e104602 Mon Sep 17 00:00:00 2001 From: JP Abgrall Date: Fri, 20 Dec 2013 16:51:11 -0800 Subject: [PATCH 1038/3220] ANDROID: nf: xt_qtaguid: fix handling for cases where tunnels are used. * fix skb->dev vs par->in/out When there is some forwarding going on, it introduces extra state around devs associated with xt_action_param->in/out and sk_buff->dev. E.g. par->in and par->out are both set, or skb->dev and par->out are both set (and different) This would lead qtaguid to make the wrong assumption about the direction and update the wrong device stats. Now we rely more on par->in/out. * Fix handling when qtaguid is used as "owner" When qtaguid is used as an owner module, and sk_socket->file is not there (happens when tunnels are involved), it would incorrectly do a tag stats update. * Correct debug messages. Bug: 11687690 Change-Id: I2b1ff8bd7131969ce9e25f8291d83a6280b3ba7f CRs-Fixed: 747810 Signed-off-by: JP Abgrall Git-commit: 2b71479d6f5fe8f33b335f713380f72037244395 Git-repo: https://www.codeaurora.org/cgit/quic/la/kernel/mediatek [imaund@codeaurora.org: Resolved trivial context conflicts.] Signed-off-by: Ian Maund [bflowers@codeaurora.org: Resolved merge conflicts] Signed-off-by: Bryse Flowers Signed-off-by: Chenbo Feng --- net/netfilter/xt_qtaguid.c | 150 +++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 81 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index f95aba44e649..fef69cb21f6f 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1175,6 +1175,38 @@ static void iface_stat_update(struct net_device *net_dev, bool stash_only) spin_unlock_bh(&iface_stat_list_lock); } +/* Guarantied to return a net_device that has a name */ +static void get_dev_and_dir(const struct sk_buff *skb, + struct xt_action_param *par, + enum ifs_tx_rx *direction, + const struct net_device **el_dev) +{ + BUG_ON(!direction || !el_dev); + + if (par->in) { + *el_dev = par->in; + *direction = IFS_RX; + } else if (par->out) { + *el_dev = par->out; + *direction = IFS_TX; + } else { + pr_err("qtaguid[%d]: %s(): no par->in/out?!!\n", + par->hooknum, __func__); + BUG(); + } + if (unlikely(!(*el_dev)->name)) { + pr_err("qtaguid[%d]: %s(): no dev->name?!!\n", + par->hooknum, __func__); + BUG(); + } + if (skb->dev && *el_dev != skb->dev) { + MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs par->%s=%p %s\n", + par->hooknum, skb->dev, skb->dev->name, + *direction == IFS_RX ? "in" : "out", *el_dev, + (*el_dev)->name); + } +} + /* * Update stats for the specified interface from the skb. * Do nothing if the entry @@ -1186,50 +1218,27 @@ static void iface_stat_update_from_skb(const struct sk_buff *skb, { struct iface_stat *entry; const struct net_device *el_dev; - enum ifs_tx_rx direction = par->in ? 
IFS_RX : IFS_TX; + enum ifs_tx_rx direction; int bytes = skb->len; int proto; - if (!skb->dev) { - MT_DEBUG("qtaguid[%d]: no skb->dev\n", par->hooknum); - el_dev = par->in ? : par->out; - } else { - const struct net_device *other_dev; - el_dev = skb->dev; - other_dev = par->in ? : par->out; - if (el_dev != other_dev) { - MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs " - "par->(in/out)=%p %s\n", - par->hooknum, el_dev, el_dev->name, other_dev, - other_dev->name); - } - } - - if (unlikely(!el_dev)) { - pr_err_ratelimited("qtaguid[%d]: %s(): no par->in/out?!!\n", - par->hooknum, __func__); - BUG(); - } else if (unlikely(!el_dev->name)) { - pr_err_ratelimited("qtaguid[%d]: %s(): no dev->name?!!\n", - par->hooknum, __func__); - BUG(); - } else { - proto = ipx_proto(skb, par); - MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d\n", - par->hooknum, el_dev->name, el_dev->type, - par->family, proto); - } + get_dev_and_dir(skb, par, &direction, &el_dev); + proto = ipx_proto(skb, par); + MT_DEBUG("qtaguid[%d]: iface_stat: %s(%s): " + "type=%d fam=%d proto=%d dir=%d\n", + par->hooknum, __func__, el_dev->name, el_dev->type, + par->family, proto, direction); spin_lock_bh(&iface_stat_list_lock); entry = get_iface_entry(el_dev->name); if (entry == NULL) { - IF_DEBUG("qtaguid: iface_stat: %s(%s): not tracked\n", - __func__, el_dev->name); + IF_DEBUG("qtaguid[%d]: iface_stat: %s(%s): not tracked\n", + par->hooknum, __func__, el_dev->name); spin_unlock_bh(&iface_stat_list_lock); return; } - IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__, + IF_DEBUG("qtaguid[%d]: %s(%s): entry=%p\n", par->hooknum, __func__, el_dev->name, entry); data_counters_update(&entry->totals_via_skb, 0, direction, proto, @@ -1294,14 +1303,14 @@ static void if_tag_stat_update(const char *ifname, uid_t uid, spin_lock_bh(&iface_stat_list_lock); iface_entry = get_iface_entry(ifname); if (!iface_entry) { - pr_err_ratelimited("qtaguid: iface_stat: stat_update() " + pr_err_ratelimited("qtaguid: tag_stat: stat_update() " "%s not found\n", ifname); spin_unlock_bh(&iface_stat_list_lock); return; } /* It is ok to process data when an iface_entry is inactive */ - MT_DEBUG("qtaguid: iface_stat: stat_update() dev=%s entry=%p\n", + MT_DEBUG("qtaguid: tag_stat: stat_update() dev=%s entry=%p\n", ifname, iface_entry); /* @@ -1318,7 +1327,7 @@ static void if_tag_stat_update(const char *ifname, uid_t uid, tag = combine_atag_with_uid(acct_tag, uid); uid_tag = make_tag_from_uid(uid); } - MT_DEBUG("qtaguid: iface_stat: stat_update(): " + MT_DEBUG("qtaguid: tag_stat: stat_update(): " " looking for tag=0x%llx (uid=%u) in ife=%p\n", tag, get_uid_from_tag(tag), iface_entry); /* Loop over tag list under this interface for {acct_tag,uid_tag} */ @@ -1578,8 +1587,8 @@ static struct sock *qtaguid_find_sk(const struct sk_buff *skb, struct sock *sk; unsigned int hook_mask = (1 << par->hooknum); - MT_DEBUG("qtaguid: find_sk(skb=%p) hooknum=%d family=%d\n", skb, - par->hooknum, par->family); + MT_DEBUG("qtaguid[%d]: find_sk(skb=%p) family=%d\n", + par->hooknum, skb, par->family); /* * Let's not abuse the the xt_socket_get*_sk(), or else it will @@ -1600,8 +1609,8 @@ static struct sock *qtaguid_find_sk(const struct sk_buff *skb, } if (sk) { - MT_DEBUG("qtaguid: %p->sk_proto=%u " - "->sk_state=%d\n", sk, sk->sk_protocol, sk->sk_state); + MT_DEBUG("qtaguid[%d]: %p->sk_proto=%u->sk_state=%d\n", + par->hooknum, sk, sk->sk_protocol, sk->sk_state); /* * When in TCP_TIME_WAIT the sk is not a "struct sock" but * "struct inet_timewait_sock" which is missing fields. 
@@ -1619,37 +1628,19 @@ static void account_for_uid(const struct sk_buff *skb,
 					struct xt_action_param *par)
 {
 	const struct net_device *el_dev;
+	enum ifs_tx_rx direction;
+	int proto;

-	if (!skb->dev) {
-		MT_DEBUG("qtaguid[%d]: no skb->dev\n", par->hooknum);
-		el_dev = par->in ? : par->out;
-	} else {
-		const struct net_device *other_dev;
-		el_dev = skb->dev;
-		other_dev = par->in ? : par->out;
-		if (el_dev != other_dev) {
-			MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs "
-				"par->(in/out)=%p %s\n",
-				par->hooknum, el_dev, el_dev->name, other_dev,
-				other_dev->name);
-		}
-	}
+	get_dev_and_dir(skb, par, &direction, &el_dev);
+	proto = ipx_proto(skb, par);
+	MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d dir=%d\n",
+		 par->hooknum, el_dev->name, el_dev->type,
+		 par->family, proto, direction);

-	if (unlikely(!el_dev)) {
-		pr_info("qtaguid[%d]: no par->in/out?!!\n", par->hooknum);
-	} else if (unlikely(!el_dev->name)) {
-		pr_info("qtaguid[%d]: no dev->name?!!\n", par->hooknum);
-	} else {
-		int proto = ipx_proto(skb, par);
-		MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d\n",
-			 par->hooknum, el_dev->name, el_dev->type,
-			 par->family, proto);
-
-		if_tag_stat_update(el_dev->name, uid,
-				   skb->sk ? skb->sk : alternate_sk,
-				   par->in ? IFS_RX : IFS_TX,
-				   proto, skb->len);
-	}
+	if_tag_stat_update(el_dev->name, uid,
+			   skb->sk ? skb->sk : alternate_sk,
+			   direction,
+			   proto, skb->len);
 }

 static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par)
@@ -1661,6 +1652,11 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	kuid_t sock_uid;
 	bool res;
 	bool set_sk_callback_lock = false;
+	/*
+	 * TODO: unhack how to force just accounting.
+	 * For now we only do tag stats when the uid-owner is not requested
+	 */
+	bool do_tag_stat = !(info->match & XT_QTAGUID_UID);

 	if (unlikely(module_passive))
 		return (info->match ^ info->invert) == 0;
@@ -1734,12 +1730,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par)
 		 * couldn't find the owner, so for now we just count them
 		 * against the system.
 		 */
-		/*
-		 * TODO: unhack how to force just accounting.
-		 * For now we only do iface stats when the uid-owner is not
-		 * requested.
-		 */
-		if (!(info->match & XT_QTAGUID_UID))
+		if (do_tag_stat)
 			account_for_uid(skb, sk, 0, par);
 		MT_DEBUG("qtaguid[%d]: leaving (sk?sk->sk_socket)=%p\n",
 			 par->hooknum,
@@ -1754,18 +1745,15 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	filp = sk->sk_socket->file;
 	if (filp == NULL) {
 		MT_DEBUG("qtaguid[%d]: leaving filp=NULL\n", par->hooknum);
-		account_for_uid(skb, sk, 0, par);
+		if (do_tag_stat)
+			account_for_uid(skb, sk, 0, par);
 		res = ((info->match ^ info->invert) &
 			(XT_QTAGUID_UID | XT_QTAGUID_GID)) == 0;
 		atomic64_inc(&qtu_events.match_no_sk_file);
 		goto put_sock_ret_res;
 	}
 	sock_uid = filp->f_cred->fsuid;
-	/*
-	 * TODO: unhack how to force just accounting.
-	 * For now we only do iface stats when the uid-owner is not requested
-	 */
-	if (!(info->match & XT_QTAGUID_UID))
+	if (do_tag_stat)
 		account_for_uid(skb, sk, from_kuid(&init_user_ns, sock_uid), par);

 	/*

From 1e07bd20e4227a009101c8f5c5f3ed66b5ef30d3 Mon Sep 17 00:00:00 2001
From: Chenbo Feng
Date: Thu, 20 Apr 2017 18:54:13 -0700
Subject: [PATCH 1039/3220] ANDROID: Use sk_uid to replace uid get from socket
 file

Retrieve the socket uid from the sk_uid field added to struct sock
instead of reading it from sk->sk_socket->file. This prevents packets
from being dropped when the socket file doesn't exist.
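The difference between the two lookups, sketched (illustrative only;
locking and the fallback value are assumptions of the sketch, not the
patch): the old path dereferences sk->sk_socket and its file pointer,
both of which can already be NULL while a socket is being torn down,
whereas sk_uid lives in struct sock itself.

	/* Old: two dereferences that can be NULL during socket close. */
	static kuid_t owner_uid_old(struct sock *sk)
	{
		struct file *filp = sk->sk_socket ? sk->sk_socket->file : NULL;

		return filp ? filp->f_cred->fsuid : GLOBAL_ROOT_UID;
	}

	/* New: sk_uid stays valid for the lifetime of the sock. */
	static kuid_t owner_uid_new(struct sock *sk)
	{
		return sk->sk_uid;
	}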
Bug: 37524657 Signed-off-by: Chenbo Feng Change-Id: Ic58239c1f9aa7e0eb1d4d1c09d40b845fd4e8e57 --- net/netfilter/xt_qtaguid.c | 55 +++++++++++++---------------- net/netfilter/xt_qtaguid_internal.h | 4 +-- 2 files changed, 26 insertions(+), 33 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index fef69cb21f6f..6f5cdc5b505f 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1713,18 +1713,8 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) } MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d fam=%d proto=%d\n", par->hooknum, sk, got_sock, par->family, ipx_proto(skb, par)); - if (sk != NULL) { - set_sk_callback_lock = true; - read_lock_bh(&sk->sk_callback_lock); - MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", - par->hooknum, sk, sk->sk_socket, - sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); - filp = sk->sk_socket ? sk->sk_socket->file : NULL; - MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", - par->hooknum, filp ? from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); - } - if (sk == NULL || sk->sk_socket == NULL) { + if (!sk) { /* * Here, the qtaguid_find_sk() using connection tracking * couldn't find the owner, so for now we just count them @@ -1732,9 +1722,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) */ if (do_tag_stat) account_for_uid(skb, sk, 0, par); - MT_DEBUG("qtaguid[%d]: leaving (sk?sk->sk_socket)=%p\n", - par->hooknum, - sk ? sk->sk_socket : NULL); + MT_DEBUG("qtaguid[%d]: leaving (sk=NULL)\n", par->hooknum); res = (info->match ^ info->invert) == 0; atomic64_inc(&qtu_events.match_no_sk); goto put_sock_ret_res; @@ -1742,19 +1730,10 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) res = false; goto put_sock_ret_res; } - filp = sk->sk_socket->file; - if (filp == NULL) { - MT_DEBUG("qtaguid[%d]: leaving filp=NULL\n", par->hooknum); - if (do_tag_stat) - account_for_uid(skb, sk, 0, par); - res = ((info->match ^ info->invert) & - (XT_QTAGUID_UID | XT_QTAGUID_GID)) == 0; - atomic64_inc(&qtu_events.match_no_sk_file); - goto put_sock_ret_res; - } - sock_uid = filp->f_cred->fsuid; + sock_uid = sk->sk_uid; if (do_tag_stat) - account_for_uid(skb, sk, from_kuid(&init_user_ns, sock_uid), par); + account_for_uid(skb, sk, from_kuid(&init_user_ns, sock_uid), + par); /* * The following two tests fail the match when: @@ -1766,8 +1745,8 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min); kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max); - if ((uid_gte(filp->f_cred->fsuid, uid_min) && - uid_lte(filp->f_cred->fsuid, uid_max)) ^ + if ((uid_gte(sock_uid, uid_min) && + uid_lte(sock_uid, uid_max)) ^ !(info->invert & XT_QTAGUID_UID)) { MT_DEBUG("qtaguid[%d]: leaving uid not matching\n", par->hooknum); @@ -1778,7 +1757,21 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) if (info->match & XT_QTAGUID_GID) { kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min); kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max); - + set_sk_callback_lock = true; + read_lock_bh(&sk->sk_callback_lock); + MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", + par->hooknum, sk, sk->sk_socket, + sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); + filp = sk->sk_socket ? 
sk->sk_socket->file : NULL; + if (!filp) { + res = ((info->match ^ info->invert) & + XT_QTAGUID_GID) == 0; + atomic64_inc(&qtu_events.match_no_sk_gid); + goto put_sock_ret_res; + } + MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", + par->hooknum, filp ? + from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); if ((gid_gte(filp->f_cred->fsgid, gid_min) && gid_lte(filp->f_cred->fsgid, gid_max)) ^ !(info->invert & XT_QTAGUID_GID)) { @@ -1950,7 +1943,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) "match_found_sk_in_ct=%llu " "match_found_no_sk_in_ct=%llu " "match_no_sk=%llu " - "match_no_sk_file=%llu\n", + "match_no_sk_gid=%llu\n", (u64)atomic64_read(&qtu_events.sockets_tagged), (u64)atomic64_read(&qtu_events.sockets_untagged), (u64)atomic64_read(&qtu_events.counter_set_changes), @@ -1962,7 +1955,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) (u64)atomic64_read(&qtu_events.match_found_sk_in_ct), (u64)atomic64_read(&qtu_events.match_found_no_sk_in_ct), (u64)atomic64_read(&qtu_events.match_no_sk), - (u64)atomic64_read(&qtu_events.match_no_sk_file)); + (u64)atomic64_read(&qtu_events.match_no_sk_gid)); /* Count the following as part of the last item_index. No need * to lock the sock_tag_list here since it is already locked when diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h index 8178fbdfb036..c7052707a6a4 100644 --- a/net/netfilter/xt_qtaguid_internal.h +++ b/net/netfilter/xt_qtaguid_internal.h @@ -289,10 +289,10 @@ struct qtaguid_event_counts { */ atomic64_t match_no_sk; /* - * The file ptr in the sk_socket wasn't there. + * The file ptr in the sk_socket wasn't there and we couldn't get GID. * This might happen for traffic while the socket is being closed. */ - atomic64_t match_no_sk_file; + atomic64_t match_no_sk_gid; }; /* Track the set active_set for the given tag. */ From 101ef8fff5eb3b130917ba8c8bfc0def91b6e423 Mon Sep 17 00:00:00 2001 From: gaurav jindal Date: Fri, 8 Sep 2017 00:07:43 +0530 Subject: [PATCH 1040/3220] drivers: cpufreq: checks to avoid kernel crash in cpufreq_interactive In cpufreq_governor_interactive, driver throws warning with WARN_ON for !tunables and event != CPUFREQ_GOV_POLICY_INIT. In case when tunables is NULL for event other than CPUFREQ_GOV_POLICY_INIT, kernel will crash as there is no safe check available before accessing tunables. So to handle such case and avoid the kernel crash, return -EINVAL if WARN_ON returns TRUE. 
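The fix leans on the fact that WARN_ON(cond) evaluates to cond in
addition to printing a backtrace, so the check and the bail-out combine
into one statement. The shape of the pattern, as a sketch (the tunables
type and function are placeholders, not the driver's actual code):

	static int governor_event(struct tunables *t, unsigned int event)
	{
		/* Warn with a backtrace, then bail out instead of
		 * dereferencing a NULL tunables pointer and crashing. */
		if (WARN_ON(!t && event != CPUFREQ_GOV_POLICY_INIT))
			return -EINVAL;
		/* ... t is now safe to use for non-init events ... */
		return 0;
	}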
Change-Id: I7a3a22d58e3c8a315a1cc1d31143649dc8807dee Signed-off-by: gaurav jindal --- drivers/cpufreq/cpufreq_interactive.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index 889c9b8b2237..52ef2ea6b65c 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -1148,7 +1148,8 @@ static int cpufreq_governor_interactive(struct cpufreq_policy *policy, else tunables = common_tunables; - WARN_ON(!tunables && (event != CPUFREQ_GOV_POLICY_INIT)); + if (WARN_ON(!tunables && (event != CPUFREQ_GOV_POLICY_INIT))) + return -EINVAL; switch (event) { case CPUFREQ_GOV_POLICY_INIT: From e698796fd28862a9ee91f004820016eb8117a195 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 6 Sep 2017 13:06:43 -0700 Subject: [PATCH 1041/3220] ANDROID: mnt: Fix freeing of mount data Fix double free on error paths Signed-off-by: Daniel Rosenberg Change-Id: I1c25a175e87e5dd5cafcdcf9d78bf4c0dc3f88ef Bug: 65386954 Fixes: 6b42d02561d3 ("ANDROID: mnt: Add filesystem private data to mount points") --- fs/namespace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 15b91b36ecab..7e14bf1c851c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -227,6 +227,7 @@ static struct mount *alloc_vfsmnt(const char *name) mnt->mnt_count = 1; mnt->mnt_writers = 0; #endif + mnt->mnt.data = NULL; INIT_HLIST_NODE(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); @@ -976,7 +977,6 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void if (!mnt) return ERR_PTR(-ENOMEM); - mnt->mnt.data = NULL; if (type->alloc_mnt_data) { mnt->mnt.data = type->alloc_mnt_data(); if (!mnt->mnt.data) { @@ -990,7 +990,6 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void root = mount_fs(type, flags, name, &mnt->mnt, data); if (IS_ERR(root)) { - kfree(mnt->mnt.data); mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_CAST(root); @@ -1094,7 +1093,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, return mnt; out_free: - kfree(mnt->mnt.data); mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_PTR(err); From eb6a6f7a16aa5ffac7e56050ec13d8618e35a462 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 19 Jul 2017 17:25:07 -0700 Subject: [PATCH 1042/3220] ANDROID: Sdcardfs: Move gid derivation under flag This moves the code to adjust the gid/uid of lower filesystem files under the mount flag derive_gid. 
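For illustration, a hypothetical mount invocation enabling the new
option (option names as defined in the token table above; the device,
target path, and uid/gid values are made up), e.g.:

	mount -t sdcardfs -o fsuid=1023,fsgid=1023,derive_gid /data/media /mnt/runtime/default

Without derive_gid, lower-filesystem ownership is left at
fs_low_uid/fs_low_gid rather than derived per user.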
Signed-off-by: Daniel Rosenberg Change-Id: I44eaad4ef67c7fcfda3b6ea3502afab94442610c Bug: 63245673 --- fs/sdcardfs/derived_perm.c | 3 +++ fs/sdcardfs/inode.c | 12 ++++++++---- fs/sdcardfs/main.c | 6 ++++++ fs/sdcardfs/sdcardfs.h | 1 + fs/sdcardfs/super.c | 2 ++ 5 files changed, 20 insertions(+), 4 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 85a60fb5ff39..e0b74dc77c5f 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -176,6 +176,9 @@ void fixup_lower_ownership(struct dentry *dentry, const char *name) gid_t gid = sbi->options.fs_low_gid; struct iattr newattrs; + if (!sbi->options.gid_derivation) + return; + info = SDCARDFS_I(d_inode(dentry)); info_d = info->data; perm = info_d->perm; diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 103dc45a131f..85eb8ebb5372 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -34,10 +34,14 @@ const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, if (!cred) return NULL; - if (data->under_obb) - uid = AID_MEDIA_OBB; - else - uid = multiuser_get_uid(data->userid, sbi->options.fs_low_uid); + if (sbi->options.gid_derivation) { + if (data->under_obb) + uid = AID_MEDIA_OBB; + else + uid = multiuser_get_uid(data->userid, sbi->options.fs_low_uid); + } else { + uid = sbi->options.fs_low_uid; + } cred->fsuid = make_kuid(&init_user_ns, uid); cred->fsgid = make_kgid(&init_user_ns, sbi->options.fs_low_gid); diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 80825b287836..1e73b3ace563 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -32,6 +32,7 @@ enum { Opt_multiuser, Opt_userid, Opt_reserved_mb, + Opt_gid_derivation, Opt_err, }; @@ -43,6 +44,7 @@ static const match_table_t sdcardfs_tokens = { {Opt_mask, "mask=%u"}, {Opt_userid, "userid=%d"}, {Opt_multiuser, "multiuser"}, + {Opt_gid_derivation, "derive_gid"}, {Opt_reserved_mb, "reserved_mb=%u"}, {Opt_err, NULL} }; @@ -64,6 +66,8 @@ static int parse_options(struct super_block *sb, char *options, int silent, vfsopts->gid = 0; /* by default, 0MB is reserved */ opts->reserved_mb = 0; + /* by default, gid derivation is off */ + opts->gid_derivation = false; *debug = 0; @@ -115,6 +119,8 @@ static int parse_options(struct super_block *sb, char *options, int silent, return 0; opts->reserved_mb = option; break; + case Opt_gid_derivation: + opts->gid_derivation = true; /* unknown option */ default: if (!silent) diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index b8624fd8b817..88b92b2f1872 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -219,6 +219,7 @@ struct sdcardfs_mount_options { gid_t fs_low_gid; userid_t fs_user_id; bool multiuser; + bool gid_derivation; unsigned int reserved_mb; }; diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index 7f4539b4b249..b89947d878e3 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -302,6 +302,8 @@ static int sdcardfs_show_options(struct vfsmount *mnt, struct seq_file *m, seq_printf(m, ",mask=%u", vfsopts->mask); if (opts->fs_user_id) seq_printf(m, ",userid=%u", opts->fs_user_id); + if (opts->gid_derivation) + seq_puts(m, ",derive_gid"); if (opts->reserved_mb != 0) seq_printf(m, ",reserved=%uMB", opts->reserved_mb); From 3ecb1c95c89295caff001a692f496fcd00ada35d Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 8 Sep 2017 17:20:06 -0700 Subject: [PATCH 1043/3220] ANDROID: sdcardfs: Add missing break Signed-off-by: Daniel Rosenberg Bug: 63245673 Change-Id: I5fc596420301045895e5a9a7e297fd05434babf9 --- 
fs/sdcardfs/main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c
index 1e73b3ace563..0a2b5167e9a2 100644
--- a/fs/sdcardfs/main.c
+++ b/fs/sdcardfs/main.c
@@ -121,6 +121,7 @@ static int parse_options(struct super_block *sb, char *options, int silent,
 			break;
 		case Opt_gid_derivation:
 			opts->gid_derivation = true;
+			break;
 		/* unknown option */
 		default:
 			if (!silent)

From 2e26e045de9385990ebca686be2130e7dbfe008f Mon Sep 17 00:00:00 2001
From: Michael Ellerman
Date: Sat, 25 Jun 2016 21:53:30 +1000
Subject: [PATCH 1044/3220] UPSTREAM: Fix build break in fork.c when
 THREAD_SIZE < PAGE_SIZE

Commit b235beea9e99 ("Clarify naming of thread info/stack allocators")
breaks the build on some powerpc configs, where THREAD_SIZE < PAGE_SIZE:

  kernel/fork.c:235:2: error: implicit declaration of function 'free_thread_stack'
  kernel/fork.c:355:8: error: assignment from incompatible pointer type
    stack = alloc_thread_stack_node(tsk, node);
          ^

Fix it by renaming free_stack() to free_thread_stack(), and updating
the return type of alloc_thread_stack_node().

Fixes: b235beea9e99 ("Clarify naming of thread info/stack allocators")
Signed-off-by: Michael Ellerman
Signed-off-by: Linus Torvalds
Bug: 38331309
Change-Id: I5b7f920b459fb84adf5fc75f83bb488b855c4deb
(cherry picked from commit 9521d39976db20f8ef9b56af66661482a17d5364)
Signed-off-by: Zubin Mithra
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 5ee818516a1c..f98b3360397c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -176,13 +176,13 @@ static inline void free_thread_stack(unsigned long *stack)
 # else
 static struct kmem_cache *thread_stack_cache;

-static struct thread_info *alloc_thread_stack_node(struct task_struct *tsk,
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 						  int node)
 {
 	return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
 }

-static void free_stack(unsigned long *stack)
+static void free_thread_stack(unsigned long *stack)
 {
 	kmem_cache_free(thread_stack_cache, stack);
 }

From 2876169271fc1455142d2eb78489814fe213ffea Mon Sep 17 00:00:00 2001
From: gaurav jindal
Date: Mon, 18 Sep 2017 00:58:43 +0530
Subject: [PATCH 1045/3220] drivers: cpufreq_interactive: handle error for
 module load fail

If cpufreq_register_governor() fails, the resources for the
speedchange_task thread should be released. Currently those resources
are released in module_exit, but if module loading fails, exit will not
be called and the resources will remain acquired, which may leave the
kernel in an unstable state.
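The general shape of the fix, sketched with hypothetical names (the
worker function and registration call are stand-ins): anything acquired
before a failed registration has to be unwound inside the init function
itself, because a failed module_init means module_exit never runs.

	static struct task_struct *worker;

	static int worker_fn(void *data)
	{
		/* ... per-thread work loop ... */
		return 0;
	}

	static int __init example_init(void)
	{
		int ret;

		worker = kthread_create(worker_fn, NULL, "worker");
		if (IS_ERR(worker))
			return PTR_ERR(worker);
		get_task_struct(worker);
		wake_up_process(worker);

		ret = register_my_thing(&my_ops);	/* hypothetical */
		if (ret) {
			/* Unwind by hand: exit() will not be called. */
			kthread_stop(worker);
			put_task_struct(worker);
		}
		return ret;
	}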
Change-Id: Ic33f058c069d30bfd114fa1c1380325c8e00b51c Signed-off-by: gaurav jindal --- drivers/cpufreq/cpufreq_interactive.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index 52ef2ea6b65c..e0a586895136 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -1316,6 +1316,7 @@ static int __init cpufreq_interactive_init(void) unsigned int i; struct cpufreq_interactive_cpuinfo *pcpu; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + int ret = 0; /* Initalize per-cpu timers */ for_each_possible_cpu(i) { @@ -1344,7 +1345,12 @@ static int __init cpufreq_interactive_init(void) /* NB: wake up so the thread does not look hung to the freezer */ wake_up_process(speedchange_task); - return cpufreq_register_governor(&cpufreq_gov_interactive); + ret = cpufreq_register_governor(&cpufreq_gov_interactive); + if (ret) { + kthread_stop(speedchange_task); + put_task_struct(speedchange_task); + } + return ret; } #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE From 9cfefbcfaab85139d3b5515d971d487f02d424e8 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Fri, 15 Sep 2017 16:17:26 -0700 Subject: [PATCH 1046/3220] ANDROID: configs: remove config fragments The kernel config fragments for Android have moved into their own repository located at https://android.googlesource.com/kernel/configs/ Bug: 63994171 Change-Id: I837bac54cb5c90e6a6eb0f6f0ad5c90588c1a46a Signed-off-by: Steve Muckle --- android/configs/README | 15 --- android/configs/android-base-arm64.cfg | 5 - android/configs/android-base.cfg | 165 ------------------------ android/configs/android-recommended.cfg | 139 -------------------- 4 files changed, 324 deletions(-) delete mode 100644 android/configs/README delete mode 100644 android/configs/android-base-arm64.cfg delete mode 100644 android/configs/android-base.cfg delete mode 100644 android/configs/android-recommended.cfg diff --git a/android/configs/README b/android/configs/README deleted file mode 100644 index 8798731f8904..000000000000 --- a/android/configs/README +++ /dev/null @@ -1,15 +0,0 @@ -The files in this directory are meant to be used as a base for an Android -kernel config. All devices should have the options in android-base.cfg enabled. -While not mandatory, the options in android-recommended.cfg enable advanced -Android features. - -Assuming you already have a minimalist defconfig for your device, a possible -way to enable these options would be: - - ARCH= scripts/kconfig/merge_config.sh /_defconfig android/configs/android-base.cfg android/configs/android-recommended.cfg - -This will generate a .config that can then be used to save a new defconfig or -compile a new kernel with Android features enabled. - -Because there is no tool to consistently generate these config fragments, -lets keep them alphabetically sorted instead of random. 
diff --git a/android/configs/android-base-arm64.cfg b/android/configs/android-base-arm64.cfg deleted file mode 100644 index 43f23d6b5391..000000000000 --- a/android/configs/android-base-arm64.cfg +++ /dev/null @@ -1,5 +0,0 @@ -# KEEP ALPHABETICALLY SORTED -CONFIG_ARMV8_DEPRECATED=y -CONFIG_CP15_BARRIER_EMULATION=y -CONFIG_SETEND_EMULATION=y -CONFIG_SWP_EMULATION=y diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg deleted file mode 100644 index 7199561cd42b..000000000000 --- a/android/configs/android-base.cfg +++ /dev/null @@ -1,165 +0,0 @@ -# KEEP ALPHABETICALLY SORTED -# CONFIG_DEVKMEM is not set -# CONFIG_DEVMEM is not set -# CONFIG_FHANDLE is not set -# CONFIG_INET_LRO is not set -# CONFIG_NFSD is not set -# CONFIG_NFS_FS is not set -# CONFIG_OABI_COMPAT is not set -# CONFIG_SYSVIPC is not set -# CONFIG_USELIB is not set -CONFIG_ANDROID=y -CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder -CONFIG_ANDROID_BINDER_IPC=y -CONFIG_ANDROID_LOW_MEMORY_KILLER=y -CONFIG_ASHMEM=y -CONFIG_AUDIT=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_CGROUPS=y -CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_SCHED=y -CONFIG_DEFAULT_SECURITY_SELINUX=y -CONFIG_EMBEDDED=y -CONFIG_FB=y -CONFIG_HARDENED_USERCOPY=y -CONFIG_HIGH_RES_TIMERS=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -CONFIG_INET6_AH=y -CONFIG_INET6_ESP=y -CONFIG_INET6_IPCOMP=y -CONFIG_INET=y -CONFIG_INET_DIAG_DESTROY=y -CONFIG_INET_ESP=y -CONFIG_INET_XFRM_MODE_TUNNEL=y -CONFIG_IP6_NF_FILTER=y -CONFIG_IP6_NF_IPTABLES=y -CONFIG_IP6_NF_MANGLE=y -CONFIG_IP6_NF_MATCH_RPFILTER=y -CONFIG_IP6_NF_RAW=y -CONFIG_IP6_NF_TARGET_REJECT=y -CONFIG_IPV6=y -CONFIG_IPV6_MIP6=y -CONFIG_IPV6_MULTIPLE_TABLES=y -CONFIG_IPV6_OPTIMISTIC_DAD=y -CONFIG_IPV6_ROUTER_PREF=y -CONFIG_IPV6_ROUTE_INFO=y -CONFIG_IP_ADVANCED_ROUTER=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_NF_ARPFILTER=y -CONFIG_IP_NF_ARPTABLES=y -CONFIG_IP_NF_ARP_MANGLE=y -CONFIG_IP_NF_FILTER=y -CONFIG_IP_NF_IPTABLES=y -CONFIG_IP_NF_MANGLE=y -CONFIG_IP_NF_MATCH_AH=y -CONFIG_IP_NF_MATCH_ECN=y -CONFIG_IP_NF_MATCH_TTL=y -CONFIG_IP_NF_NAT=y -CONFIG_IP_NF_RAW=y -CONFIG_IP_NF_SECURITY=y -CONFIG_IP_NF_TARGET_MASQUERADE=y -CONFIG_IP_NF_TARGET_NETMAP=y -CONFIG_IP_NF_TARGET_REDIRECT=y -CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODVERSIONS=y -CONFIG_NET=y -CONFIG_NETDEVICES=y -CONFIG_NETFILTER=y -CONFIG_NETFILTER_XT_MATCH_COMMENT=y -CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y -CONFIG_NETFILTER_XT_MATCH_CONNMARK=y -CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y -CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y -CONFIG_NETFILTER_XT_MATCH_HELPER=y -CONFIG_NETFILTER_XT_MATCH_IPRANGE=y -CONFIG_NETFILTER_XT_MATCH_LENGTH=y -CONFIG_NETFILTER_XT_MATCH_LIMIT=y -CONFIG_NETFILTER_XT_MATCH_MAC=y -CONFIG_NETFILTER_XT_MATCH_MARK=y -CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y -CONFIG_NETFILTER_XT_MATCH_POLICY=y -CONFIG_NETFILTER_XT_MATCH_QTAGUID=y -CONFIG_NETFILTER_XT_MATCH_QUOTA2=y -CONFIG_NETFILTER_XT_MATCH_QUOTA=y -CONFIG_NETFILTER_XT_MATCH_SOCKET=y -CONFIG_NETFILTER_XT_MATCH_STATE=y -CONFIG_NETFILTER_XT_MATCH_STATISTIC=y -CONFIG_NETFILTER_XT_MATCH_STRING=y -CONFIG_NETFILTER_XT_MATCH_TIME=y -CONFIG_NETFILTER_XT_MATCH_U32=y -CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y -CONFIG_NETFILTER_XT_TARGET_CONNMARK=y -CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y -CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y -CONFIG_NETFILTER_XT_TARGET_MARK=y -CONFIG_NETFILTER_XT_TARGET_NFLOG=y -CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y -CONFIG_NETFILTER_XT_TARGET_SECMARK=y -CONFIG_NETFILTER_XT_TARGET_TCPMSS=y 
-CONFIG_NETFILTER_XT_TARGET_TPROXY=y -CONFIG_NETFILTER_XT_TARGET_TRACE=y -CONFIG_NET_CLS_ACT=y -CONFIG_NET_CLS_U32=y -CONFIG_NET_EMATCH=y -CONFIG_NET_EMATCH_U32=y -CONFIG_NET_KEY=y -CONFIG_NET_SCHED=y -CONFIG_NET_SCH_HTB=y -CONFIG_NF_CONNTRACK=y -CONFIG_NF_CONNTRACK_AMANDA=y -CONFIG_NF_CONNTRACK_EVENTS=y -CONFIG_NF_CONNTRACK_FTP=y -CONFIG_NF_CONNTRACK_H323=y -CONFIG_NF_CONNTRACK_IPV4=y -CONFIG_NF_CONNTRACK_IPV6=y -CONFIG_NF_CONNTRACK_IRC=y -CONFIG_NF_CONNTRACK_NETBIOS_NS=y -CONFIG_NF_CONNTRACK_PPTP=y -CONFIG_NF_CONNTRACK_SANE=y -CONFIG_NF_CONNTRACK_SECMARK=y -CONFIG_NF_CONNTRACK_TFTP=y -CONFIG_NF_CT_NETLINK=y -CONFIG_NF_CT_PROTO_DCCP=y -CONFIG_NF_CT_PROTO_SCTP=y -CONFIG_NF_CT_PROTO_UDPLITE=y -CONFIG_NF_NAT=y -CONFIG_NO_HZ=y -CONFIG_PACKET=y -CONFIG_PM_AUTOSLEEP=y -CONFIG_PM_WAKELOCKS=y -CONFIG_PPP=y -CONFIG_PPPOLAC=y -CONFIG_PPPOPNS=y -CONFIG_PPP_BSDCOMP=y -CONFIG_PPP_DEFLATE=y -CONFIG_PPP_MPPE=y -CONFIG_PREEMPT=y -CONFIG_PROFILING=y -CONFIG_RANDOMIZE_BASE=y -CONFIG_RTC_CLASS=y -CONFIG_RT_GROUP_SCHED=y -CONFIG_SECCOMP=y -CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y -CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y -CONFIG_SECURITY_SELINUX=y -CONFIG_STAGING=y -CONFIG_SYNC=y -CONFIG_TUN=y -CONFIG_UID_SYS_STATS=y -CONFIG_UNIX=y -CONFIG_USB_CONFIGFS=y -CONFIG_USB_CONFIGFS_F_ACC=y -CONFIG_USB_CONFIGFS_F_AUDIO_SRC=y -CONFIG_USB_CONFIGFS_F_FS=y -CONFIG_USB_CONFIGFS_F_MIDI=y -CONFIG_USB_CONFIGFS_F_MTP=y -CONFIG_USB_CONFIGFS_F_PTP=y -CONFIG_USB_CONFIGFS_UEVENT=y -CONFIG_USB_GADGET=y -CONFIG_XFRM_USER=y diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg deleted file mode 100644 index 3d7e5e168940..000000000000 --- a/android/configs/android-recommended.cfg +++ /dev/null @@ -1,139 +0,0 @@ -# KEEP ALPHABETICALLY SORTED -# CONFIG_AIO is not set -# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -# CONFIG_INPUT_MOUSE is not set -# CONFIG_LEGACY_PTYS is not set -# CONFIG_NF_CONNTRACK_SIP is not set -# CONFIG_PM_WAKELOCKS_GC is not set -# CONFIG_VT is not set -CONFIG_ANDROID_TIMED_GPIO=y -CONFIG_ARM64_SW_TTBR0_PAN=y -CONFIG_ARM_KERNMEM_PERMS=y -CONFIG_BACKLIGHT_LCD_SUPPORT=y -CONFIG_BLK_DEV_DM=y -CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=8192 -CONFIG_CC_STACKPROTECTOR_STRONG=y -CONFIG_COMPACTION=y -CONFIG_CPU_SW_DOMAIN_PAN=y -CONFIG_DEBUG_RODATA=y -CONFIG_DM_CRYPT=y -CONFIG_DM_UEVENT=y -CONFIG_DM_VERITY=y -CONFIG_DM_VERITY_FEC=y -CONFIG_DRAGONRISE_FF=y -CONFIG_ENABLE_DEFAULT_TRACERS=y -CONFIG_EXT4_FS=y -CONFIG_EXT4_FS_SECURITY=y -CONFIG_FUSE_FS=y -CONFIG_GREENASIA_FF=y -CONFIG_HIDRAW=y -CONFIG_HID_A4TECH=y -CONFIG_HID_ACRUX=y -CONFIG_HID_ACRUX_FF=y -CONFIG_HID_APPLE=y -CONFIG_HID_BELKIN=y -CONFIG_HID_CHERRY=y -CONFIG_HID_CHICONY=y -CONFIG_HID_CYPRESS=y -CONFIG_HID_DRAGONRISE=y -CONFIG_HID_ELECOM=y -CONFIG_HID_EMS_FF=y -CONFIG_HID_EZKEY=y -CONFIG_HID_GREENASIA=y -CONFIG_HID_GYRATION=y -CONFIG_HID_HOLTEK=y -CONFIG_HID_KENSINGTON=y -CONFIG_HID_KEYTOUCH=y -CONFIG_HID_KYE=y -CONFIG_HID_LCPOWER=y -CONFIG_HID_LOGITECH=y -CONFIG_HID_LOGITECH_DJ=y -CONFIG_HID_MAGICMOUSE=y -CONFIG_HID_MICROSOFT=y -CONFIG_HID_MONTEREY=y -CONFIG_HID_MULTITOUCH=y -CONFIG_HID_NTRIG=y -CONFIG_HID_ORTEK=y -CONFIG_HID_PANTHERLORD=y -CONFIG_HID_PETALYNX=y -CONFIG_HID_PICOLCD=y -CONFIG_HID_PRIMAX=y -CONFIG_HID_PRODIKEYS=y -CONFIG_HID_ROCCAT=y -CONFIG_HID_SAITEK=y -CONFIG_HID_SAMSUNG=y -CONFIG_HID_SMARTJOYPLUS=y -CONFIG_HID_SONY=y -CONFIG_HID_SPEEDLINK=y -CONFIG_HID_SUNPLUS=y -CONFIG_HID_THRUSTMASTER=y -CONFIG_HID_TIVO=y -CONFIG_HID_TOPSEED=y -CONFIG_HID_TWINHAN=y 
-CONFIG_HID_UCLOGIC=y -CONFIG_HID_WACOM=y -CONFIG_HID_WALTOP=y -CONFIG_HID_WIIMOTE=y -CONFIG_HID_ZEROPLUS=y -CONFIG_HID_ZYDACRON=y -CONFIG_INPUT_EVDEV=y -CONFIG_INPUT_GPIO=y -CONFIG_INPUT_JOYSTICK=y -CONFIG_INPUT_KEYCHORD=y -CONFIG_INPUT_KEYRESET=y -CONFIG_INPUT_MISC=y -CONFIG_INPUT_TABLET=y -CONFIG_INPUT_UINPUT=y -CONFIG_ION=y -CONFIG_JOYSTICK_XPAD=y -CONFIG_JOYSTICK_XPAD_FF=y -CONFIG_JOYSTICK_XPAD_LEDS=y -CONFIG_KALLSYMS_ALL=y -CONFIG_KSM=y -CONFIG_LOGIG940_FF=y -CONFIG_LOGIRUMBLEPAD2_FF=y -CONFIG_LOGITECH_FF=y -CONFIG_MD=y -CONFIG_MEDIA_SUPPORT=y -CONFIG_MEMORY_STATE_TIME=y -CONFIG_MSDOS_FS=y -CONFIG_PANIC_TIMEOUT=5 -CONFIG_PANTHERLORD_FF=y -CONFIG_PERF_EVENTS=y -CONFIG_PM_DEBUG=y -CONFIG_PM_RUNTIME=y -CONFIG_PM_WAKELOCKS_LIMIT=0 -CONFIG_POWER_SUPPLY=y -CONFIG_PSTORE=y -CONFIG_PSTORE_CONSOLE=y -CONFIG_PSTORE_RAM=y -CONFIG_QFMT_V2=y -CONFIG_QUOTA=y -CONFIG_QUOTACTL=y -CONFIG_QUOTA_NETLINK_INTERFACE=y -CONFIG_QUOTA_TREE=y -CONFIG_SCHEDSTATS=y -CONFIG_SMARTJOYPLUS_FF=y -CONFIG_SND=y -CONFIG_SOUND=y -CONFIG_SUSPEND_TIME=y -CONFIG_TABLET_USB_ACECAD=y -CONFIG_TABLET_USB_AIPTEK=y -CONFIG_TABLET_USB_GTCO=y -CONFIG_TABLET_USB_HANWANG=y -CONFIG_TABLET_USB_KBTAB=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_TASK_IO_ACCOUNTING=y -CONFIG_TASK_XACCT=y -CONFIG_TIMER_STATS=y -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y -CONFIG_UHID=y -CONFIG_USB_ANNOUNCE_NEW_DEVICES=y -CONFIG_USB_EHCI_HCD=y -CONFIG_USB_HIDDEV=y -CONFIG_USB_USBNET=y -CONFIG_VFAT_FS=y From 849c7764d8081c029cd7d5d2a5f9ace1a4bd084f Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Fri, 8 Sep 2017 02:09:26 -0400 Subject: [PATCH 1047/3220] FROMLIST: android: binder: Drop lru lock in isolate callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (from https://patchwork.kernel.org/patch/9945123/) Drop the global lru lock in isolate callback before calling zap_page_range which calls cond_resched, and re-acquire the global lru lock before returning. Also change return code to LRU_REMOVED_RETRY. Use mmput_async when fail to acquire mmap sem in an atomic context. Fix "BUG: sleeping function called from invalid context" errors when CONFIG_DEBUG_ATOMIC_SLEEP is enabled. 
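Sketched generically (callback shape per the list_lru API of this era;
teardown_item() is a stand-in for the zap_page_range() path, not a real
function): isolate the item and drop the lock before anything that can
sleep, then re-take the lock and return LRU_REMOVED_RETRY so the walker
revalidates its position.

	static enum lru_status isolate_cb(struct list_head *item,
					  struct list_lru_one *lru,
					  spinlock_t *lock, void *cb_arg)
	{
		list_lru_isolate(lru, item);	/* unlink while still locked */
		spin_unlock(lock);		/* next step may cond_resched() */

		teardown_item(item);		/* sleeping work happens here */

		spin_lock(lock);		/* walker expects the lock held */
		return LRU_REMOVED_RETRY;	/* lock was dropped: re-walk */
	}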
Bug: 63926541 Change-Id: I45dbada421b715abed9a66d03d30ae2285671ca1 Fixes: f2517eb76f1f2 ("android: binder: Add global lru shrinker to binder") Reported-by: Kyle Yan Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 1c1a7d9c86ed..9839afa84dfd 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -913,6 +913,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, struct binder_alloc *alloc; uintptr_t page_addr; size_t index; + struct vm_area_struct *vma; alloc = page->alloc; if (!mutex_trylock(&alloc->mutex)) @@ -923,16 +924,22 @@ enum lru_status binder_alloc_free_page(struct list_head *item, index = page - alloc->pages; page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; - if (alloc->vma) { + vma = alloc->vma; + if (vma) { mm = get_task_mm(alloc->tsk); if (!mm) goto err_get_task_mm_failed; if (!down_write_trylock(&mm->mmap_sem)) goto err_down_write_mmap_sem_failed; + } + list_lru_isolate(lru, item); + spin_unlock(lock); + + if (vma) { trace_binder_unmap_user_start(alloc, index); - zap_page_range(alloc->vma, + zap_page_range(vma, page_addr + alloc->user_buffer_offset, PAGE_SIZE, NULL); @@ -951,13 +958,12 @@ enum lru_status binder_alloc_free_page(struct list_head *item, trace_binder_unmap_kernel_end(alloc, index); - list_lru_isolate(lru, item); - + spin_lock(lock); mutex_unlock(&alloc->mutex); - return LRU_REMOVED; + return LRU_REMOVED_RETRY; err_down_write_mmap_sem_failed: - mmput(mm); + mmput_async(mm); err_get_task_mm_failed: err_page_already_freed: mutex_unlock(&alloc->mutex); From e6fa28a9a9c0f4ad120f1b6d4bfe2300e658d514 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Fri, 15 Sep 2017 20:40:03 -0400 Subject: [PATCH 1048/3220] FROMLIST: android: binder: Remove unused vma argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (from https://patchwork.kernel.org/patch/9954123/) The vma argument in update_page_range is no longer used after 74310e06 ("android: binder: Move buffer out of area shared with user space"), since mmap_handler no longer calls update_page_range with a vma. 
Test: ran binderLibTest, throughputtest, interfacetest and mempressure Bug: 36007193 Change-Id: Ibd6f24c11750f8f7e6ed56e40dd18c08e02ace25 Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 9839afa84dfd..cfc3f5664015 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -186,12 +186,12 @@ struct binder_buffer *binder_alloc_prepare_to_free(struct binder_alloc *alloc, } static int binder_update_page_range(struct binder_alloc *alloc, int allocate, - void *start, void *end, - struct vm_area_struct *vma) + void *start, void *end) { void *page_addr; unsigned long user_page_addr; struct binder_lru_page *page; + struct vm_area_struct *vma = NULL; struct mm_struct *mm = NULL; bool need_mm = false; @@ -215,7 +215,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (!vma && need_mm) + if (need_mm) mm = get_task_mm(alloc->tsk); if (mm) { @@ -442,7 +442,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, if (end_page_addr > has_page_addr) end_page_addr = has_page_addr; ret = binder_update_page_range(alloc, 1, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL); + (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr); if (ret) return ERR_PTR(ret); @@ -483,7 +483,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, err_alloc_buf_struct_failed: binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), - end_page_addr, NULL); + end_page_addr); return ERR_PTR(-ENOMEM); } @@ -567,8 +567,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, alloc->pid, buffer->data, prev->data, next->data); binder_update_page_range(alloc, 0, buffer_start_page(buffer), - buffer_start_page(buffer) + PAGE_SIZE, - NULL); + buffer_start_page(buffer) + PAGE_SIZE); } list_del(&buffer->entry); kfree(buffer); @@ -605,8 +604,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), - (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK), - NULL); + (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK)); rb_erase(&buffer->rb_node, &alloc->allocated_buffers); buffer->free = 1; From 9b9d7cf19106f1b2429ca192b8bfd8ffce7b859b Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Fri, 15 Sep 2017 21:12:15 -0400 Subject: [PATCH 1049/3220] FROMLIST: android: binder: Don't get mm from task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (from https://patchwork.kernel.org/patch/9954125/) Use binder_alloc struct's mm_struct rather than getting a reference to the mm struct through get_task_mm to avoid a potential deadlock between lru lock, task lock and dentry lock, since a thread can be holding the task lock and the dentry lock while trying to acquire the lru lock. 
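
In short, the mm lifetime handling after this patch follows three steps
(a distilled sketch of the pattern in the diff below; alloc->mutex
locking and most error handling are elided):

  /* 1. At mmap time: pin the mm_struct itself (mm_count), as mmgrab()
   *    does in later kernels, so alloc->vma_vm_mm stays valid. */
  alloc->vma = vma;
  alloc->vma_vm_mm = vma->vm_mm;
  atomic_inc(&alloc->vma_vm_mm->mm_count);

  /* 2. In the shrinker callback: take a temporary mm_users reference,
   *    as mmget_not_zero() does in later kernels, instead of going
   *    through get_task_mm() and the task lock. */
  if (!atomic_inc_not_zero(&alloc->vma_vm_mm->mm_users))
	goto err_mmget;
  mm = alloc->vma_vm_mm;
  if (!down_write_trylock(&mm->mmap_sem)) {
	mmput_async(mm);	/* still atomic here; mmput() might sleep */
	goto err_mmget;
  }
  /* ... zap the page, up_write(&mm->mmap_sem), then mmput(mm) ... */

  /* 3. On deferred release: drop the pin taken at mmap time. */
  if (alloc->vma_vm_mm)
	mmdrop(alloc->vma_vm_mm);
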
Test: ran binderLibTest, throughputtest, interfacetest and mempressure w/lockdep Bug: 63926541 Change-Id: Icc661404eb7a4a2ecc5234b1bf8f0104665f9b45 Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 25 ++++++++++++------------- drivers/android/binder_alloc.h | 1 - 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index cfc3f5664015..b95da16fd938 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -215,17 +215,13 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (need_mm) - mm = get_task_mm(alloc->tsk); + /* Same as mmget_not_zero() in later kernel versions */ + if (need_mm && atomic_inc_not_zero(&alloc->vma_vm_mm->mm_users)) + mm = alloc->vma_vm_mm; if (mm) { down_write(&mm->mmap_sem); vma = alloc->vma; - if (vma && mm != alloc->vma_vm_mm) { - pr_err("%d: vma mm and task mm mismatch\n", - alloc->pid); - vma = NULL; - } } if (!vma && need_mm) { @@ -718,6 +714,8 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, barrier(); alloc->vma = vma; alloc->vma_vm_mm = vma->vm_mm; + /* Same as mmgrab() in later kernel versions */ + atomic_inc(&alloc->vma_vm_mm->mm_count); return 0; @@ -793,6 +791,8 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) vfree(alloc->buffer); } mutex_unlock(&alloc->mutex); + if (alloc->vma_vm_mm) + mmdrop(alloc->vma_vm_mm); binder_alloc_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d buffers %d, pages %d\n", @@ -887,7 +887,6 @@ int binder_alloc_get_allocated_count(struct binder_alloc *alloc) void binder_alloc_vma_close(struct binder_alloc *alloc) { WRITE_ONCE(alloc->vma, NULL); - WRITE_ONCE(alloc->vma_vm_mm, NULL); } /** @@ -924,9 +923,10 @@ enum lru_status binder_alloc_free_page(struct list_head *item, page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; vma = alloc->vma; if (vma) { - mm = get_task_mm(alloc->tsk); - if (!mm) - goto err_get_task_mm_failed; + /* Same as mmget_not_zero() in later kernel versions */ + if (!atomic_inc_not_zero(&alloc->vma_vm_mm->mm_users)) + goto err_mmget; + mm = alloc->vma_vm_mm; if (!down_write_trylock(&mm->mmap_sem)) goto err_down_write_mmap_sem_failed; } @@ -962,7 +962,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, err_down_write_mmap_sem_failed: mmput_async(mm); -err_get_task_mm_failed: +err_mmget: err_page_already_freed: mutex_unlock(&alloc->mutex); err_get_alloc_mutex_failed: @@ -1001,7 +1001,6 @@ struct shrinker binder_shrinker = { */ void binder_alloc_init(struct binder_alloc *alloc) { - alloc->tsk = current->group_leader; alloc->pid = current->group_leader->pid; mutex_init(&alloc->mutex); INIT_LIST_HEAD(&alloc->buffers); diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index a3a3602c689c..2dd33b6df104 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -100,7 +100,6 @@ struct binder_lru_page { */ struct binder_alloc { struct mutex mutex; - struct task_struct *tsk; struct vm_area_struct *vma; struct mm_struct *vma_vm_mm; void *buffer; From 047200481e7f98e6334d751010c9470efb531b71 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:21 -0700 Subject: [PATCH 1050/3220] BACKPORT: partial: mm, oom_reaper: do not mmput synchronously from the oom reaper context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry picked from commit ec8d7c14ea14922fe21945b458a75e39f11dd832) Tetsuo has properly 
noted that the mmput slow path might get blocked waiting for another
party (e.g. exit_aio waits for an IO). If that happens the oom_reaper
would be put out of the way and would not be able to process the next
oom victim. We should strive to make this context as reliable and as
independent of other subsystems as possible.

Introduce mmput_async, which will perform the slow path from an async
(WQ) context. This will delay the operation, but that shouldn't be a
problem because in most cases the oom_reaper will already have reclaimed
as much of the victim's address space as possible, and the remaining
context shouldn't bind too much memory anymore. The only exception is
when the mmap_sem trylock has failed, which shouldn't happen too often.

The issue is only theoretical but not impossible.

Signed-off-by: Michal Hocko
Reported-by: Tetsuo Handa
Cc: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds

Only backports mmput_async.

Change-Id: I5fe54abcc629e7d9eab9fe03908903d1174177f1
Signed-off-by: Arve Hjønnevåg
---
 include/linux/mm_types.h |  2 ++
 include/linux/sched.h    |  5 ++++
 kernel/fork.c            | 52 +++++++++++++++++++++++++++-------------
 3 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index b32cb7add09c..907f94428ccc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -523,6 +524,7 @@ struct mm_struct {
 #ifdef CONFIG_HUGETLB_PAGE
 	atomic_long_t hugetlb_usage;
 #endif
+	struct work_struct async_put_work;
 };

 static inline void mm_init_cpumask(struct mm_struct *mm)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a6800fb3f4ea..d0d0b1b45418 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2764,6 +2764,11 @@ static inline void mmdrop(struct mm_struct * mm)
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
+/* same as above but performs the slow path from the async kontext. Can
+ * be called from the atomic context as well
+ */
+extern void mmput_async(struct mm_struct *);
+
 /* Grab a reference to a task's mm, if it is not already going away */
 extern struct mm_struct *get_task_mm(struct task_struct *task);
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index f98b3360397c..2633d5f61d3c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -695,6 +695,26 @@ void __mmdrop(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(__mmdrop);

+static inline void __mmput(struct mm_struct *mm)
+{
+	VM_BUG_ON(atomic_read(&mm->mm_users));
+
+	uprobe_clear_state(mm);
+	exit_aio(mm);
+	ksm_exit(mm);
+	khugepaged_exit(mm); /* must run before exit_mmap */
+	exit_mmap(mm);
+	set_mm_exe_file(mm, NULL);
+	if (!list_empty(&mm->mmlist)) {
+		spin_lock(&mmlist_lock);
+		list_del(&mm->mmlist);
+		spin_unlock(&mmlist_lock);
+	}
+	if (mm->binfmt)
+		module_put(mm->binfmt->module);
+	mmdrop(mm);
+}
+
 /*
  * Decrement the use count and release all resources for an mm.
 */
@@ -702,25 +722,25 @@ void mmput(struct mm_struct *mm)
 {
 	might_sleep();

-	if (atomic_dec_and_test(&mm->mm_users)) {
-		uprobe_clear_state(mm);
-		exit_aio(mm);
-		ksm_exit(mm);
-		khugepaged_exit(mm); /* must run before exit_mmap */
-		exit_mmap(mm);
-		set_mm_exe_file(mm, NULL);
-		if (!list_empty(&mm->mmlist)) {
-			spin_lock(&mmlist_lock);
-			list_del(&mm->mmlist);
-			spin_unlock(&mmlist_lock);
-		}
-		if (mm->binfmt)
-			module_put(mm->binfmt->module);
-		mmdrop(mm);
-	}
+	if (atomic_dec_and_test(&mm->mm_users))
+		__mmput(mm);
 }
 EXPORT_SYMBOL_GPL(mmput);

+static void mmput_async_fn(struct work_struct *work)
+{
+	struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+	__mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
+	if (atomic_dec_and_test(&mm->mm_users)) {
+		INIT_WORK(&mm->async_put_work, mmput_async_fn);
+		schedule_work(&mm->async_put_work);
+	}
+}
+
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
  *

From 650b6a5c418563f2a6cb4f12b3402a5b4870eea8 Mon Sep 17 00:00:00 2001
From: Vikram Mulukutla
Date: Thu, 21 Sep 2017 17:24:24 -0700
Subject: [PATCH 1051/3220] Revert "ANDROID: sched/tune: Initialize raw_spin_lock in boosted_groups"

This reverts commit c5616f2f874faa20b59b116177b99bf3948586df.

If we re-init the per-cpu boostgroup spinlock every time that we add a
new boosted cgroup, we can easily wipe out (reinit) a spinlock struct
while in a critical section. We should only be setting up the per-cpu
boostgroup data, and the spin_lock initialization need only happen once
- which we're already doing in a postcore_initcall.

For example:

     -------- CPU 0 --------      | -------- CPU1 --------
  cgroupX boost group added       |
  schedtune_enqueue_task          |
    acquires(bg->lock)            | cgroupY boost group added
                                  |   for_each_cpu()
                                  |     raw_spin_lock_init(bg->lock)
    releases(bg->lock)            |
      BUG (already unlocked)      |
                                  |

This results in the following BUG from the debug spinlock code:
BUG: spinlock already unlocked on CPU#5, rcuop/6/68

Change-Id: I3016702780b461a0cd95e26c538cd18df27d6316
Signed-off-by: Vikram Mulukutla
---
 kernel/sched/tune.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index 86d04caf1684..d444fc1a4d58 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -647,7 +647,6 @@ schedtune_boostgroup_init(struct schedtune *st)
 		bg = &per_cpu(cpu_boost_groups, cpu);
 		bg->group[st->idx].boost = 0;
 		bg->group[st->idx].tasks = 0;
-		raw_spin_lock_init(&bg->lock);
 	}

 	return 0;

From c1286ff41c2f610df0cc853e78819ca67a48ffe2 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Wed, 12 Oct 2016 17:18:56 -0700
Subject: [PATCH 1052/3220] f2fs: backport from (4c1fad64 - Merge tag 'for-f2fs-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs)

Signed-off-by: Jaegeuk Kim
---
 Documentation/filesystems/f2fs.txt         |   18 +-
 fs/Kconfig                                 |    2 +
 fs/Makefile                                |    1 +
 fs/crypto/Kconfig                          |   18 +
 fs/crypto/Makefile                         |    3 +
 fs/crypto/crypto.c                         |  568 +++++++
 fs/{f2fs/crypto_fname.c => crypto/fname.c} |  278 ++--
 fs/crypto/keyinfo.c                        |  304 +++++
 fs/crypto/policy.c                         |  246 ++++
 fs/f2fs/Kconfig                            |   21 +-
 fs/f2fs/Makefile                           |    2 -
 fs/f2fs/acl.c                              |   29 +-
 fs/f2fs/acl.h                              |    3 +-
 fs/f2fs/checkpoint.c                       |  529 +++++---
 fs/f2fs/crypto.c                           |  491 --------
 fs/f2fs/data.c                             | 1250 ++++++++++--------
 fs/f2fs/debug.c                            |   80 +-
 fs/f2fs/dir.c                              |  477 +++----
 fs/f2fs/extent_cache.c                     |  315 ++---
 fs/f2fs/f2fs.h                             | 1001 +++++++++------
 fs/f2fs/file.c                             | 1328 ++++++++++++++------
 fs/f2fs/gc.c                               |  353 ++++--
 fs/f2fs/gc.h                               |    8 -
 fs/f2fs/hash.c                             |    7 +-
 fs/f2fs/inline.c                           |  268 ++--
 fs/f2fs/inode.c                            |  176 +--
fs/f2fs/namei.c | 409 +++--- fs/f2fs/node.c | 733 +++++++---- fs/f2fs/node.h | 123 +- fs/f2fs/recovery.c | 274 ++-- fs/f2fs/segment.c | 698 +++++----- fs/f2fs/segment.h | 47 +- fs/f2fs/shrinker.c | 8 +- fs/f2fs/super.c | 788 +++++++++--- fs/f2fs/trace.c | 6 +- fs/f2fs/xattr.c | 69 +- fs/f2fs/xattr.h | 3 +- include/linux/dcache.h | 1 + include/linux/f2fs_fs.h | 44 +- include/linux/fs.h | 7 + include/linux/fscrypto.h | 435 +++++++ include/trace/events/f2fs.h | 64 +- include/uapi/linux/fs.h | 18 + 43 files changed, 7560 insertions(+), 3943 deletions(-) create mode 100644 fs/crypto/Kconfig create mode 100644 fs/crypto/Makefile create mode 100644 fs/crypto/crypto.c rename fs/{f2fs/crypto_fname.c => crypto/fname.c} (53%) create mode 100644 fs/crypto/keyinfo.c create mode 100644 fs/crypto/policy.c delete mode 100644 fs/f2fs/crypto.c create mode 100644 include/linux/fscrypto.h diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index b102b436563e..753dd4f96afe 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -102,14 +102,16 @@ background_gc=%s Turn on/off cleaning operations, namely garbage collection, triggered in background when I/O subsystem is idle. If background_gc=on, it will turn on the garbage collection and if background_gc=off, garbage collection - will be truned off. If background_gc=sync, it will turn + will be turned off. If background_gc=sync, it will turn on synchronous garbage collection running in background. Default value for this option is on. So garbage collection is on by default. disable_roll_forward Disable the roll-forward recovery routine norecovery Disable the roll-forward recovery routine, mounted read- only (i.e., -o ro,disable_roll_forward) -discard Issue discard/TRIM commands when a segment is cleaned. +discard/nodiscard Enable/disable real-time discard in f2fs, if discard is + enabled, f2fs will issue discard/TRIM commands when a + segment is cleaned. no_heap Disable heap-style segment allocation which finds free segments for data from the beginning of main area, while for node from the end of main area. @@ -129,6 +131,7 @@ inline_dentry Enable the inline dir feature: data in new created directory entries can be written into inode block. The space of inode block which is used to store inline dentries is limited to ~3.4k. +noinline_dentry Diable the inline dentry feature. flush_merge Merge concurrent cache_flush commands as much as possible to eliminate redundant command issues. If the underlying device handles the cache_flush command relatively slowly, @@ -145,10 +148,15 @@ extent_cache Enable an extent cache based on rb-tree, it can cache as many as extent which map between contiguous logical address and physical address per inode, resulting in increasing the cache hit ratio. Set by default. -noextent_cache Diable an extent cache based on rb-tree explicitly, see +noextent_cache Disable an extent cache based on rb-tree explicitly, see the above extent_cache mount option. noinline_data Disable the inline data feature, inline data feature is enabled by default. +data_flush Enable data flushing before checkpoint in order to + persist data of regular and symlink. +mode=%s Control block allocation mode which supports "adaptive" + and "lfs". In "lfs" mode, there should be no random + writes towards main area. ================================================================================ DEBUGFS ENTRIES @@ -192,7 +200,7 @@ Files in /sys/fs/f2fs/ policy for garbage collection. 
Setting gc_idle = 0 (default) will disable this option. Setting gc_idle = 1 will select the Cost Benefit approach - & setting gc_idle = 2 will select the greedy aproach. + & setting gc_idle = 2 will select the greedy approach. reclaim_segments This parameter controls the number of prefree segments to be reclaimed. If the number of prefree @@ -298,7 +306,7 @@ The dump.f2fs shows the information of specific inode and dumps SSA and SIT to file. Each file is dump_ssa and dump_sit. The dump.f2fs is used to debug on-disk data structures of the f2fs filesystem. -It shows on-disk inode information reconized by a given inode number, and is +It shows on-disk inode information recognized by a given inode number, and is able to dump all the SSA and SIT entries into predefined files, ./dump_ssa and ./dump_sit respectively. diff --git a/fs/Kconfig b/fs/Kconfig index a5d2dc39ba07..80af05163579 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -73,6 +73,8 @@ config FILE_LOCKING for filesystems like NFS and for the flock() system call. Disabling this option saves about 11k. +source "fs/crypto/Kconfig" + source "fs/notify/Kconfig" source "fs/quota/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index dee237540bc0..4644db462ba9 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_FS_DAX) += dax.o +obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig new file mode 100644 index 000000000000..92348faf9865 --- /dev/null +++ b/fs/crypto/Kconfig @@ -0,0 +1,18 @@ +config FS_ENCRYPTION + tristate "FS Encryption (Per-file encryption)" + depends on BLOCK + select CRYPTO + select CRYPTO_AES + select CRYPTO_CBC + select CRYPTO_ECB + select CRYPTO_XTS + select CRYPTO_CTS + select CRYPTO_CTR + select CRYPTO_SHA256 + select KEYS + select ENCRYPTED_KEYS + help + Enable encryption of files and directories. This + feature is similar to ecryptfs, but it is more memory + efficient since it avoids caching the encrypted and + decrypted pages in the page cache. diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile new file mode 100644 index 000000000000..f17684c48739 --- /dev/null +++ b/fs/crypto/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o + +fscrypto-y := crypto.o fname.o policy.o keyinfo.o diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c new file mode 100644 index 000000000000..2fc8c43ce531 --- /dev/null +++ b/fs/crypto/crypto.c @@ -0,0 +1,568 @@ +/* + * This contains encryption functions for per-file encryption. + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * Add fscrypt_pullback_bio_page() + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int num_prealloc_crypto_pages = 32; +static unsigned int num_prealloc_crypto_ctxs = 128; + +module_param(num_prealloc_crypto_pages, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_pages, + "Number of crypto pages to preallocate"); +module_param(num_prealloc_crypto_ctxs, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_ctxs, + "Number of crypto contexts to preallocate"); + +static mempool_t *fscrypt_bounce_page_pool = NULL; + +static LIST_HEAD(fscrypt_free_ctxs); +static DEFINE_SPINLOCK(fscrypt_ctx_lock); + +static struct workqueue_struct *fscrypt_read_workqueue; +static DEFINE_MUTEX(fscrypt_init_mutex); + +static struct kmem_cache *fscrypt_ctx_cachep; +struct kmem_cache *fscrypt_info_cachep; + +/** + * fscrypt_release_ctx() - Releases an encryption context + * @ctx: The encryption context to release. + * + * If the encryption context was allocated from the pre-allocated pool, returns + * it to that pool. Else, frees it. + * + * If there's a bounce page in the context, this frees that. + */ +void fscrypt_release_ctx(struct fscrypt_ctx *ctx) +{ + unsigned long flags; + + if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) { + mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool); + ctx->w.bounce_page = NULL; + } + ctx->w.control_page = NULL; + if (ctx->flags & FS_CTX_REQUIRES_FREE_ENCRYPT_FL) { + kmem_cache_free(fscrypt_ctx_cachep, ctx); + } else { + spin_lock_irqsave(&fscrypt_ctx_lock, flags); + list_add(&ctx->free_list, &fscrypt_free_ctxs); + spin_unlock_irqrestore(&fscrypt_ctx_lock, flags); + } +} +EXPORT_SYMBOL(fscrypt_release_ctx); + +/** + * fscrypt_get_ctx() - Gets an encryption context + * @inode: The inode for which we are doing the crypto + * @gfp_flags: The gfp flag for memory allocation + * + * Allocates and initializes an encryption context. + * + * Return: An allocated and initialized encryption context on success; error + * value or NULL otherwise. + */ +struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) +{ + struct fscrypt_ctx *ctx = NULL; + struct fscrypt_info *ci = inode->i_crypt_info; + unsigned long flags; + + if (ci == NULL) + return ERR_PTR(-ENOKEY); + + /* + * We first try getting the ctx from a free list because in + * the common case the ctx will have an allocated and + * initialized crypto tfm, so it's probably a worthwhile + * optimization. For the bounce page, we first try getting it + * from the kernel allocator because that's just about as fast + * as getting it from a list and because a cache of free pages + * should generally be a "last resort" option for a filesystem + * to be able to do its job. 
+ */ + spin_lock_irqsave(&fscrypt_ctx_lock, flags); + ctx = list_first_entry_or_null(&fscrypt_free_ctxs, + struct fscrypt_ctx, free_list); + if (ctx) + list_del(&ctx->free_list); + spin_unlock_irqrestore(&fscrypt_ctx_lock, flags); + if (!ctx) { + ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, gfp_flags); + if (!ctx) + return ERR_PTR(-ENOMEM); + ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } + ctx->flags &= ~FS_WRITE_PATH_FL; + return ctx; +} +EXPORT_SYMBOL(fscrypt_get_ctx); + +/** + * fscrypt_complete() - The completion callback for page encryption + * @req: The asynchronous encryption request context + * @res: The result of the encryption operation + */ +static void fscrypt_complete(struct crypto_async_request *req, int res) +{ + struct fscrypt_completion_result *ecr = req->data; + + if (res == -EINPROGRESS) + return; + ecr->res = res; + complete(&ecr->completion); +} + +typedef enum { + FS_DECRYPT = 0, + FS_ENCRYPT, +} fscrypt_direction_t; + +static int do_page_crypto(struct inode *inode, + fscrypt_direction_t rw, pgoff_t index, + struct page *src_page, struct page *dest_page, + gfp_t gfp_flags) +{ + u8 xts_tweak[FS_XTS_TWEAK_SIZE]; + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); + struct scatterlist dst, src; + struct fscrypt_info *ci = inode->i_crypt_info; + struct crypto_skcipher *tfm = ci->ci_ctfm; + int res = 0; + + req = skcipher_request_alloc(tfm, gfp_flags); + if (!req) { + printk_ratelimited(KERN_ERR + "%s: crypto_request_alloc() failed\n", + __func__); + return -ENOMEM; + } + + skcipher_request_set_callback( + req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + fscrypt_complete, &ecr); + + BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index)); + memcpy(xts_tweak, &index, sizeof(index)); + memset(&xts_tweak[sizeof(index)], 0, + FS_XTS_TWEAK_SIZE - sizeof(index)); + + sg_init_table(&dst, 1); + sg_set_page(&dst, dest_page, PAGE_SIZE, 0); + sg_init_table(&src, 1); + sg_set_page(&src, src_page, PAGE_SIZE, 0); + skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, + xts_tweak); + if (rw == FS_DECRYPT) + res = crypto_skcipher_decrypt(req); + else + res = crypto_skcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + skcipher_request_free(req); + if (res) { + printk_ratelimited(KERN_ERR + "%s: crypto_skcipher_encrypt() returned %d\n", + __func__, res); + return res; + } + return 0; +} + +static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) +{ + ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); + if (ctx->w.bounce_page == NULL) + return ERR_PTR(-ENOMEM); + ctx->flags |= FS_WRITE_PATH_FL; + return ctx->w.bounce_page; +} + +/** + * fscypt_encrypt_page() - Encrypts a page + * @inode: The inode for which the encryption should take place + * @plaintext_page: The page to encrypt. Must be locked. + * @gfp_flags: The gfp flag for memory allocation + * + * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx + * encryption context. + * + * Called on the page write path. The caller must call + * fscrypt_restore_control_page() on the returned ciphertext page to + * release the bounce buffer and the encryption context. + * + * Return: An allocated page with the encrypted content on success. Else, an + * error value or NULL. 
+ */ +struct page *fscrypt_encrypt_page(struct inode *inode, + struct page *plaintext_page, gfp_t gfp_flags) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + int err; + + BUG_ON(!PageLocked(plaintext_page)); + + ctx = fscrypt_get_ctx(inode, gfp_flags); + if (IS_ERR(ctx)) + return (struct page *)ctx; + + /* The encryption operation will require a bounce page. */ + ciphertext_page = alloc_bounce_page(ctx, gfp_flags); + if (IS_ERR(ciphertext_page)) + goto errout; + + ctx->w.control_page = plaintext_page; + err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, + plaintext_page, ciphertext_page, + gfp_flags); + if (err) { + ciphertext_page = ERR_PTR(err); + goto errout; + } + SetPagePrivate(ciphertext_page); + set_page_private(ciphertext_page, (unsigned long)ctx); + lock_page(ciphertext_page); + return ciphertext_page; + +errout: + fscrypt_release_ctx(ctx); + return ciphertext_page; +} +EXPORT_SYMBOL(fscrypt_encrypt_page); + +/** + * f2crypt_decrypt_page() - Decrypts a page in-place + * @page: The page to decrypt. Must be locked. + * + * Decrypts page in-place using the ctx encryption context. + * + * Called from the read completion callback. + * + * Return: Zero on success, non-zero otherwise. + */ +int fscrypt_decrypt_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + + return do_page_crypto(page->mapping->host, + FS_DECRYPT, page->index, page, page, GFP_NOFS); +} +EXPORT_SYMBOL(fscrypt_decrypt_page); + +int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + struct bio *bio; + int ret, err = 0; + + BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); + + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; + } + + while (len--) { + err = do_page_crypto(inode, FS_ENCRYPT, lblk, + ZERO_PAGE(0), ciphertext_page, + GFP_NOFS); + if (err) + goto errout; + + bio = bio_alloc(GFP_NOWAIT, 1); + if (!bio) { + err = -ENOMEM; + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = + pblk << (inode->i_sb->s_blocksize_bits - 9); + ret = bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0); + if (ret != inode->i_sb->s_blocksize) { + /* should never happen! */ + WARN_ON(1); + bio_put(bio); + err = -EIO; + goto errout; + } + err = submit_bio_wait(WRITE, bio); + if ((err == 0) && bio->bi_error) + err = -EIO; + bio_put(bio); + if (err) + goto errout; + lblk++; + pblk++; + } + err = 0; +errout: + fscrypt_release_ctx(ctx); + return err; +} +EXPORT_SYMBOL(fscrypt_zeroout_range); + +/* + * Validate dentries for encrypted directories to make sure we aren't + * potentially caching stale data after a key has been added or + * removed. 
+ */ +static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct dentry *dir; + struct fscrypt_info *ci; + int dir_has_key, cached_with_key; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + dir = dget_parent(dentry); + if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) { + dput(dir); + return 0; + } + + ci = d_inode(dir)->i_crypt_info; + if (ci && ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD)))) + ci = NULL; + + /* this should eventually be an flag in d_flags */ + spin_lock(&dentry->d_lock); + cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); + dir_has_key = (ci != NULL); + dput(dir); + + /* + * If the dentry was cached without the key, and it is a + * negative dentry, it might be a valid name. We can't check + * if the key has since been made available due to locking + * reasons, so we fail the validation so ext4_lookup() can do + * this check. + * + * We also fail the validation if the dentry was created with + * the key present, but we no longer have the key, or vice versa. + */ + if ((!cached_with_key && d_is_negative(dentry)) || + (!cached_with_key && dir_has_key) || + (cached_with_key && !dir_has_key)) + return 0; + return 1; +} + +const struct dentry_operations fscrypt_d_ops = { + .d_revalidate = fscrypt_d_revalidate, +}; +EXPORT_SYMBOL(fscrypt_d_ops); + +/* + * Call fscrypt_decrypt_page on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ + struct fscrypt_ctx *ctx = + container_of(work, struct fscrypt_ctx, r.work); + struct bio *bio = ctx->r.bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + int ret = fscrypt_decrypt_page(page); + + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + fscrypt_release_ctx(ctx); + bio_put(bio); +} + +void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) +{ + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(fscrypt_read_workqueue, &ctx->r.work); +} +EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); + +void fscrypt_pullback_bio_page(struct page **page, bool restore) +{ + struct fscrypt_ctx *ctx; + struct page *bounce_page; + + /* The bounce data pages are unmapped. */ + if ((*page)->mapping) + return; + + /* The bounce data page is unmapped. */ + bounce_page = *page; + ctx = (struct fscrypt_ctx *)page_private(bounce_page); + + /* restore control page */ + *page = ctx->w.control_page; + + if (restore) + fscrypt_restore_control_page(bounce_page); +} +EXPORT_SYMBOL(fscrypt_pullback_bio_page); + +void fscrypt_restore_control_page(struct page *page) +{ + struct fscrypt_ctx *ctx; + + ctx = (struct fscrypt_ctx *)page_private(page); + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + unlock_page(page); + fscrypt_release_ctx(ctx); +} +EXPORT_SYMBOL(fscrypt_restore_control_page); + +static void fscrypt_destroy(void) +{ + struct fscrypt_ctx *pos, *n; + + list_for_each_entry_safe(pos, n, &fscrypt_free_ctxs, free_list) + kmem_cache_free(fscrypt_ctx_cachep, pos); + INIT_LIST_HEAD(&fscrypt_free_ctxs); + mempool_destroy(fscrypt_bounce_page_pool); + fscrypt_bounce_page_pool = NULL; +} + +/** + * fscrypt_initialize() - allocate major buffers for fs encryption. 
+ * + * We only call this when we start accessing encrypted files, since it + * results in memory getting allocated that wouldn't otherwise be used. + * + * Return: Zero on success, non-zero otherwise. + */ +int fscrypt_initialize(void) +{ + int i, res = -ENOMEM; + + if (fscrypt_bounce_page_pool) + return 0; + + mutex_lock(&fscrypt_init_mutex); + if (fscrypt_bounce_page_pool) + goto already_initialized; + + for (i = 0; i < num_prealloc_crypto_ctxs; i++) { + struct fscrypt_ctx *ctx; + + ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS); + if (!ctx) + goto fail; + list_add(&ctx->free_list, &fscrypt_free_ctxs); + } + + fscrypt_bounce_page_pool = + mempool_create_page_pool(num_prealloc_crypto_pages, 0); + if (!fscrypt_bounce_page_pool) + goto fail; + +already_initialized: + mutex_unlock(&fscrypt_init_mutex); + return 0; +fail: + fscrypt_destroy(); + mutex_unlock(&fscrypt_init_mutex); + return res; +} +EXPORT_SYMBOL(fscrypt_initialize); + +/** + * fscrypt_init() - Set up for fs encryption. + */ +static int __init fscrypt_init(void) +{ + fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue", + WQ_HIGHPRI, 0); + if (!fscrypt_read_workqueue) + goto fail; + + fscrypt_ctx_cachep = KMEM_CACHE(fscrypt_ctx, SLAB_RECLAIM_ACCOUNT); + if (!fscrypt_ctx_cachep) + goto fail_free_queue; + + fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT); + if (!fscrypt_info_cachep) + goto fail_free_ctx; + + return 0; + +fail_free_ctx: + kmem_cache_destroy(fscrypt_ctx_cachep); +fail_free_queue: + destroy_workqueue(fscrypt_read_workqueue); +fail: + return -ENOMEM; +} +module_init(fscrypt_init) + +/** + * fscrypt_exit() - Shutdown the fs encryption system + */ +static void __exit fscrypt_exit(void) +{ + fscrypt_destroy(); + + if (fscrypt_read_workqueue) + destroy_workqueue(fscrypt_read_workqueue); + kmem_cache_destroy(fscrypt_ctx_cachep); + kmem_cache_destroy(fscrypt_info_cachep); +} +module_exit(fscrypt_exit); + +MODULE_LICENSE("GPL"); diff --git a/fs/f2fs/crypto_fname.c b/fs/crypto/fname.c similarity index 53% rename from fs/f2fs/crypto_fname.c rename to fs/crypto/fname.c index 38349ed5ea51..5d6d49113efa 100644 --- a/fs/f2fs/crypto_fname.c +++ b/fs/crypto/fname.c @@ -1,46 +1,32 @@ /* - * linux/fs/f2fs/crypto_fname.c - * - * Copied from linux/fs/ext4/crypto.c + * This contains functions for filename crypto management * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility * - * This contains functions for filename crypto management in f2fs - * * Written by Uday Savagaonkar, 2014. - * - * Adjust f2fs dentry structure - * Jaegeuk Kim, 2015. + * Modified by Jaegeuk Kim, 2015. * * This has not yet undergone a rigorous security audit. 
*/ -#include -#include + #include #include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include #include +#include -#include "f2fs.h" -#include "f2fs_crypto.h" -#include "xattr.h" +static u32 size_round_up(size_t size, size_t blksize) +{ + return ((size + blksize - 1) / blksize) * blksize; +} /** - * f2fs_dir_crypt_complete() - + * dir_crypt_complete() - */ -static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res) +static void dir_crypt_complete(struct crypto_async_request *req, int res) { - struct f2fs_completion_result *ecr = req->data; + struct fscrypt_completion_result *ecr = req->data; if (res == -EINPROGRESS) return; @@ -48,45 +34,35 @@ static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res) complete(&ecr->completion); } -bool f2fs_valid_filenames_enc_mode(uint32_t mode) -{ - return (mode == F2FS_ENCRYPTION_MODE_AES_256_CTS); -} - -static unsigned max_name_len(struct inode *inode) -{ - return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize : - F2FS_NAME_LEN; -} - /** - * f2fs_fname_encrypt() - + * fname_encrypt() - * * This function encrypts the input filename, and returns the length of the * ciphertext. Errors are returned as negative numbers. We trust the caller to * allocate sufficient memory to oname string. */ -static int f2fs_fname_encrypt(struct inode *inode, - const struct qstr *iname, struct f2fs_str *oname) +static int fname_encrypt(struct inode *inode, + const struct qstr *iname, struct fscrypt_str *oname) { u32 ciphertext_len; - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); + struct fscrypt_info *ci = inode->i_crypt_info; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; - char iv[F2FS_CRYPTO_BLOCK_SIZE]; + char iv[FS_CRYPTO_BLOCK_SIZE]; struct scatterlist src_sg, dst_sg; - int padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK); + int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); char *workbuf, buf[32], *alloc_buf = NULL; - unsigned lim = max_name_len(inode); + unsigned lim; + lim = inode->i_sb->s_cop->max_namelen(inode); if (iname->len <= 0 || iname->len > lim) return -EIO; - ciphertext_len = (iname->len < F2FS_CRYPTO_BLOCK_SIZE) ? - F2FS_CRYPTO_BLOCK_SIZE : iname->len; - ciphertext_len = f2fs_fname_crypto_round_up(ciphertext_len, padding); + ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ? + FS_CRYPTO_BLOCK_SIZE : iname->len; + ciphertext_len = size_round_up(ciphertext_len, padding); ciphertext_len = (ciphertext_len > lim) ? 
lim : ciphertext_len; if (ciphertext_len <= sizeof(buf)) { @@ -99,16 +75,16 @@ static int f2fs_fname_encrypt(struct inode *inode, } /* Allocate request */ - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); kfree(alloc_buf); return -ENOMEM; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - f2fs_dir_crypt_complete, &ecr); + dir_crypt_complete, &ecr); /* Copy the input */ memcpy(workbuf, iname->name, iname->len); @@ -116,79 +92,78 @@ static int f2fs_fname_encrypt(struct inode *inode, memset(workbuf + iname->len, 0, ciphertext_len - iname->len); /* Initialize IV */ - memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); + memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); /* Create encryption request */ sg_init_one(&src_sg, workbuf, ciphertext_len); sg_init_one(&dst_sg, oname->name, ciphertext_len); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); - res = crypto_ablkcipher_encrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } kfree(alloc_buf); - ablkcipher_request_free(req); - if (res < 0) { + skcipher_request_free(req); + if (res < 0) printk_ratelimited(KERN_ERR "%s: Error (error code %d)\n", __func__, res); - } + oname->len = ciphertext_len; return res; } /* - * f2fs_fname_decrypt() + * fname_decrypt() * This function decrypts the input filename, and returns * the length of the plaintext. * Errors are returned as negative numbers. * We trust the caller to allocate sufficient memory to oname string. 
*/ -static int f2fs_fname_decrypt(struct inode *inode, - const struct f2fs_str *iname, struct f2fs_str *oname) +static int fname_decrypt(struct inode *inode, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) { - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist src_sg, dst_sg; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct fscrypt_info *ci = inode->i_crypt_info; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; - char iv[F2FS_CRYPTO_BLOCK_SIZE]; - unsigned lim = max_name_len(inode); + char iv[FS_CRYPTO_BLOCK_SIZE]; + unsigned lim; + lim = inode->i_sb->s_cop->max_namelen(inode); if (iname->len <= 0 || iname->len > lim) return -EIO; /* Allocate request */ - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); return -ENOMEM; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - f2fs_dir_crypt_complete, &ecr); + dir_crypt_complete, &ecr); /* Initialize IV */ - memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); + memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); /* Create decryption request */ sg_init_one(&src_sg, iname->name, iname->len); sg_init_one(&dst_sg, oname->name, oname->len); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); - res = crypto_ablkcipher_decrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); + res = crypto_skcipher_decrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } - ablkcipher_request_free(req); + skcipher_request_free(req); if (res < 0) { printk_ratelimited(KERN_ERR - "%s: Error in f2fs_fname_decrypt (error code %d)\n", - __func__, res); + "%s: Error (error code %d)\n", __func__, res); return res; } @@ -200,7 +175,7 @@ static const char *lookup_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; /** - * f2fs_fname_encode_digest() - + * digest_encode() - * * Encodes the input digest using characters from the set [a-zA-Z0-9_+]. * The encoded string is roughly 4/3 times the size of the input string. @@ -249,148 +224,152 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -/** - * f2fs_fname_crypto_round_up() - - * - * Return: The next multiple of block size - */ -u32 f2fs_fname_crypto_round_up(u32 size, u32 blksize) +u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) { - return ((size + blksize - 1) / blksize) * blksize; + int padding = 32; + struct fscrypt_info *ci = inode->i_crypt_info; + + if (ci) + padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); + if (ilen < FS_CRYPTO_BLOCK_SIZE) + ilen = FS_CRYPTO_BLOCK_SIZE; + return size_round_up(ilen, padding); } +EXPORT_SYMBOL(fscrypt_fname_encrypted_size); /** - * f2fs_fname_crypto_alloc_obuff() - + * fscrypt_fname_crypto_alloc_obuff() - * * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. 
*/ -int f2fs_fname_crypto_alloc_buffer(struct inode *inode, - u32 ilen, struct f2fs_str *crypto_str) +int fscrypt_fname_alloc_buffer(struct inode *inode, + u32 ilen, struct fscrypt_str *crypto_str) { - unsigned int olen; - int padding = 16; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); - if (ci) - padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK); - if (padding < F2FS_CRYPTO_BLOCK_SIZE) - padding = F2FS_CRYPTO_BLOCK_SIZE; - olen = f2fs_fname_crypto_round_up(ilen, padding); crypto_str->len = olen; - if (olen < F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2) - olen = F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2; - /* Allocated buffer can hold one more character to null-terminate the - * string */ + if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2) + olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + /* + * Allocated buffer can hold one more character to null-terminate the + * string + */ crypto_str->name = kmalloc(olen + 1, GFP_NOFS); if (!(crypto_str->name)) return -ENOMEM; return 0; } +EXPORT_SYMBOL(fscrypt_fname_alloc_buffer); /** - * f2fs_fname_crypto_free_buffer() - + * fscrypt_fname_crypto_free_buffer() - * * Frees the buffer allocated for crypto operation. */ -void f2fs_fname_crypto_free_buffer(struct f2fs_str *crypto_str) +void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { if (!crypto_str) return; kfree(crypto_str->name); crypto_str->name = NULL; } +EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** - * f2fs_fname_disk_to_usr() - converts a filename from disk space to user space + * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user + * space */ -int f2fs_fname_disk_to_usr(struct inode *inode, - f2fs_hash_t *hash, - const struct f2fs_str *iname, - struct f2fs_str *oname) +int fscrypt_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); char buf[24]; int ret; - if (is_dot_dotdot(&qname)) { + if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; return oname->len; } - if (F2FS_I(inode)->i_crypt_info) - return f2fs_fname_decrypt(inode, iname, oname); + if (iname->len < FS_CRYPTO_BLOCK_SIZE) + return -EUCLEAN; - if (iname->len <= F2FS_FNAME_CRYPTO_DIGEST_SIZE) { + if (inode->i_crypt_info) + return fname_decrypt(inode, iname, oname); + + if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { ret = digest_encode(iname->name, iname->len, oname->name); oname->len = ret; return ret; } if (hash) { - memcpy(buf, hash, 4); - memset(buf + 4, 0, 4); - } else + memcpy(buf, &hash, 4); + memcpy(buf + 4, &minor_hash, 4); + } else { memset(buf, 0, 8); - memcpy(buf + 8, iname->name + ((iname->len - 17) & ~15), 16); + } + memcpy(buf + 8, iname->name + iname->len - 16, 16); oname->name[0] = '_'; ret = digest_encode(buf, 24, oname->name + 1); oname->len = ret + 1; return ret + 1; } +EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** - * f2fs_fname_usr_to_disk() - converts a filename from user space to disk space + * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk + * space */ -int f2fs_fname_usr_to_disk(struct inode *inode, +int fscrypt_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, - struct f2fs_str *oname) + struct fscrypt_str *oname) { - int res; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - - if (is_dot_dotdot(iname)) { + if (fscrypt_is_dot_dotdot(iname)) { oname->name[0] = '.'; 
oname->name[iname->len - 1] = '.'; oname->len = iname->len; return oname->len; } - - if (ci) { - res = f2fs_fname_encrypt(inode, iname, oname); - return res; - } - /* Without a proper key, a user is not allowed to modify the filenames + if (inode->i_crypt_info) + return fname_encrypt(inode, iname, oname); + /* + * Without a proper key, a user is not allowed to modify the filenames * in a directory. Consequently, a user space name cannot be mapped to - * a disk-space name */ + * a disk-space name + */ return -EACCES; } +EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); -int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, - int lookup, struct f2fs_filename *fname) +int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct fscrypt_name *fname) { - struct f2fs_crypt_info *ci; int ret = 0, bigname = 0; - memset(fname, 0, sizeof(struct f2fs_filename)); + memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; - if (!f2fs_encrypted_inode(dir) || is_dot_dotdot(iname)) { + if (!dir->i_sb->s_cop->is_encrypted(dir) || + fscrypt_is_dot_dotdot(iname)) { fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; } - ret = f2fs_get_encryption_info(dir); - if (ret) + ret = get_crypt_info(dir); + if (ret && ret != -EOPNOTSUPP) return ret; - ci = F2FS_I(dir)->i_crypt_info; - if (ci) { - ret = f2fs_fname_crypto_alloc_buffer(dir, iname->len, - &fname->crypto_buf); + + if (dir->i_crypt_info) { + ret = fscrypt_fname_alloc_buffer(dir, iname->len, + &fname->crypto_buf); if (ret < 0) return ret; - ret = f2fs_fname_encrypt(dir, iname, &fname->crypto_buf); + ret = fname_encrypt(dir, iname, &fname->crypto_buf); if (ret < 0) goto errout; fname->disk_name.name = fname->crypto_buf.name; @@ -400,18 +379,19 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, if (!lookup) return -EACCES; - /* We don't have the key and we are doing a lookup; decode the + /* + * We don't have the key and we are doing a lookup; decode the * user-supplied name */ if (iname->name[0] == '_') bigname = 1; - if ((bigname && (iname->len != 33)) || - (!bigname && (iname->len > 43))) + if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43))) return -ENOENT; fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; + ret = digest_decode(iname->name + bigname, iname->len - bigname, fname->crypto_buf.name); if (ret < 0) { @@ -421,20 +401,24 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, fname->crypto_buf.len = ret; if (bigname) { memcpy(&fname->hash, fname->crypto_buf.name, 4); + memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4); } else { fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; } return 0; + errout: - f2fs_fname_crypto_free_buffer(&fname->crypto_buf); + fscrypt_fname_free_buffer(&fname->crypto_buf); return ret; } +EXPORT_SYMBOL(fscrypt_setup_filename); -void f2fs_fname_free_filename(struct f2fs_filename *fname) +void fscrypt_free_filename(struct fscrypt_name *fname) { kfree(fname->crypto_buf.name); fname->crypto_buf.name = NULL; fname->usr_fname = NULL; fname->disk_name.name = NULL; } +EXPORT_SYMBOL(fscrypt_free_filename); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c new file mode 100644 index 000000000000..1ac263eddc4e --- /dev/null +++ b/fs/crypto/keyinfo.c @@ -0,0 +1,304 @@ +/* + * key management facility for FS encryption support. 
+ * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions. + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + */ + +#include +#include +#include +#include +#include +#include + +static void derive_crypt_complete(struct crypto_async_request *req, int rc) +{ + struct fscrypt_completion_result *ecr = req->data; + + if (rc == -EINPROGRESS) + return; + + ecr->res = rc; + complete(&ecr->completion); +} + +/** + * derive_key_aes() - Derive a key using AES-128-ECB + * @deriving_key: Encryption key used for derivation. + * @source_key: Source key to which to apply derivation. + * @derived_key: Derived key. + * + * Return: Zero on success; non-zero otherwise. + */ +static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], + u8 source_key[FS_AES_256_XTS_KEY_SIZE], + u8 derived_key[FS_AES_256_XTS_KEY_SIZE]) +{ + int res = 0; + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); + struct scatterlist src_sg, dst_sg; + struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); + + if (IS_ERR(tfm)) { + res = PTR_ERR(tfm); + tfm = NULL; + goto out; + } + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); + req = skcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + res = -ENOMEM; + goto out; + } + skcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + derive_crypt_complete, &ecr); + res = crypto_skcipher_setkey(tfm, deriving_key, + FS_AES_128_ECB_KEY_SIZE); + if (res < 0) + goto out; + + sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE); + sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, + FS_AES_256_XTS_KEY_SIZE, NULL); + res = crypto_skcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + wait_for_completion(&ecr.completion); + res = ecr.res; + } +out: + skcipher_request_free(req); + crypto_free_skcipher(tfm); + return res; +} + +static int validate_user_key(struct fscrypt_info *crypt_info, + struct fscrypt_context *ctx, u8 *raw_key, + u8 *prefix, int prefix_size) +{ + u8 *full_key_descriptor; + struct key *keyring_key; + struct fscrypt_key *master_key; + const struct user_key_payload *ukp; + int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1; + int res; + + full_key_descriptor = kmalloc(full_key_len, GFP_NOFS); + if (!full_key_descriptor) + return -ENOMEM; + + memcpy(full_key_descriptor, prefix, prefix_size); + sprintf(full_key_descriptor + prefix_size, + "%*phN", FS_KEY_DESCRIPTOR_SIZE, + ctx->master_key_descriptor); + full_key_descriptor[full_key_len - 1] = '\0'; + keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); + kfree(full_key_descriptor); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); + + if (keyring_key->type != &key_type_logon) { + printk_once(KERN_WARNING + "%s: key type must be logon\n", __func__); + res = -ENOKEY; + goto out; + } + down_read(&keyring_key->sem); + ukp = user_key_payload(keyring_key); + if (ukp->datalen != sizeof(struct fscrypt_key)) { + res = -EINVAL; + up_read(&keyring_key->sem); + goto out; + } + master_key = (struct fscrypt_key *)ukp->data; + BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); + + if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + printk_once(KERN_WARNING + "%s: key size incorrect: %d\n", + __func__, master_key->size); + res = -ENOKEY; + up_read(&keyring_key->sem); + goto out; + } + res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); + 
up_read(&keyring_key->sem); + if (res) + goto out; + + crypt_info->ci_keyring_key = keyring_key; + return 0; +out: + key_put(keyring_key); + return res; +} + +static void put_crypt_info(struct fscrypt_info *ci) +{ + if (!ci) + return; + + key_put(ci->ci_keyring_key); + crypto_free_skcipher(ci->ci_ctfm); + kmem_cache_free(fscrypt_info_cachep, ci); +} + +int get_crypt_info(struct inode *inode) +{ + struct fscrypt_info *crypt_info; + struct fscrypt_context ctx; + struct crypto_skcipher *ctfm; + const char *cipher_str; + u8 raw_key[FS_MAX_KEY_SIZE]; + u8 mode; + int res; + + res = fscrypt_initialize(); + if (res) + return res; + + if (!inode->i_sb->s_cop->get_context) + return -EOPNOTSUPP; +retry: + crypt_info = ACCESS_ONCE(inode->i_crypt_info); + if (crypt_info) { + if (!crypt_info->ci_keyring_key || + key_validate(crypt_info->ci_keyring_key) == 0) + return 0; + fscrypt_put_encryption_info(inode, crypt_info); + goto retry; + } + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0) { + if (!fscrypt_dummy_context_enabled(inode)) + return res; + ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + } else if (res != sizeof(ctx)) { + return -EINVAL; + } + res = 0; + + crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS); + if (!crypt_info) + return -ENOMEM; + + crypt_info->ci_flags = ctx.flags; + crypt_info->ci_data_mode = ctx.contents_encryption_mode; + crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; + crypt_info->ci_ctfm = NULL; + crypt_info->ci_keyring_key = NULL; + memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, + sizeof(crypt_info->ci_master_key)); + if (S_ISREG(inode->i_mode)) + mode = crypt_info->ci_data_mode; + else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + mode = crypt_info->ci_filename_mode; + else + BUG(); + + switch (mode) { + case FS_ENCRYPTION_MODE_AES_256_XTS: + cipher_str = "xts(aes)"; + break; + case FS_ENCRYPTION_MODE_AES_256_CTS: + cipher_str = "cts(cbc(aes))"; + break; + default: + printk_once(KERN_WARNING + "%s: unsupported key mode %d (ino %u)\n", + __func__, mode, (unsigned) inode->i_ino); + res = -ENOKEY; + goto out; + } + if (fscrypt_dummy_context_enabled(inode)) { + memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); + goto got_key; + } + + res = validate_user_key(crypt_info, &ctx, raw_key, + FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE); + if (res && inode->i_sb->s_cop->key_prefix) { + u8 *prefix = NULL; + int prefix_size, res2; + + prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix); + res2 = validate_user_key(crypt_info, &ctx, raw_key, + prefix, prefix_size); + if (res2) { + if (res2 == -ENOKEY) + res = -ENOKEY; + goto out; + } + } else if (res) { + goto out; + } +got_key: + ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); + if (!ctfm || IS_ERR(ctfm)) { + res = ctfm ? 
PTR_ERR(ctfm) : -ENOMEM; + printk(KERN_DEBUG + "%s: error %d (inode %u) allocating crypto tfm\n", + __func__, res, (unsigned) inode->i_ino); + goto out; + } + crypt_info->ci_ctfm = ctfm; + crypto_skcipher_clear_flags(ctfm, ~0); + crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); + res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode)); + if (res) + goto out; + + memzero_explicit(raw_key, sizeof(raw_key)); + if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) { + put_crypt_info(crypt_info); + goto retry; + } + return 0; + +out: + if (res == -ENOKEY) + res = 0; + put_crypt_info(crypt_info); + memzero_explicit(raw_key, sizeof(raw_key)); + return res; +} + +void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) +{ + struct fscrypt_info *prev; + + if (ci == NULL) + ci = ACCESS_ONCE(inode->i_crypt_info); + if (ci == NULL) + return; + + prev = cmpxchg(&inode->i_crypt_info, ci, NULL); + if (prev != ci) + return; + + put_crypt_info(ci); +} +EXPORT_SYMBOL(fscrypt_put_encryption_info); + +int fscrypt_get_encryption_info(struct inode *inode) +{ + struct fscrypt_info *ci = inode->i_crypt_info; + + if (!ci || + (ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD))))) + return get_crypt_info(inode); + return 0; +} +EXPORT_SYMBOL(fscrypt_get_encryption_info); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c new file mode 100644 index 000000000000..ed115acb5dee --- /dev/null +++ b/fs/crypto/policy.c @@ -0,0 +1,246 @@ +/* + * Encryption policy functions for per-file encryption support. + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. 
+ */ + +#include <linux/random.h> +#include <linux/string.h> +#include <linux/fscrypto.h> +#include <linux/mount.h> + +static int inode_has_encryption_context(struct inode *inode) +{ + if (!inode->i_sb->s_cop->get_context) + return 0; + return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0); +} + +/* + * check whether the policy is consistent with the encryption context + * for the inode + */ +static int is_encryption_context_consistent_with_policy(struct inode *inode, + const struct fscrypt_policy *policy) +{ + struct fscrypt_context ctx; + int res; + + if (!inode->i_sb->s_cop->get_context) + return 0; + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res != sizeof(ctx)) + return 0; + + return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx.flags == policy->flags) && + (ctx.contents_encryption_mode == + policy->contents_encryption_mode) && + (ctx.filenames_encryption_mode == + policy->filenames_encryption_mode)); +} + +static int create_encryption_context_from_policy(struct inode *inode, + const struct fscrypt_policy *policy) +{ + struct fscrypt_context ctx; + int res; + + if (!inode->i_sb->s_cop->set_context) + return -EOPNOTSUPP; + + if (inode->i_sb->s_cop->prepare_context) { + res = inode->i_sb->s_cop->prepare_context(inode); + if (res) + return res; + } + + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; + memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE); + + if (!fscrypt_valid_contents_enc_mode( + policy->contents_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid contents encryption mode %d\n", __func__, + policy->contents_encryption_mode); + return -EINVAL; + } + + if (!fscrypt_valid_filenames_enc_mode( + policy->filenames_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid filenames encryption mode %d\n", __func__, + policy->filenames_encryption_mode); + return -EINVAL; + } + + if (policy->flags & ~FS_POLICY_FLAGS_VALID) + return -EINVAL; + + ctx.contents_encryption_mode = policy->contents_encryption_mode; + ctx.filenames_encryption_mode = policy->filenames_encryption_mode; + ctx.flags = policy->flags; + BUILD_BUG_ON(sizeof(ctx.nonce) != FS_KEY_DERIVATION_NONCE_SIZE); + get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); + + return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); +} + +int fscrypt_process_policy(struct file *filp, + const struct fscrypt_policy *policy) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (policy->version != 0) + return -EINVAL; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (!inode_has_encryption_context(inode)) { + if (!S_ISDIR(inode->i_mode)) + ret = -EINVAL; + else if (!inode->i_sb->s_cop->empty_dir) + ret = -EOPNOTSUPP; + else if (!inode->i_sb->s_cop->empty_dir(inode)) + ret = -ENOTEMPTY; + else + ret = create_encryption_context_from_policy(inode, + policy); + } else if (!is_encryption_context_consistent_with_policy(inode, + policy)) { + printk(KERN_WARNING + "%s: Policy inconsistent with encryption context\n", + __func__); + ret = -EINVAL; + } + + mnt_drop_write_file(filp); + return ret; +} +EXPORT_SYMBOL(fscrypt_process_policy); + +int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) +{ + struct fscrypt_context ctx; + int res; + + if (!inode->i_sb->s_cop->get_context || + !inode->i_sb->s_cop->is_encrypted(inode)) + return -ENODATA; + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res != sizeof(ctx)) + return
-ENODATA; + if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + + policy->version = 0; + policy->contents_encryption_mode = ctx.contents_encryption_mode; + policy->filenames_encryption_mode = ctx.filenames_encryption_mode; + policy->flags = ctx.flags; + memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE); + return 0; +} +EXPORT_SYMBOL(fscrypt_get_policy); + +int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) +{ + struct fscrypt_info *parent_ci, *child_ci; + int res; + + if ((parent == NULL) || (child == NULL)) { + printk(KERN_ERR "parent %p child %p\n", parent, child); + BUG_ON(1); + } + + /* no restrictions if the parent directory is not encrypted */ + if (!parent->i_sb->s_cop->is_encrypted(parent)) + return 1; + /* if the child directory is not encrypted, this is always a problem */ + if (!parent->i_sb->s_cop->is_encrypted(child)) + return 0; + res = fscrypt_get_encryption_info(parent); + if (res) + return 0; + res = fscrypt_get_encryption_info(child); + if (res) + return 0; + parent_ci = parent->i_crypt_info; + child_ci = child->i_crypt_info; + if (!parent_ci && !child_ci) + return 1; + if (!parent_ci || !child_ci) + return 0; + + return (memcmp(parent_ci->ci_master_key, + child_ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags)); +} +EXPORT_SYMBOL(fscrypt_has_permitted_context); + +/** + * fscrypt_inherit_context() - Sets a child context from its parent + * @parent: Parent inode from which the context is inherited. + * @child: Child inode that inherits the context from @parent. + * @fs_data: private data given by FS. + * @preload: preload child i_crypt_info + * + * Return: Zero on success, non-zero otherwise + */ +int fscrypt_inherit_context(struct inode *parent, struct inode *child, + void *fs_data, bool preload) +{ + struct fscrypt_context ctx; + struct fscrypt_info *ci; + int res; + + if (!parent->i_sb->s_cop->set_context) + return -EOPNOTSUPP; + + res = fscrypt_get_encryption_info(parent); + if (res < 0) + return res; + + ci = parent->i_crypt_info; + if (ci == NULL) + return -ENOKEY; + + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; + if (fscrypt_dummy_context_enabled(parent)) { + ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); + res = 0; + } else { + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE); + } + get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); + res = parent->i_sb->s_cop->set_context(child, &ctx, + sizeof(ctx), fs_data); + if (res) + return res; + return preload ? fscrypt_get_encryption_info(child): 0; +} +EXPORT_SYMBOL(fscrypt_inherit_context); diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index b0a9dc929f88..1852d99df97b 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -1,6 +1,9 @@ config F2FS_FS tristate "F2FS filesystem support" depends on BLOCK + select CRYPTO + select KEYS + select CRYPTO_CRC32 help F2FS is based on Log-structured File System (LFS), which supports versatile "flash-friendly" features. 
The design has been focused on @@ -76,15 +79,7 @@ config F2FS_FS_ENCRYPTION bool "F2FS Encryption" depends on F2FS_FS depends on F2FS_FS_XATTR - select CRYPTO_AES - select CRYPTO_CBC - select CRYPTO_ECB - select CRYPTO_XTS - select CRYPTO_CTS - select CRYPTO_CTR - select CRYPTO_SHA256 - select KEYS - select ENCRYPTED_KEYS + select FS_ENCRYPTION help Enable encryption of f2fs files and directories. This feature is similar to ecryptfs, but it is more memory @@ -100,3 +95,11 @@ config F2FS_IO_TRACE information and block IO patterns in the filesystem level. If unsure, say N. + +config F2FS_FAULT_INJECTION + bool "F2FS fault injection facility" + depends on F2FS_FS + help + Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on. + + If unsure, say N. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 08e101ed914c..ca949ea7c02f 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -7,5 +7,3 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o -f2fs-$(CONFIG_F2FS_FS_ENCRYPTION) += crypto_policy.o crypto.o \ - crypto_key.o crypto_fname.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 83dcf7bfd7b8..fb0744b94c2f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -109,14 +109,16 @@ fail: return ERR_PTR(-EINVAL); } -static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size) +static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi, + const struct posix_acl *acl, size_t *size) { struct f2fs_acl_header *f2fs_acl; struct f2fs_acl_entry *entry; int i; - f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * - sizeof(struct f2fs_acl_entry), GFP_NOFS); + f2fs_acl = f2fs_kmalloc(sbi, sizeof(struct f2fs_acl_header) + + acl->a_count * sizeof(struct f2fs_acl_entry), + GFP_NOFS); if (!f2fs_acl) return ERR_PTR(-ENOMEM); @@ -175,7 +177,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage); if (retval > 0) { - value = kmalloc(retval, GFP_F2FS_ZERO); + value = f2fs_kmalloc(F2FS_I_SB(inode), retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, @@ -204,7 +206,6 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) static int __f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { - struct f2fs_inode_info *fi = F2FS_I(inode); int name_index; void *value = NULL; size_t size = 0; @@ -213,11 +214,13 @@ static int __f2fs_set_acl(struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl && !ipage) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) + if (acl) { + error = posix_acl_equiv_mode(acl, &inode->i_mode); + if (error < 0) return error; - set_acl_inode(fi, inode->i_mode); + set_acl_inode(inode, inode->i_mode); + if (error == 0) + acl = NULL; } break; @@ -232,9 +235,9 @@ static int __f2fs_set_acl(struct inode *inode, int type, } if (acl) { - value = f2fs_acl_to_disk(acl, &size); + value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); if (IS_ERR(value)) { - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); return (int)PTR_ERR(value); } } @@ -245,7 +248,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, if (!error) set_cached_acl(inode, type, acl); - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); return error; } @@ -386,6 +389,8 
@@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) return error; + f2fs_mark_inode_dirty_sync(inode); + if (default_acl) { error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, ipage); diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 997ca8edb6cb..2c685185c24d 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -37,11 +37,10 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int); -extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int f2fs_set_acl(struct inode *, struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); #else -#define f2fs_check_acl NULL #define f2fs_get_acl NULL #define f2fs_set_acl NULL diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f661d80474be..cb23d6cf676b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -26,6 +26,14 @@ static struct kmem_cache *ino_entry_slab; struct kmem_cache *inode_entry_slab; +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) +{ + set_ckpt_flags(sbi, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; + if (!end_io) + f2fs_flush_merged_bios(sbi); +} + /* * We guarantee no failure on the returned page. */ @@ -34,13 +42,14 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; } - f2fs_wait_on_page_writeback(page, META); - SetPageUptodate(page); + f2fs_wait_on_page_writeback(page, META, true); + if (!PageUptodate(page)) + SetPageUptodate(page); return page; } @@ -56,14 +65,15 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .sbi = sbi, .type = META, .rw = READ_SYNC | REQ_META | REQ_PRIO, - .blk_addr = index, + .old_blkaddr = index, + .new_blkaddr = index, .encrypted_page = NULL, }; if (unlikely(!is_meta)) fio.rw &= ~REQ_META; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; @@ -90,7 +100,7 @@ repeat: * meta page. */ if (unlikely(!PageUptodate(page))) - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); out: return page; } @@ -143,7 +153,6 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync) { - block_t prev_blk_addr = 0; struct page *page; block_t blkno = start; struct f2fs_io_info fio = { @@ -152,10 +161,12 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .rw = sync ? 
(READ_SYNC | REQ_META | REQ_PRIO) : READA, .encrypted_page = NULL, }; + struct blk_plug plug; if (unlikely(type == META_POR)) fio.rw &= ~REQ_META; + blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { if (!is_valid_blkaddr(sbi, blkno, type)) @@ -167,27 +178,25 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid))) blkno = 0; /* get nat block addr */ - fio.blk_addr = current_nat_addr(sbi, + fio.new_blkaddr = current_nat_addr(sbi, blkno * NAT_ENTRY_PER_BLOCK); break; case META_SIT: /* get sit block addr */ - fio.blk_addr = current_sit_addr(sbi, + fio.new_blkaddr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); - if (blkno != start && prev_blk_addr + 1 != fio.blk_addr) - goto out; - prev_blk_addr = fio.blk_addr; break; case META_SSA: case META_CP: case META_POR: - fio.blk_addr = blkno; + fio.new_blkaddr = blkno; break; default: BUG(); } - page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr); + page = f2fs_grab_cache_page(META_MAPPING(sbi), + fio.new_blkaddr, false); if (!page) continue; if (PageUptodate(page)) { @@ -196,11 +205,13 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, } fio.page = page; + fio.old_blkaddr = fio.new_blkaddr; f2fs_submit_page_mbio(&fio); f2fs_put_page(page, 0); } out: f2fs_submit_merged_bio(sbi, META, READ); + blk_finish_plug(&plug); return blkno - start; } @@ -210,7 +221,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) bool readahead = false; page = find_get_page(META_MAPPING(sbi), index); - if (!page || (page && !PageUptodate(page))) + if (!page || !PageUptodate(page)) readahead = true; f2fs_put_page(page, 0); @@ -232,13 +243,17 @@ static int f2fs_write_meta_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - f2fs_wait_on_page_writeback(page, META); write_meta_page(sbi, page); dec_page_count(sbi, F2FS_DIRTY_META); - unlock_page(page); if (wbc->for_reclaim) + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE); + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, META, WRITE); + return 0; redirty_out: @@ -252,13 +267,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff, written; - trace_f2fs_writepages(mapping->host, wbc, META); - /* collect a number of dirty meta pages and write together */ if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; + trace_f2fs_writepages(mapping->host, wbc, META); + /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); diff = nr_pages_to_write(sbi, META, wbc); @@ -269,6 +284,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, skip_write: wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); + trace_f2fs_writepages(mapping->host, wbc, META); return 0; } @@ -276,15 +292,18 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write) { struct address_space *mapping = META_MAPPING(sbi); - pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX; + pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; struct pagevec pvec; long nwritten = 0; struct writeback_control wbc = { .for_reclaim = 0, }; + struct blk_plug plug; pagevec_init(&pvec, 0); + blk_start_plug(&plug); + while (index <= end) { int i, nr_pages; nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, @@ -296,7 +315,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, for (i = 0; i < 
nr_pages; i++) { struct page *page = pvec.pages[i]; - if (prev == LONG_MAX) + if (prev == ULONG_MAX) prev = page->index - 1; if (nr_to_write != LONG_MAX && page->index != prev + 1) { pagevec_release(&pvec); @@ -315,6 +334,9 @@ continue_unlock: goto continue_unlock; } + f2fs_wait_on_page_writeback(page, META, true); + + BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; @@ -334,6 +356,8 @@ stop: if (nwritten) f2fs_submit_merged_bio(sbi, type, WRITE); + blk_finish_plug(&plug); + return nwritten; } @@ -341,9 +365,10 @@ static int f2fs_set_meta_page_dirty(struct page *page) { trace_f2fs_set_page_dirty(page, META); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); SetPagePrivate(page); f2fs_trace_pid(page); @@ -358,6 +383,9 @@ const struct address_space_operations f2fs_meta_aops = { .set_page_dirty = f2fs_set_meta_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) @@ -410,13 +438,13 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) spin_unlock(&im->ino_lock); } -void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ __add_ino_entry(sbi, ino, type); } -void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* remove dirty ino entry from list */ __remove_ino_entry(sbi, ino, type); @@ -434,12 +462,12 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) return e ? true : false; } -void release_dirty_inode(struct f2fs_sb_info *sbi) +void release_ino_entry(struct f2fs_sb_info *sbi, bool all) { struct ino_entry *e, *tmp; int i; - for (i = APPEND_INO; i <= UPDATE_INO; i++) { + for (i = all ? 
ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) { struct inode_management *im = &sbi->im[i]; spin_lock(&im->ino_lock); @@ -459,6 +487,13 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) int err = 0; spin_lock(&im->ino_lock); + +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_ORPHAN)) { + spin_unlock(&im->ino_lock); + return -ENOSPC; + } +#endif if (unlikely(im->ino_num >= sbi->max_orphans)) err = -ENOSPC; else @@ -478,10 +513,11 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) spin_unlock(&im->ino_lock); } -void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +void add_orphan_inode(struct inode *inode) { /* add new orphan ino entry into list */ - __add_ino_entry(sbi, ino, ORPHAN_INO); + __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO); + update_inode_page(inode); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -493,8 +529,20 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; + struct node_info ni; + int err = acquire_orphan_inode(sbi); - inode = f2fs_iget(sbi->sb, ino); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return err; + } + + __add_ino_entry(sbi, ino, ORPHAN_INO); + + inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) { /* * there should be a bug that we can't find the entry @@ -508,6 +556,18 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); + + get_node_info(sbi, ino, &ni); + + /* ENOMEM was fully retried in f2fs_evict_inode. */ + if (ni.blk_addr != NULL_ADDR) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return -EIO; + } + __remove_ino_entry(sbi, ino, ORPHAN_INO); return 0; } @@ -516,7 +576,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) block_t start_blk, orphan_blocks, i, j; int err; - if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) + if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); @@ -540,7 +600,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) f2fs_put_page(page, 1); } /* clear Orphan Flag */ - clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); + clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); return 0; } @@ -601,45 +661,55 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) } } +static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, + struct f2fs_checkpoint **cp_block, struct page **cp_page, + unsigned long long *version) +{ + unsigned long blk_size = sbi->blocksize; + size_t crc_offset = 0; + __u32 crc = 0; + + *cp_page = get_meta_page(sbi, cp_addr); + *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); + + crc_offset = le32_to_cpu((*cp_block)->checksum_offset); + if (crc_offset >= blk_size) { + f2fs_msg(sbi->sb, KERN_WARNING, + "invalid crc_offset: %zu", crc_offset); + return -EINVAL; + } + + crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block + + crc_offset))); + if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) { + f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value"); + return -EINVAL; + } + + *version = cur_cp_version(*cp_block); + return 0; +} + static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, block_t cp_addr, unsigned long 
long *version) { - struct page *cp_page_1, *cp_page_2 = NULL; - unsigned long blk_size = sbi->blocksize; - struct f2fs_checkpoint *cp_block; + struct page *cp_page_1 = NULL, *cp_page_2 = NULL; + struct f2fs_checkpoint *cp_block = NULL; unsigned long long cur_version = 0, pre_version = 0; - size_t crc_offset; - __u32 crc = 0; + int err; - /* Read the 1st cp block in this CP pack */ - cp_page_1 = get_meta_page(sbi, cp_addr); - - /* get the version number */ - cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1); - crc_offset = le32_to_cpu(cp_block->checksum_offset); - if (crc_offset >= blk_size) + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_1, version); + if (err) goto invalid_cp1; + pre_version = *version; - crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); - if (!f2fs_crc_valid(crc, cp_block, crc_offset)) - goto invalid_cp1; - - pre_version = cur_cp_version(cp_block); - - /* Read the 2nd cp block in this CP pack */ cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; - cp_page_2 = get_meta_page(sbi, cp_addr); - - cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2); - crc_offset = le32_to_cpu(cp_block->checksum_offset); - if (crc_offset >= blk_size) + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_2, version); + if (err) goto invalid_cp2; - - crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); - if (!f2fs_crc_valid(crc, cp_block, crc_offset)) - goto invalid_cp2; - - cur_version = cur_cp_version(cp_block); + cur_version = *version; if (cur_version == pre_version) { *version = cur_version; @@ -696,6 +766,10 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) cp_block = (struct f2fs_checkpoint *)page_address(cur_page); memcpy(sbi->ckpt, cp_block, blk_size); + /* Sanity checking of checkpoint */ + if (sanity_check_ckpt(sbi)) + goto fail_no_cp; + if (cp_blks <= 1) goto done; @@ -722,118 +796,94 @@ fail_no_cp: return -EINVAL; } -static int __add_dirty_inode(struct inode *inode, struct inode_entry *new) +static void __add_dirty_inode(struct inode *inode, enum inode_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; - if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) - return -EEXIST; + if (is_inode_flag_set(inode, flag)) + return; - set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - F2FS_I(inode)->dirty_dir = new; - list_add_tail(&new->list, &sbi->dir_inode_list); - stat_inc_dirty_dir(sbi); - return 0; + set_inode_flag(inode, flag); + list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); + stat_inc_dirty_inode(sbi, type); +} + +static void __remove_dirty_inode(struct inode *inode, enum inode_type type) +{ + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; + + if (get_dirty_pages(inode) || !is_inode_flag_set(inode, flag)) + return; + + list_del_init(&F2FS_I(inode)->dirty_list); + clear_inode_flag(inode, flag); + stat_dec_dirty_inode(F2FS_I_SB(inode), type); } void update_dirty_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new; - int ret = 0; + enum inode_type type = S_ISDIR(inode->i_mode) ? 
DIR_INODE : FILE_INODE; if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return; - if (!S_ISDIR(inode->i_mode)) { - inode_inc_dirty_pages(inode); - goto out; - } - - new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - new->inode = inode; - INIT_LIST_HEAD(&new->list); - - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); + spin_lock(&sbi->inode_lock[type]); + if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH)) + __add_dirty_inode(inode, type); inode_inc_dirty_pages(inode); - spin_unlock(&sbi->dir_inode_lock); + spin_unlock(&sbi->inode_lock[type]); - if (ret) - kmem_cache_free(inode_entry_slab, new); -out: SetPagePrivate(page); f2fs_trace_pid(page); } -void add_dirty_dir_inode(struct inode *inode) +void remove_dirty_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new = - f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - int ret = 0; + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; - new->inode = inode; - INIT_LIST_HEAD(&new->list); - - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); - spin_unlock(&sbi->dir_inode_lock); - - if (ret) - kmem_cache_free(inode_entry_slab, new); -} - -void remove_dirty_dir_inode(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *entry; - - if (!S_ISDIR(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) return; - spin_lock(&sbi->dir_inode_lock); - if (get_dirty_pages(inode) || - !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { - spin_unlock(&sbi->dir_inode_lock); + if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH)) return; - } - entry = F2FS_I(inode)->dirty_dir; - list_del(&entry->list); - F2FS_I(inode)->dirty_dir = NULL; - clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - stat_dec_dirty_dir(sbi); - spin_unlock(&sbi->dir_inode_lock); - kmem_cache_free(inode_entry_slab, entry); - - /* Only from the recovery routine */ - if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { - clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); - iput(inode); - } + spin_lock(&sbi->inode_lock[type]); + __remove_dirty_inode(inode, type); + spin_unlock(&sbi->inode_lock[type]); } -void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) +int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) { struct list_head *head; - struct inode_entry *entry; struct inode *inode; + struct f2fs_inode_info *fi; + bool is_dir = (type == DIR_INODE); + + trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); retry: if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; - spin_lock(&sbi->dir_inode_lock); + spin_lock(&sbi->inode_lock[type]); - head = &sbi->dir_inode_list; + head = &sbi->inode_list[type]; if (list_empty(head)) { - spin_unlock(&sbi->dir_inode_lock); - return; + spin_unlock(&sbi->inode_lock[type]); + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? 
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); + return 0; } - entry = list_entry(head->next, struct inode_entry, list); - inode = igrab(entry->inode); - spin_unlock(&sbi->dir_inode_lock); + fi = list_entry(head->next, struct f2fs_inode_info, dirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[type]); if (inode) { filemap_fdatawrite(inode->i_mapping); iput(inode); @@ -848,6 +898,34 @@ retry: goto retry; } +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &sbi->inode_list[DIRTY_META]; + struct inode *inode; + struct f2fs_inode_info *fi; + s64 total = get_pages(sbi, F2FS_DIRTY_IMETA); + + while (total--) { + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return 0; + } + fi = list_entry(head->next, struct f2fs_inode_info, + gdirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + if (inode) { + update_inode_page(inode); + iput(inode); + } + }; + return 0; +} + /* * Freeze all the FS-operations for checkpoint. */ @@ -868,11 +946,17 @@ retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - sync_dirty_dir_inodes(sbi); - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; + err = sync_dirty_inodes(sbi, DIR_INODE); + if (err) + goto out; + goto retry_flush_dents; + } + + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + f2fs_unlock_all(sbi); + err = f2fs_sync_inode_meta(sbi); + if (err) goto out; - } goto retry_flush_dents; } @@ -885,10 +969,9 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - sync_node_pages(sbi, 0, &wbc); - if (unlikely(f2fs_cp_error(sbi))) { + err = sync_node_pages(sbi, &wbc); + if (err) { f2fs_unlock_all(sbi); - err = -EIO; goto out; } goto retry_flush_nodes; @@ -901,6 +984,8 @@ out: static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); + + build_free_nids(sbi); f2fs_unlock_all(sbi); } @@ -911,18 +996,48 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!get_pages(sbi, F2FS_WRITEBACK)) + if (!atomic_read(&sbi->nr_wb_bios)) break; - io_schedule(); + io_schedule_timeout(5*HZ); } finish_wait(&sbi->cp_wait, &wait); } -static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + + spin_lock(&sbi->cp_lock); + + if (cpc->reason == CP_UMOUNT) + __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + + if (cpc->reason == CP_FASTBOOT) + __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + + if (orphan_num) + __set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + __set_ckpt_flags(ckpt, CP_FSCK_FLAG); + + /* set this flag to activate crc|cp_ver for recovery */ + __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); + + spin_unlock(&sbi->cp_lock); +} + +static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned long 
orphan_num = sbi->im[ORPHAN_INO].ino_num; nid_t last_nid = nm_i->next_scan_nid; @@ -931,21 +1046,15 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u32 crc32 = 0; int i; int cp_payload_blks = __cp_payload(sbi); - block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg); - bool invalidate = false; - - /* - * This avoids to conduct wrong roll-forward operations and uses - * metapages, so should be called prior to sync_meta_pages below. - */ - if (discard_next_dnode(sbi, discard_blk)) - invalidate = true; + struct super_block *sb = sbi->sb; + struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); + u64 kbytes_written; /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { sync_meta_pages(sbi, META, LONG_MAX); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; } next_free_nid(sbi, &last_nid); @@ -980,10 +1089,12 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); + spin_lock(&sbi->cp_lock); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) - set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else - clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + spin_unlock(&sbi->cp_lock); orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + @@ -998,29 +1109,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) cp_payload_blks + data_sum_blocks + orphan_blocks); - if (cpc->reason == CP_UMOUNT) - set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - else - clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - - if (cpc->reason == CP_FASTBOOT) - set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); - else - clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); - - if (orphan_num) - set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - else - clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) - set_ckpt_flags(ckpt, CP_FSCK_FLAG); + /* update ckpt flag for checkpoint */ + update_ckpt_flags(sbi, cpc); /* update SIT/NAT bitmap */ get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); - crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); + crc32 = f2fs_crc32(sbi, ckpt, le32_to_cpu(ckpt->checksum_offset)); *((__le32 *)((unsigned char *)ckpt + le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); @@ -1030,7 +1126,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); @@ -1046,6 +1142,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) write_data_summaries(sbi, start_blk); start_blk += data_sum_blocks; + + /* Record write statistics in the hot node summary */ + kbytes_written = sbi->kbytes_written; + if (sb->s_bdev->bd_part) + kbytes_written += BD_PART_WRITTEN(sbi); + + seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written); + if (__remain_node_summaries(cpc->reason)) { write_node_summaries(sbi, start_blk); start_blk += NR_CURSEG_NODE_TYPE; @@ -1058,14 +1162,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; - 
filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); - filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); + filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX); + filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX); /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; + percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); @@ -1073,30 +1177,36 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); - /* - * invalidate meta page which is used temporarily for zeroing out - * block at the end of warm node chain. - */ - if (invalidate) - invalidate_mapping_pages(META_MAPPING(sbi), discard_blk, - discard_blk); - - release_dirty_inode(sbi); + release_ino_entry(sbi, false); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); + clear_sbi_flag(sbi, SBI_NEED_CP); + + /* + * redirty superblock if metadata like node page or inode cache is + * updated during writing checkpoint. + */ + if (get_pages(sbi, F2FS_DIRTY_NODES) || + get_pages(sbi, F2FS_DIRTY_IMETA)) + set_sbi_flag(sbi, SBI_IS_DIRTY); + + f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS)); + + return 0; } /* * We guarantee that this checkpoint procedure will not fail. */ -void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; + int err = 0; mutex_lock(&sbi->cp_mutex); @@ -1104,21 +1214,35 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || (cpc->reason == CP_DISCARD && !sbi->discard_blks))) goto out; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; goto out; - if (f2fs_readonly(sbi->sb)) + } + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; goto out; + } trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); - if (block_operations(sbi)) + err = block_operations(sbi); + if (err) goto out; trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); - f2fs_submit_merged_bio(sbi, DATA, WRITE); - f2fs_submit_merged_bio(sbi, NODE, WRITE); - f2fs_submit_merged_bio(sbi, META, WRITE); + f2fs_flush_merged_bios(sbi); + + /* this is the case of multiple fstrims without any changes */ + if (cpc->reason == CP_DISCARD && !is_sbi_flag_set(sbi, SBI_IS_DIRTY)) { + f2fs_bug_on(sbi, NM_I(sbi)->dirty_nat_cnt); + f2fs_bug_on(sbi, SIT_I(sbi)->dirty_sentries); + f2fs_bug_on(sbi, prefree_segments(sbi)); + flush_sit_entries(sbi, cpc); + clear_prefree_segments(sbi, cpc); + unblock_operations(sbi); + goto out; + } /* * update checkpoint pack index @@ -1133,7 +1257,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ - do_checkpoint(sbi, cpc); + err = do_checkpoint(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); @@ -1143,10 +1267,11 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) "checkpoint: version = %llx", ckpt_ver); /* do checkpoint periodically */ - sbi->cp_expires = round_jiffies_up(jiffies + HZ * sbi->cp_interval); + f2fs_update_time(sbi, 
CP_TIME); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: mutex_unlock(&sbi->cp_mutex); - trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); + return err; } void init_ino_entry_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/crypto.c b/fs/f2fs/crypto.c deleted file mode 100644 index 4a62ef14e932..000000000000 --- a/fs/f2fs/crypto.c +++ /dev/null @@ -1,491 +0,0 @@ -/* - * linux/fs/f2fs/crypto.c - * - * Copied from linux/fs/ext4/crypto.c - * - * Copyright (C) 2015, Google, Inc. - * Copyright (C) 2015, Motorola Mobility - * - * This contains encryption functions for f2fs - * - * Written by Michael Halcrow, 2014. - * - * Filename encryption additions - * Uday Savagaonkar, 2014 - * Encryption policy handling additions - * Ildar Muslukhov, 2014 - * Remove ext4_encrypted_zeroout(), - * add f2fs_restore_and_release_control_page() - * Jaegeuk Kim, 2015. - * - * This has not yet undergone a rigorous security audit. - * - * The usage of AES-XTS should conform to recommendations in NIST - * Special Publication 800-38E and IEEE P1619/D16. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "f2fs.h" -#include "xattr.h" - -/* Encryption added and removed here! (L: */ - -static unsigned int num_prealloc_crypto_pages = 32; -static unsigned int num_prealloc_crypto_ctxs = 128; - -module_param(num_prealloc_crypto_pages, uint, 0444); -MODULE_PARM_DESC(num_prealloc_crypto_pages, - "Number of crypto pages to preallocate"); -module_param(num_prealloc_crypto_ctxs, uint, 0444); -MODULE_PARM_DESC(num_prealloc_crypto_ctxs, - "Number of crypto contexts to preallocate"); - -static mempool_t *f2fs_bounce_page_pool; - -static LIST_HEAD(f2fs_free_crypto_ctxs); -static DEFINE_SPINLOCK(f2fs_crypto_ctx_lock); - -static struct workqueue_struct *f2fs_read_workqueue; -static DEFINE_MUTEX(crypto_init); - -static struct kmem_cache *f2fs_crypto_ctx_cachep; -struct kmem_cache *f2fs_crypt_info_cachep; - -/** - * f2fs_release_crypto_ctx() - Releases an encryption context - * @ctx: The encryption context to release. - * - * If the encryption context was allocated from the pre-allocated pool, returns - * it to that pool. Else, frees it. - * - * If there's a bounce page in the context, this frees that. - */ -void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *ctx) -{ - unsigned long flags; - - if (ctx->flags & F2FS_WRITE_PATH_FL && ctx->w.bounce_page) { - mempool_free(ctx->w.bounce_page, f2fs_bounce_page_pool); - ctx->w.bounce_page = NULL; - } - ctx->w.control_page = NULL; - if (ctx->flags & F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL) { - kmem_cache_free(f2fs_crypto_ctx_cachep, ctx); - } else { - spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags); - list_add(&ctx->free_list, &f2fs_free_crypto_ctxs); - spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags); - } -} - -/** - * f2fs_get_crypto_ctx() - Gets an encryption context - * @inode: The inode for which we are doing the crypto - * - * Allocates and initializes an encryption context. - * - * Return: An allocated and initialized encryption context on success; error - * value or NULL otherwise. 
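Usage of this helper is a strict get/release pairing around each crypto operation. A minimal sketch of a caller, mirroring the convenience helper f2fs_decrypt_one() later in this file (hypothetical code, not part of the patch):

        struct f2fs_crypto_ctx *ctx = f2fs_get_crypto_ctx(inode);

        if (IS_ERR(ctx))
                return PTR_ERR(ctx);
        /* ... encrypt or decrypt pages using ctx ... */
        f2fs_release_crypto_ctx(ctx);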
- */ -struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *inode) -{ - struct f2fs_crypto_ctx *ctx = NULL; - unsigned long flags; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - - if (ci == NULL) - return ERR_PTR(-ENOKEY); - - /* - * We first try getting the ctx from a free list because in - * the common case the ctx will have an allocated and - * initialized crypto tfm, so it's probably a worthwhile - * optimization. For the bounce page, we first try getting it - * from the kernel allocator because that's just about as fast - * as getting it from a list and because a cache of free pages - * should generally be a "last resort" option for a filesystem - * to be able to do its job. - */ - spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags); - ctx = list_first_entry_or_null(&f2fs_free_crypto_ctxs, - struct f2fs_crypto_ctx, free_list); - if (ctx) - list_del(&ctx->free_list); - spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags); - if (!ctx) { - ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_NOFS); - if (!ctx) - return ERR_PTR(-ENOMEM); - ctx->flags |= F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags &= ~F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL; - } - ctx->flags &= ~F2FS_WRITE_PATH_FL; - return ctx; -} - -/* - * Call f2fs_decrypt on every single page, reusing the encryption - * context. - */ -static void completion_pages(struct work_struct *work) -{ - struct f2fs_crypto_ctx *ctx = - container_of(work, struct f2fs_crypto_ctx, r.work); - struct bio *bio = ctx->r.bio; - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - int ret = f2fs_decrypt(ctx, page); - - if (ret) { - WARN_ON_ONCE(1); - SetPageError(page); - } else - SetPageUptodate(page); - unlock_page(page); - } - f2fs_release_crypto_ctx(ctx); - bio_put(bio); -} - -void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *ctx, struct bio *bio) -{ - INIT_WORK(&ctx->r.work, completion_pages); - ctx->r.bio = bio; - queue_work(f2fs_read_workqueue, &ctx->r.work); -} - -static void f2fs_crypto_destroy(void) -{ - struct f2fs_crypto_ctx *pos, *n; - - list_for_each_entry_safe(pos, n, &f2fs_free_crypto_ctxs, free_list) - kmem_cache_free(f2fs_crypto_ctx_cachep, pos); - INIT_LIST_HEAD(&f2fs_free_crypto_ctxs); - if (f2fs_bounce_page_pool) - mempool_destroy(f2fs_bounce_page_pool); - f2fs_bounce_page_pool = NULL; -} - -/** - * f2fs_crypto_initialize() - Set up for f2fs encryption. - * - * We only call this when we start accessing encrypted files, since it - * results in memory getting allocated that wouldn't otherwise be used. - * - * Return: Zero on success, non-zero otherwise. 
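The body that follows has the classic check-lock-recheck shape for lazy one-time setup; schematically (a condensed sketch, not the verbatim body):

        if (f2fs_bounce_page_pool)      /* fast path: already set up */
                return 0;
        mutex_lock(&crypto_init);
        if (f2fs_bounce_page_pool)      /* lost a race: another task finished first */
                goto already_initialized;
        /* preallocate contexts, then create the bounce page pool last */

The pool is deliberately allocated last: its non-NULL value is what the unlocked fast path tests, so a visible pool reliably implies fully initialized state.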
- */ -int f2fs_crypto_initialize(void) -{ - int i, res = -ENOMEM; - - if (f2fs_bounce_page_pool) - return 0; - - mutex_lock(&crypto_init); - if (f2fs_bounce_page_pool) - goto already_initialized; - - for (i = 0; i < num_prealloc_crypto_ctxs; i++) { - struct f2fs_crypto_ctx *ctx; - - ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_KERNEL); - if (!ctx) - goto fail; - list_add(&ctx->free_list, &f2fs_free_crypto_ctxs); - } - - /* must be allocated at the last step to avoid race condition above */ - f2fs_bounce_page_pool = - mempool_create_page_pool(num_prealloc_crypto_pages, 0); - if (!f2fs_bounce_page_pool) - goto fail; - -already_initialized: - mutex_unlock(&crypto_init); - return 0; -fail: - f2fs_crypto_destroy(); - mutex_unlock(&crypto_init); - return res; -} - -/** - * f2fs_exit_crypto() - Shutdown the f2fs encryption system - */ -void f2fs_exit_crypto(void) -{ - f2fs_crypto_destroy(); - - if (f2fs_read_workqueue) - destroy_workqueue(f2fs_read_workqueue); - if (f2fs_crypto_ctx_cachep) - kmem_cache_destroy(f2fs_crypto_ctx_cachep); - if (f2fs_crypt_info_cachep) - kmem_cache_destroy(f2fs_crypt_info_cachep); -} - -int __init f2fs_init_crypto(void) -{ - int res = -ENOMEM; - - f2fs_read_workqueue = alloc_workqueue("f2fs_crypto", WQ_HIGHPRI, 0); - if (!f2fs_read_workqueue) - goto fail; - - f2fs_crypto_ctx_cachep = KMEM_CACHE(f2fs_crypto_ctx, - SLAB_RECLAIM_ACCOUNT); - if (!f2fs_crypto_ctx_cachep) - goto fail; - - f2fs_crypt_info_cachep = KMEM_CACHE(f2fs_crypt_info, - SLAB_RECLAIM_ACCOUNT); - if (!f2fs_crypt_info_cachep) - goto fail; - - return 0; -fail: - f2fs_exit_crypto(); - return res; -} - -void f2fs_restore_and_release_control_page(struct page **page) -{ - struct f2fs_crypto_ctx *ctx; - struct page *bounce_page; - - /* The bounce data pages are unmapped. */ - if ((*page)->mapping) - return; - - /* The bounce data page is unmapped. 
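The linkage being unwound here was established on the write path: the ciphertext (bounce) page carries the encryption context in its page_private, and the context remembers the original plaintext (control) page. In outline (the assignments appear verbatim in f2fs_encrypt() below):

        ctx->w.control_page = plaintext_page;
        set_page_private(ciphertext_page, (unsigned long)ctx);
        /* completion path walks back: bounce page -> ctx -> control page */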
*/ - bounce_page = *page; - ctx = (struct f2fs_crypto_ctx *)page_private(bounce_page); - - /* restore control page */ - *page = ctx->w.control_page; - - f2fs_restore_control_page(bounce_page); -} - -void f2fs_restore_control_page(struct page *data_page) -{ - struct f2fs_crypto_ctx *ctx = - (struct f2fs_crypto_ctx *)page_private(data_page); - - set_page_private(data_page, (unsigned long)NULL); - ClearPagePrivate(data_page); - unlock_page(data_page); - f2fs_release_crypto_ctx(ctx); -} - -/** - * f2fs_crypt_complete() - The completion callback for page encryption - * @req: The asynchronous encryption request context - * @res: The result of the encryption operation - */ -static void f2fs_crypt_complete(struct crypto_async_request *req, int res) -{ - struct f2fs_completion_result *ecr = req->data; - - if (res == -EINPROGRESS) - return; - ecr->res = res; - complete(&ecr->completion); -} - -typedef enum { - F2FS_DECRYPT = 0, - F2FS_ENCRYPT, -} f2fs_direction_t; - -static int f2fs_page_crypto(struct f2fs_crypto_ctx *ctx, - struct inode *inode, - f2fs_direction_t rw, - pgoff_t index, - struct page *src_page, - struct page *dest_page) -{ - u8 xts_tweak[F2FS_XTS_TWEAK_SIZE]; - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); - struct scatterlist dst, src; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; - int res = 0; - - req = ablkcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", - __func__); - return -ENOMEM; - } - ablkcipher_request_set_callback( - req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - f2fs_crypt_complete, &ecr); - - BUILD_BUG_ON(F2FS_XTS_TWEAK_SIZE < sizeof(index)); - memcpy(xts_tweak, &index, sizeof(index)); - memset(&xts_tweak[sizeof(index)], 0, - F2FS_XTS_TWEAK_SIZE - sizeof(index)); - - sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); - sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); - ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, - xts_tweak); - if (rw == F2FS_DECRYPT) - res = crypto_ablkcipher_decrypt(req); - else - res = crypto_ablkcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); - wait_for_completion(&ecr.completion); - res = ecr.res; - } - ablkcipher_request_free(req); - if (res) { - printk_ratelimited(KERN_ERR - "%s: crypto_ablkcipher_encrypt() returned %d\n", - __func__, res); - return res; - } - return 0; -} - -static struct page *alloc_bounce_page(struct f2fs_crypto_ctx *ctx) -{ - ctx->w.bounce_page = mempool_alloc(f2fs_bounce_page_pool, GFP_NOWAIT); - if (ctx->w.bounce_page == NULL) - return ERR_PTR(-ENOMEM); - ctx->flags |= F2FS_WRITE_PATH_FL; - return ctx->w.bounce_page; -} - -/** - * f2fs_encrypt() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. - * - * Called on the page write path. The caller must call - * f2fs_restore_control_page() on the returned ciphertext page to - * release the bounce buffer and the encryption context. - * - * Return: An allocated page with the encrypted content on success. Else, an - * error value or NULL. 
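Restating the contract above as a hypothetical write-path caller (the I/O submission step is elided):

        struct page *cpage = f2fs_encrypt(inode, plaintext_page);

        if (IS_ERR(cpage))
                return PTR_ERR(cpage);
        /* ... submit cpage for write I/O and wait for completion ... */
        f2fs_restore_control_page(cpage);       /* releases bounce page and ctx */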
- */ -struct page *f2fs_encrypt(struct inode *inode, - struct page *plaintext_page) -{ - struct f2fs_crypto_ctx *ctx; - struct page *ciphertext_page = NULL; - int err; - - BUG_ON(!PageLocked(plaintext_page)); - - ctx = f2fs_get_crypto_ctx(inode); - if (IS_ERR(ctx)) - return (struct page *)ctx; - - /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx); - if (IS_ERR(ciphertext_page)) - goto err_out; - - ctx->w.control_page = plaintext_page; - err = f2fs_page_crypto(ctx, inode, F2FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page); - if (err) { - ciphertext_page = ERR_PTR(err); - goto err_out; - } - - SetPagePrivate(ciphertext_page); - set_page_private(ciphertext_page, (unsigned long)ctx); - lock_page(ciphertext_page); - return ciphertext_page; - -err_out: - f2fs_release_crypto_ctx(ctx); - return ciphertext_page; -} - -/** - * f2fs_decrypt() - Decrypts a page in-place - * @ctx: The encryption context. - * @page: The page to decrypt. Must be locked. - * - * Decrypts page in-place using the ctx encryption context. - * - * Called from the read completion callback. - * - * Return: Zero on success, non-zero otherwise. - */ -int f2fs_decrypt(struct f2fs_crypto_ctx *ctx, struct page *page) -{ - BUG_ON(!PageLocked(page)); - - return f2fs_page_crypto(ctx, page->mapping->host, - F2FS_DECRYPT, page->index, page, page); -} - -/* - * Convenience function which takes care of allocating and - * deallocating the encryption context - */ -int f2fs_decrypt_one(struct inode *inode, struct page *page) -{ - struct f2fs_crypto_ctx *ctx = f2fs_get_crypto_ctx(inode); - int ret; - - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - ret = f2fs_decrypt(ctx, page); - f2fs_release_crypto_ctx(ctx); - return ret; -} - -bool f2fs_valid_contents_enc_mode(uint32_t mode) -{ - return (mode == F2FS_ENCRYPTION_MODE_AES_256_XTS); -} - -/** - * f2fs_validate_encryption_key_size() - Validate the encryption key size - * @mode: The key mode. - * @size: The key size to validate. - * - * Return: The validated key size for @mode. Zero if invalid. 
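A hypothetical call site during key setup treats the zero return as the failure signal (master_key is an illustrative name here, not taken from this patch):

        if (!f2fs_validate_encryption_key_size(mode, master_key->size))
                return -ENOKEY;         /* keyring payload has the wrong length */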
- */ -uint32_t f2fs_validate_encryption_key_size(uint32_t mode, uint32_t size) -{ - if (size == f2fs_encryption_key_size(mode)) - return size; - return 0; -} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8936044dee4c..fc9bfd8f566d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include "f2fs.h" @@ -33,11 +35,16 @@ static void f2fs_read_end_io(struct bio *bio) struct bio_vec *bvec; int i; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) + bio->bi_error = -EIO; +#endif + if (f2fs_bio_encrypted(bio)) { if (bio->bi_error) { - f2fs_release_crypto_ctx(bio->bi_private); + fscrypt_release_ctx(bio->bi_private); } else { - f2fs_end_io_crypto_work(bio->bi_private, bio); + fscrypt_decrypt_bio_pages(bio->bi_private, bio); return; } } @@ -46,7 +53,8 @@ static void f2fs_read_end_io(struct bio *bio) struct page *page = bvec->bv_page; if (!bio->bi_error) { - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); } else { ClearPageUptodate(page); SetPageError(page); @@ -65,19 +73,16 @@ static void f2fs_write_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - f2fs_restore_and_release_control_page(&page); + fscrypt_pullback_bio_page(&page, true); if (unlikely(bio->bi_error)) { - set_page_dirty(page); set_bit(AS_EIO, &page->mapping->flags); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, true); } end_page_writeback(page); - dec_page_count(sbi, F2FS_WRITEBACK); } - - if (!get_pages(sbi, F2FS_WRITEBACK) && - !list_empty(&sbi->cp_wait.task_list)) + if (atomic_dec_and_test(&sbi->nr_wb_bios) && + wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); bio_put(bio); @@ -101,6 +106,18 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, return bio; } +static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, + struct bio *bio, enum page_type type) +{ + if (!is_read_io(rw)) { + atomic_inc(&sbi->nr_wb_bios); + if (f2fs_sb_mounted_hmsmr(sbi->sb) && + current->plug && (type == DATA || type == NODE)) + blk_finish_plug(current->plug); + } + submit_bio(rw, bio); +} + static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -113,12 +130,58 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) else trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - submit_bio(fio->rw, io->bio); + __submit_bio(io->sbi, fio->rw, io->bio, fio->type); io->bio = NULL; } -void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, - enum page_type type, int rw) +static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, + struct page *page, nid_t ino) +{ + struct bio_vec *bvec; + struct page *target; + int i; + + if (!io->bio) + return false; + + if (!inode && !page && !ino) + return true; + + bio_for_each_segment_all(bvec, io->bio, i) { + + if (bvec->bv_page->mapping) + target = bvec->bv_page; + else + target = fscrypt_control_page(bvec->bv_page); + + if (inode && inode == target->mapping->host) + return true; + if (page && page == target) + return true; + if (ino && ino == ino_of_node(target)) + return true; + } + + return false; +} + +static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, + struct page *page, nid_t ino, + enum page_type type) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = &sbi->write_io[btype]; + bool ret; + + down_read(&io->io_rwsem); + ret = __has_merged_page(io, 
inode, page, ino); + up_read(&io->io_rwsem); + return ret; +} + +static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, + struct inode *inode, struct page *page, + nid_t ino, enum page_type type, int rw) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io; @@ -127,6 +190,9 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, down_write(&io->io_rwsem); + if (!__has_merged_page(io, inode, page, ino)) + goto out; + /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; @@ -136,9 +202,31 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; } __submit_merged_bio(io); +out: up_write(&io->io_rwsem); } +void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, + int rw) +{ + __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw); +} + +void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, + struct inode *inode, struct page *page, + nid_t ino, enum page_type type, int rw) +{ + if (has_merged_page(sbi, inode, page, ino, type)) + __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw); +} + +void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) +{ + f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_bio(sbi, META, WRITE); +} + /* * Fill the locked page with data located in the block address. * Return unlocked page. @@ -146,20 +234,21 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, int f2fs_submit_page_bio(struct f2fs_io_info *fio) { struct bio *bio; - struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; + struct page *page = fio->encrypted_page ? + fio->encrypted_page : fio->page; trace_f2fs_submit_page_bio(page, fio); f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(fio->sbi, fio->blk_addr, 1, is_read_io(fio->rw)); + bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw)); - if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; } - submit_bio(fio->rw, bio); + __submit_bio(fio->sbi, fio->rw, bio, fio->type); return 0; } @@ -173,39 +262,49 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) io = is_read ? &sbi->read_io : &sbi->write_io[btype]; - verify_block_addr(sbi, fio->blk_addr); + if (fio->old_blkaddr != NEW_ADDR) + verify_block_addr(sbi, fio->old_blkaddr); + verify_block_addr(sbi, fio->new_blkaddr); down_write(&io->io_rwsem); - if (!is_read) - inc_page_count(sbi, F2FS_WRITEBACK); - - if (io->bio && (io->last_block_in_bio != fio->blk_addr - 1 || + if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || io->fio.rw != fio->rw)) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { int bio_blocks = MAX_BIO_BLOCKS(sbi); - io->bio = __bio_alloc(sbi, fio->blk_addr, bio_blocks, is_read); + io->bio = __bio_alloc(sbi, fio->new_blkaddr, + bio_blocks, is_read); io->fio = *fio; } bio_page = fio->encrypted_page ? 
fio->encrypted_page : fio->page; - if (bio_add_page(io->bio, bio_page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { + if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < + PAGE_SIZE) { __submit_merged_bio(io); goto alloc_new; } - io->last_block_in_bio = fio->blk_addr; + io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); up_write(&io->io_rwsem); trace_f2fs_submit_page_mbio(fio->page, fio); } +static void __set_data_blkaddr(struct dnode_of_data *dn) +{ + struct f2fs_node *rn = F2FS_NODE(dn->node_page); + __le32 *addr_array; + + /* Get physical address of data block */ + addr_array = blkaddr_in_node(rn); + addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); +} + /* * Lock ordering for the change of data block address: * ->data_page @@ -214,39 +313,63 @@ alloc_new: */ void set_data_blkaddr(struct dnode_of_data *dn) { - struct f2fs_node *rn; - __le32 *addr_array; - struct page *node_page = dn->node_page; - unsigned int ofs_in_node = dn->ofs_in_node; - - f2fs_wait_on_page_writeback(node_page, NODE); - - rn = F2FS_NODE(node_page); - - /* Get physical address of data block */ - addr_array = blkaddr_in_node(rn); - addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr); - set_page_dirty(node_page); + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + __set_data_blkaddr(dn); + if (set_page_dirty(dn->node_page)) + dn->node_changed = true; } -int reserve_new_block(struct dnode_of_data *dn) +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) +{ + dn->data_blkaddr = blkaddr; + set_data_blkaddr(dn); + f2fs_update_extent_cache(dn); +} + +/* dn->ofs_in_node will be returned with up-to-date last block pointer */ +int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (!count) + return 0; + + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) return -ENOSPC; - trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); + trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, + dn->ofs_in_node, count); - dn->data_blkaddr = NEW_ADDR; - set_data_blkaddr(dn); - mark_inode_dirty(dn->inode); - sync_inode_page(dn); + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + + for (; count > 0; dn->ofs_in_node++) { + block_t blkaddr = + datablock_addr(dn->node_page, dn->ofs_in_node); + if (blkaddr == NULL_ADDR) { + dn->data_blkaddr = NEW_ADDR; + __set_data_blkaddr(dn); + count--; + } + } + + if (set_page_dirty(dn->node_page)) + dn->node_changed = true; return 0; } +/* Should keep dn->ofs_in_node unchanged */ +int reserve_new_block(struct dnode_of_data *dn) +{ + unsigned int ofs_in_node = dn->ofs_in_node; + int ret; + + ret = reserve_new_blocks(dn, 1); + dn->ofs_in_node = ofs_in_node; + return ret; +} + int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) { bool need_put = dn->inode_page ? false : true; @@ -326,13 +449,14 @@ got_it: * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata. 
*/ if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); unlock_page(page); return page; } - fio.blk_addr = dn.data_blkaddr; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; fio.page = page; err = f2fs_submit_page_bio(&fio); if (err) @@ -386,14 +510,14 @@ repeat: /* wait for read completion */ lock_page(page); - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } return page; } @@ -413,7 +537,7 @@ struct page *get_new_data_page(struct inode *inode, struct page *page; struct dnode_of_data dn; int err; -repeat: + page = f2fs_grab_cache_page(mapping, index, true); if (!page) { /* @@ -437,45 +561,42 @@ repeat: goto got_it; if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); } else { f2fs_put_page(page, 1); - page = get_read_data_page(inode, index, READ_SYNC, true); + /* if ipage exists, blkaddr should be NEW_ADDR */ + f2fs_bug_on(F2FS_I_SB(inode), ipage); + page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - goto repeat; - - /* wait for read completion */ - lock_page(page); + return page; } got_it: if (new_i_size && i_size_read(inode) < - ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)) { - i_size_write(inode, ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)); - /* Only the directory inode sets new_i_size */ - set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - } + ((loff_t)(index + 1) << PAGE_SHIFT)) + f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT)); return page; } static int __allocate_data_block(struct dnode_of_data *dn) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - struct f2fs_inode_info *fi = F2FS_I(dn->inode); struct f2fs_summary sum; struct node_info ni; int seg = CURSEG_WARM_DATA; pgoff_t fofs; + blkcnt_t count = 1; - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) goto alloc; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) return -ENOSPC; alloc: @@ -490,72 +611,43 @@ alloc: set_data_blkaddr(dn); /* update i_size */ - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + dn->ofs_in_node; - if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)) - i_size_write(dn->inode, - ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)); - - /* direct IO doesn't use extent cache to maximize the performance */ - f2fs_drop_largest_extent(dn->inode, fofs); - + if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT)) + f2fs_i_size_write(dn->inode, + ((loff_t)(fofs + 1) << PAGE_SHIFT)); return 0; } -static void __allocate_data_blocks(struct inode *inode, loff_t offset, - size_t count) +ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct dnode_of_data dn; - u64 start = F2FS_BYTES_TO_BLK(offset); - u64 len = F2FS_BYTES_TO_BLK(count); - bool allocated; - u64 
end_offset; + struct inode *inode = file_inode(iocb->ki_filp); + struct f2fs_map_blocks map; + ssize_t ret = 0; - while (len) { - f2fs_balance_fs(sbi); - f2fs_lock_op(sbi); + map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); + map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); + if (map.m_len > map.m_lblk) + map.m_len -= map.m_lblk; + else + map.m_len = 0; - /* When reading holes, we need its node page */ - set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, start, ALLOC_NODE)) - goto out; + map.m_next_pgofs = NULL; - allocated = false; - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); - - while (dn.ofs_in_node < end_offset && len) { - block_t blkaddr; - - if (unlikely(f2fs_cp_error(sbi))) - goto sync_out; - - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); - if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) { - if (__allocate_data_block(&dn)) - goto sync_out; - allocated = true; - } - len--; - start++; - dn.ofs_in_node++; - } - - if (allocated) - sync_inode_page(&dn); - - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); + if (iocb->ki_flags & IOCB_DIRECT) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); } - return; - -sync_out: - if (allocated) - sync_inode_page(&dn); - f2fs_put_dnode(&dn); -out: - f2fs_unlock_op(sbi); - return; + if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + if (!f2fs_has_inline_data(inode)) + return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + return ret; } /* @@ -567,156 +659,181 @@ out: * b. do not use extent cache for better performance * c. give the block addresses to blockdev */ -static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag) { unsigned int maxblocks = map->m_len; struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; - pgoff_t pgofs, end_offset; + int mode = create ? 
ALLOC_NODE : LOOKUP_NODE; + pgoff_t pgofs, end_offset, end; int err = 0, ofs = 1; + unsigned int ofs_in_node, last_ofs_in_node; + blkcnt_t prealloc; struct extent_info ei; bool allocated = false; + block_t blkaddr; + + if (!maxblocks) + return 0; map->m_len = 0; map->m_flags = 0; /* it only supports block size == page size */ pgofs = (pgoff_t)map->m_lblk; + end = pgofs + maxblocks; - if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { map->m_pblk = ei.blk + pgofs - ei.fofs; map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); map->m_flags = F2FS_MAP_MAPPED; goto out; } +next_dnode: if (create) - f2fs_lock_op(F2FS_I_SB(inode)); + f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, mode); if (err) { - if (err == -ENOENT) + if (flag == F2FS_GET_BLOCK_BMAP) + map->m_pblk = 0; + if (err == -ENOENT) { err = 0; + if (map->m_next_pgofs) + *map->m_next_pgofs = + get_next_page_offset(&dn, pgofs); + } goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) { + prealloc = 0; + ofs_in_node = dn.ofs_in_node; + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + +next_block: + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; - goto put_out; + goto sync_out; + } + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (blkaddr == NULL_ADDR) { + prealloc++; + last_ofs_in_node = dn.ofs_in_node; + } + } else { + err = __allocate_data_block(&dn); + if (!err) { + set_inode_flag(inode, FI_APPEND_WRITE); + allocated = true; + } } - err = __allocate_data_block(&dn); if (err) - goto put_out; - allocated = true; + goto sync_out; map->m_flags = F2FS_MAP_NEW; + blkaddr = dn.data_blkaddr; } else { - if (flag != F2FS_GET_BLOCK_FIEMAP || - dn.data_blkaddr != NEW_ADDR) { - if (flag == F2FS_GET_BLOCK_BMAP) - err = -ENOENT; - goto put_out; + if (flag == F2FS_GET_BLOCK_BMAP) { + map->m_pblk = 0; + goto sync_out; } - - /* - * preallocated unwritten block should be mapped - * for fiemap. - */ - if (dn.data_blkaddr == NEW_ADDR) - map->m_flags = F2FS_MAP_UNWRITTEN; + if (flag == F2FS_GET_BLOCK_FIEMAP && + blkaddr == NULL_ADDR) { + if (map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; + } + if (flag != F2FS_GET_BLOCK_FIEMAP || + blkaddr != NEW_ADDR) + goto sync_out; } } - map->m_flags |= F2FS_MAP_MAPPED; - map->m_pblk = dn.data_blkaddr; - map->m_len = 1; + if (flag == F2FS_GET_BLOCK_PRE_AIO) + goto skip; - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + if (map->m_len == 0) { + /* preallocated unwritten block should be mapped for fiemap. 
*/ + if (blkaddr == NEW_ADDR) + map->m_flags |= F2FS_MAP_UNWRITTEN; + map->m_flags |= F2FS_MAP_MAPPED; + + map->m_pblk = blkaddr; + map->m_len = 1; + } else if ((map->m_pblk != NEW_ADDR && + blkaddr == (map->m_pblk + ofs)) || + (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) || + flag == F2FS_GET_BLOCK_PRE_DIO) { + ofs++; + map->m_len++; + } else { + goto sync_out; + } + +skip: dn.ofs_in_node++; pgofs++; -get_next: - if (dn.ofs_in_node >= end_offset) { - if (allocated) - sync_inode_page(&dn); - allocated = false; - f2fs_put_dnode(&dn); + /* preallocate blocks in batch for one dnode page */ + if (flag == F2FS_GET_BLOCK_PRE_AIO && + (pgofs == end || dn.ofs_in_node == end_offset)) { - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, mode); - if (err) { - if (err == -ENOENT) - err = 0; - goto unlock_out; + dn.ofs_in_node = ofs_in_node; + err = reserve_new_blocks(&dn, prealloc); + if (err) + goto sync_out; + allocated = dn.node_changed; + + map->m_len += dn.ofs_in_node - ofs_in_node; + if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { + err = -ENOSPC; + goto sync_out; } - - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + dn.ofs_in_node = end_offset; } - if (maxblocks > map->m_len) { - block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (pgofs >= end) + goto sync_out; + else if (dn.ofs_in_node < end_offset) + goto next_block; - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { - if (create) { - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; - goto sync_out; - } - err = __allocate_data_block(&dn); - if (err) - goto sync_out; - allocated = true; - map->m_flags |= F2FS_MAP_NEW; - blkaddr = dn.data_blkaddr; - } else { - /* - * we only merge preallocated unwritten blocks - * for fiemap. - */ - if (flag != F2FS_GET_BLOCK_FIEMAP || - blkaddr != NEW_ADDR) - goto sync_out; - } - } + f2fs_put_dnode(&dn); - /* Give more consecutive addresses for the readahead */ - if ((map->m_pblk != NEW_ADDR && - blkaddr == (map->m_pblk + ofs)) || - (map->m_pblk == NEW_ADDR && - blkaddr == NEW_ADDR)) { - ofs++; - dn.ofs_in_node++; - pgofs++; - map->m_len++; - goto get_next; - } + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, allocated); } + allocated = false; + goto next_dnode; + sync_out: - if (allocated) - sync_inode_page(&dn); -put_out: f2fs_put_dnode(&dn); unlock_out: - if (create) - f2fs_unlock_op(F2FS_I_SB(inode)); + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, allocated); + } out: trace_f2fs_map_blocks(inode, map, err); return err; } static int __get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create, int flag) + struct buffer_head *bh, int create, int flag, + pgoff_t *next_pgofs) { struct f2fs_map_blocks map; int ret; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; + map.m_next_pgofs = next_pgofs; ret = f2fs_map_blocks(inode, &map, create, flag); if (!ret) { @@ -728,23 +845,29 @@ static int __get_data_block(struct inode *inode, sector_t iblock, } static int get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create, int flag) + struct buffer_head *bh_result, int create, int flag, + pgoff_t *next_pgofs) { - return __get_data_block(inode, iblock, bh_result, create, flag); + return __get_data_block(inode, iblock, bh_result, create, + flag, next_pgofs); } static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - 
F2FS_GET_BLOCK_DIO); + F2FS_GET_BLOCK_DIO, NULL); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + /* Block number less than F2FS MAX BLOCKS */ + if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks)) + return -EFBIG; + return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_BMAP); + F2FS_GET_BLOCK_BMAP, NULL); } static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) @@ -762,10 +885,10 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { struct buffer_head map_bh; sector_t start_blk, last_blk; - loff_t isize = i_size_read(inode); + pgoff_t next_pgofs; + loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; - bool past_eof = false, whole_file = false; int ret = 0; ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); @@ -778,82 +901,64 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); - if (len >= isize) { - whole_file = true; - len = isize; - } + isize = i_size_read(inode); + if (start >= isize) + goto out; + + if (start + len > isize) + len = isize - start; if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); start_blk = logical_to_blk(inode, start); last_blk = logical_to_blk(inode, start + len - 1); + next: memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; ret = get_data_block(inode, start_blk, &map_bh, 0, - F2FS_GET_BLOCK_FIEMAP); + F2FS_GET_BLOCK_FIEMAP, &next_pgofs); if (ret) goto out; /* HOLE */ if (!buffer_mapped(&map_bh)) { - start_blk++; - - if (!past_eof && blk_to_logical(inode, start_blk) >= isize) - past_eof = 1; - - if (past_eof && size) { - flags |= FIEMAP_EXTENT_LAST; - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - } else if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - size = 0; - } - - /* if we have holes up to/past EOF then we're done */ - if (start_blk > last_blk || past_eof || ret) - goto out; - } else { - if (start_blk > last_blk && !whole_file) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - goto out; - } - - /* - * if size != 0 then we know we already have an extent - * to add, so add it. + start_blk = next_pgofs; + /* Go through holes util pass the EOF */ + if (blk_to_logical(inode, start_blk) < isize) + goto prep_next; + /* Found a hole beyond isize means no more extents. + * Note that the premise is that filesystems don't + * punch holes beyond isize and keep size unchanged. 
*/ - if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - if (ret) - goto out; - } - - logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, map_bh.b_blocknr); - size = map_bh.b_size; - flags = 0; - if (buffer_unwritten(&map_bh)) - flags = FIEMAP_EXTENT_UNWRITTEN; - - start_blk += logical_to_blk(inode, size); - - /* - * If we are past the EOF, then we need to make sure as - * soon as we find a hole that the last extent we found - * is marked with FIEMAP_EXTENT_LAST - */ - if (!past_eof && logical + size >= isize) - past_eof = true; + flags |= FIEMAP_EXTENT_LAST; } + + if (size) { + if (f2fs_encrypted_inode(inode)) + flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; + + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + } + + if (start_blk > last_blk || ret) + goto out; + + logical = blk_to_logical(inode, start_blk); + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; + flags = 0; + if (buffer_unwritten(&map_bh)) + flags = FIEMAP_EXTENT_UNWRITTEN; + + start_blk += logical_to_blk(inode, size); + +prep_next: cond_resched(); if (fatal_signal_pending(current)) ret = -EINTR; @@ -863,10 +968,41 @@ out: if (ret == 1) ret = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } +static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fscrypt_ctx *ctx = NULL; + struct block_device *bdev = sbi->sb->s_bdev; + struct bio *bio; + + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return ERR_CAST(ctx); + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + } + + bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + if (!bio) { + if (ctx) + fscrypt_release_ctx(ctx); + return ERR_PTR(-ENOMEM); + } + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + + return bio; +} + /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. 
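
Editor's aside (illustrative, not part of the patch): the new f2fs_grab_bio() above centralizes read-bio construction — for encrypted regular files it allocates the fscrypt context and parks it in bio->bi_private, where the reworked f2fs_read_end_io() either releases it on error or queues decryption on success. A minimal caller-side sketch of that pattern, condensed from the single-page read path this patch later adds to f2fs_write_begin(); the function name here is hypothetical:

	/*
	 * Sketch only (not part of the patch): one-page synchronous read
	 * via f2fs_grab_bio(). Name f2fs_read_one_page() is hypothetical.
	 */
	static int f2fs_read_one_page(struct inode *inode, struct page *page,
					block_t blkaddr)
	{
		struct bio *bio;

		/* allocates the bio and, for encrypted regular files, the
		 * fscrypt context that ends up in bio->bi_private */
		bio = f2fs_grab_bio(inode, blkaddr, 1);
		if (IS_ERR(bio))
			return PTR_ERR(bio);

		if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
			bio_put(bio);
			return -EFAULT;
		}

		/* f2fs_read_end_io() releases the fscrypt context on error,
		 * or queues fscrypt_decrypt_bio_pages() on success */
		__submit_bio(F2FS_I_SB(inode), READ_SYNC, bio, DATA);
		return 0;
	}
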
@@ -885,13 +1021,13 @@ static int f2fs_mpage_readpages(struct address_space *mapping, sector_t last_block; sector_t last_block_in_file; sector_t block_nr; - struct block_device *bdev = inode->i_sb->s_bdev; struct f2fs_map_blocks map; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; + map.m_next_pgofs = NULL; for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { @@ -930,7 +1066,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = last_block - block_in_file; if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_READ)) + F2FS_GET_BLOCK_READ)) goto set_error_page; } got_it: @@ -943,8 +1079,9 @@ got_it: goto confused; } } else { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); unlock_page(page); goto next_page; } @@ -955,35 +1092,15 @@ got_it: */ if (bio && (last_block_in_bio != block_nr - 1)) { submit_and_realloc: - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); bio = NULL; } if (bio == NULL) { - struct f2fs_crypto_ctx *ctx = NULL; - - if (f2fs_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { - - ctx = f2fs_get_crypto_ctx(inode); - if (IS_ERR(ctx)) - goto set_error_page; - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_encrypted_page_writeback( - F2FS_I_SB(inode), block_nr); - } - - bio = bio_alloc(GFP_KERNEL, - min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - f2fs_release_crypto_ctx(ctx); + bio = f2fs_grab_bio(inode, block_nr, nr_pages); + if (IS_ERR(bio)) { + bio = NULL; goto set_error_page; } - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(block_nr); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -993,22 +1110,22 @@ submit_and_realloc: goto next_page; set_error_page: SetPageError(page); - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); goto next_page; confused: if (bio) { - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); bio = NULL; } unlock_page(page); next_page: if (pages) - page_cache_release(page); + put_page(page); } BUG_ON(pages && !list_empty(pages)); if (bio) - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); return 0; } @@ -1055,23 +1172,33 @@ int do_write_data_page(struct f2fs_io_info *fio) if (err) return err; - fio->blk_addr = dn.data_blkaddr; + fio->old_blkaddr = dn.data_blkaddr; /* This page is already truncated */ - if (fio->blk_addr == NULL_ADDR) { + if (fio->old_blkaddr == NULL_ADDR) { ClearPageUptodate(page); goto out_writepage; } if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + gfp_t gfp_flags = GFP_NOFS; /* wait for GCed encrypted page writeback */ f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), - fio->blk_addr); - - fio->encrypted_page = f2fs_encrypt(inode, fio->page); + fio->old_blkaddr); +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + gfp_flags); if (IS_ERR(fio->encrypted_page)) { err = PTR_ERR(fio->encrypted_page); + if (err == -ENOMEM) { + /* flush pending ios and wait for a while */ + f2fs_flush_merged_bios(F2FS_I_SB(inode)); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + err = 0; + goto retry_encrypt; + } goto out_writepage; } } @@ -1082,20 +1209,19 @@ int do_write_data_page(struct f2fs_io_info *fio) * If current allocation needs SSR, * it had better in-place writes for 
updated data. */ - if (unlikely(fio->blk_addr != NEW_ADDR && + if (unlikely(fio->old_blkaddr != NEW_ADDR && !is_cold_data(page) && + !IS_ATOMIC_WRITTEN_PAGE(page) && need_inplace_update(inode))) { rewrite_data_page(fio); - set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); + set_inode_flag(inode, FI_UPDATE_WRITE); trace_f2fs_do_write_data_page(page, IPU); } else { write_data_page(&dn, fio); - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); trace_f2fs_do_write_data_page(page, OPU); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); } out_writepage: f2fs_put_dnode(&dn); @@ -1109,7 +1235,8 @@ static int f2fs_write_data_page(struct page *page, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) - >> PAGE_CACHE_SHIFT; + >> PAGE_SHIFT; + loff_t psize = (page->index + 1) << PAGE_SHIFT; unsigned offset = 0; bool need_balance_fs = false; int err = 0; @@ -1130,37 +1257,37 @@ static int f2fs_write_data_page(struct page *page, * If the offset is out-of-range of file size, * this page does not have to be written to disk. */ - offset = i_size & (PAGE_CACHE_SIZE - 1); + offset = i_size & (PAGE_SIZE - 1); if ((page->index >= end_index + 1) || !offset) goto out; - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + zero_user_segment(page, offset, PAGE_SIZE); write: if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (f2fs_is_drop_cache(inode)) goto out; - if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim && - available_free_memory(sbi, BASE_CHECK)) + /* we should not write 0'th page having journal header */ + if (f2fs_is_volatile_file(inode) && (!page->index || + (!wbc->for_reclaim && + available_free_memory(sbi, BASE_CHECK)))) goto redirty_out; + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(page->mapping, -EIO); + goto out; + } + /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; err = do_write_data_page(&fio); goto done; } - /* we should bypass data pages to proceed the kworkder jobs */ - if (unlikely(f2fs_cp_error(sbi))) { - SetPageError(page); - goto out; - } - if (!wbc->for_reclaim) need_balance_fs = true; - else if (has_not_enough_free_secs(sbi, 0)) + else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; err = -EAGAIN; @@ -1169,6 +1296,8 @@ write: err = f2fs_write_inline_data(inode, page); if (err == -EAGAIN) err = do_write_data_page(&fio); + if (F2FS_I(inode)->last_disk_size < psize) + F2FS_I(inode)->last_disk_size = psize; f2fs_unlock_op(sbi); done: if (err && err != -ENOENT) @@ -1179,25 +1308,24 @@ out: inode_dec_dirty_pages(inode); if (err) ClearPageUptodate(page); + + if (wbc->for_reclaim) { + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE); + remove_dirty_inode(inode); + } + unlock_page(page); - if (need_balance_fs) - f2fs_balance_fs(sbi); - if (wbc->for_reclaim) + f2fs_balance_fs(sbi, need_balance_fs); + + if (unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, DATA, WRITE); + return 0; redirty_out: redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; -} - -static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct address_space *mapping = data; - int ret = 
mapping->a_ops->writepage(page, wbc); - mapping_set_error(mapping, ret); - return ret; + unlock_page(page); + return err; } /* @@ -1206,8 +1334,7 @@ static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, * warm/hot data page. */ static int f2fs_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) + struct writeback_control *wbc) { int ret = 0; int done = 0; @@ -1220,10 +1347,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int cycled; int range_whole = 0; int tag; - int step = 0; + int nwritten = 0; pagevec_init(&pvec, 0); -next: + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -1233,8 +1360,8 @@ next: cycled = 0; end = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ @@ -1278,12 +1405,10 @@ continue_unlock: goto continue_unlock; } - if (step == is_cold_data(page)) - goto continue_unlock; - if (PageWriteback(page)) { if (wbc->sync_mode != WB_SYNC_NONE) - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, + DATA, true); else goto continue_unlock; } @@ -1292,16 +1417,13 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = (*writepage)(page, wbc, data); + ret = mapping->a_ops->writepage(page, wbc); if (unlikely(ret)) { - if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); - ret = 0; - } else { - done_index = page->index + 1; - done = 1; - break; - } + done_index = page->index + 1; + done = 1; + break; + } else { + nwritten++; } if (--wbc->nr_to_write <= 0 && @@ -1314,11 +1436,6 @@ continue_unlock: cond_resched(); } - if (step < 1) { - step++; - goto next; - } - if (!cycled && !done) { cycled = 1; index = 0; @@ -1328,6 +1445,10 @@ continue_unlock: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; + if (nwritten) + f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, + NULL, 0, DATA, WRITE); + return ret; } @@ -1336,11 +1457,8 @@ static int f2fs_write_data_pages(struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool locked = false; + struct blk_plug plug; int ret; - long diff; - - trace_f2fs_writepages(mapping->host, wbc, DATA); /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) @@ -1355,41 +1473,119 @@ static int f2fs_write_data_pages(struct address_space *mapping, available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; + /* skip writing during file defragment */ + if (is_inode_flag_set(inode, FI_DO_DEFRAG)) + goto skip_write; + /* during POR, we don't need to trigger writepage at all. 
*/ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; - diff = nr_pages_to_write(sbi, DATA, wbc); + trace_f2fs_writepages(mapping->host, wbc, DATA); - if (!S_ISDIR(inode->i_mode)) { - mutex_lock(&sbi->writepages); - locked = true; - } - ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); - f2fs_submit_merged_bio(sbi, DATA, WRITE); - if (locked) - mutex_unlock(&sbi->writepages); + blk_start_plug(&plug); + ret = f2fs_write_cache_pages(mapping, wbc); + blk_finish_plug(&plug); + /* + * if some pages were truncated, we cannot guarantee its mapping->host + * to detect pending bios. + */ - remove_dirty_dir_inode(inode); - - wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); + remove_dirty_inode(inode); return ret; skip_write: wbc->pages_skipped += get_dirty_pages(inode); + trace_f2fs_writepages(mapping->host, wbc, DATA); return 0; } static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; + loff_t i_size = i_size_read(inode); - if (to > inode->i_size) { - truncate_pagecache(inode, inode->i_size); - truncate_blocks(inode, inode->i_size, true); + if (to > i_size) { + truncate_pagecache(inode, i_size); + truncate_blocks(inode, i_size, true); } } +static int prepare_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + pgoff_t index = page->index; + struct dnode_of_data dn; + struct page *ipage; + bool locked = false; + struct extent_info ei; + int err = 0; + + /* + * we already allocated all the blocks, so we don't need to get + * the block addresses when there is no need to fill the page. + */ + if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE) + return 0; + + if (f2fs_has_inline_data(inode) || + (pos & PAGE_MASK) >= i_size_read(inode)) { + f2fs_lock_op(sbi); + locked = true; + } +restart: + /* check inline_data */ + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_has_inline_data(inode)) { + if (pos + len <= MAX_INLINE_DATA) { + read_inline_data(page, ipage); + set_inode_flag(inode, FI_DATA_EXIST); + if (inode->i_nlink) + set_inline_node(ipage); + } else { + err = f2fs_convert_inline_page(&dn, page); + if (err) + goto out; + if (dn.data_blkaddr == NULL_ADDR) + err = f2fs_get_block(&dn, index); + } + } else if (locked) { + err = f2fs_get_block(&dn, index); + } else { + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + /* hole case */ + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err || dn.data_blkaddr == NULL_ADDR) { + f2fs_put_dnode(&dn); + f2fs_lock_op(sbi); + locked = true; + goto restart; + } + } + } + + /* convert_inline_page can make node_changed */ + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; +out: + f2fs_put_dnode(&dn); +unlock_out: + if (locked) + f2fs_unlock_op(sbi); + return err; +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1397,9 +1593,9 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; - struct page *ipage; - pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; - struct dnode_of_data dn; + 
pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; + bool need_balance = false; + block_t blkaddr = NULL_ADDR; int err = 0; if (trace_android_fs_datawrite_start_enabled()) { @@ -1414,8 +1610,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, } trace_f2fs_write_begin(inode, pos, len, flags); - f2fs_balance_fs(sbi); - /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: @@ -1435,98 +1629,63 @@ repeat: *pagep = page; - f2fs_lock_op(sbi); - - /* check inline_data */ - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); - goto unlock_fail; - } - - set_new_dnode(&dn, inode, ipage, ipage, 0); - - if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { - read_inline_data(page, ipage); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - sync_inode_page(&dn); - goto put_next; - } - err = f2fs_convert_inline_page(&dn, page); - if (err) - goto put_fail; - } - - err = f2fs_get_block(&dn, index); + err = prepare_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); if (err) - goto put_fail; -put_next: - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); + goto fail; - f2fs_wait_on_page_writeback(page, DATA); + if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) { + unlock_page(page); + f2fs_balance_fs(sbi, true); + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + f2fs_put_page(page, 1); + goto repeat; + } + } + + f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); - if (len == PAGE_CACHE_SIZE) - goto out_update; - if (PageUptodate(page)) - goto out_clear; + if (len == PAGE_SIZE || PageUptodate(page)) + return 0; - if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { - unsigned start = pos & (PAGE_CACHE_SIZE - 1); - unsigned end = start + len; - - /* Reading beyond i_size is simple: memset to zero */ - zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); - goto out_update; - } - - if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + if (blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); } else { - struct f2fs_io_info fio = { - .sbi = sbi, - .type = DATA, - .rw = READ_SYNC, - .blk_addr = dn.data_blkaddr, - .page = page, - .encrypted_page = NULL, - }; - err = f2fs_submit_page_bio(&fio); - if (err) - goto fail; + struct bio *bio; - lock_page(page); - if (unlikely(!PageUptodate(page))) { - err = -EIO; + bio = f2fs_grab_bio(inode, blkaddr, 1); + if (IS_ERR(bio)) { + err = PTR_ERR(bio); goto fail; } + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + err = -EFAULT; + goto fail; + } + + __submit_bio(sbi, READ_SYNC, bio, DATA); + + lock_page(page); if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } - - /* avoid symlink page */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - err = f2fs_decrypt_one(inode, page); - if (err) - goto fail; + if (unlikely(!PageUptodate(page))) { + err = -EIO; + goto fail; } } -out_update: - SetPageUptodate(page); -out_clear: - clear_cold_data(page); return 0; -put_fail: - f2fs_put_dnode(&dn); -unlock_fail: - f2fs_unlock_op(sbi); fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); 
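
Editor's aside (illustrative, not part of the patch): in the rewritten f2fs_write_begin() above, f2fs_balance_fs() may kick garbage collection and sleep, so it is no longer called with the target page locked. The hunk drops the page lock, balances, relocks, and re-validates page->mapping, since truncate may have removed the page in between. The same idiom pulled out as a standalone sketch; the helper name is hypothetical:

	/*
	 * Sketch only: the unlock/balance/relock idiom used by
	 * f2fs_write_begin() above. Returns true when the page was
	 * truncated while unlocked and the caller must re-grab it
	 * (the "goto repeat" path in the real code).
	 */
	static bool f2fs_balance_under_page_lock(struct f2fs_sb_info *sbi,
						struct address_space *mapping,
						struct page *page)
	{
		unlock_page(page);
		f2fs_balance_fs(sbi, true);	/* may run GC and sleep */
		lock_page(page);

		/* truncate can detach the page while it was unlocked */
		if (page->mapping != mapping) {
			f2fs_put_page(page, 1);
			return true;
		}
		return false;
	}
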
@@ -1543,15 +1702,28 @@ static int f2fs_write_end(struct file *file,
 	trace_android_fs_datawrite_end(inode, pos, len);
 	trace_f2fs_write_end(inode, pos, len, copied);
 
-	set_page_dirty(page);
-
-	if (pos + copied > i_size_read(inode)) {
-		i_size_write(inode, pos + copied);
-		mark_inode_dirty(inode);
-		update_inode_page(inode);
+	/*
+	 * This should be come from len == PAGE_SIZE, and we expect copied
+	 * should be PAGE_SIZE. Otherwise, we treat it with zero copied and
+	 * let generic_perform_write() try to copy data again through copied=0.
+	 */
+	if (!PageUptodate(page)) {
+		if (unlikely(copied != PAGE_SIZE))
+			copied = 0;
+		else
+			SetPageUptodate(page);
 	}
+	if (!copied)
+		goto unlock_out;
 
+	set_page_dirty(page);
+	clear_cold_data(page);
+
+	if (pos + copied > i_size_read(inode))
+		f2fs_i_size_write(inode, pos + copied);
+unlock_out:
 	f2fs_put_page(page, 1);
+	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 	return copied;
 }
 
@@ -1570,29 +1742,22 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
 }
 
 static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-							loff_t offset)
+				loff_t offset)
 {
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
 	struct inode *inode = mapping->host;
 	size_t count = iov_iter_count(iter);
+	int rw = iov_iter_rw(iter);
 	int err;
 
-	/* we don't need to use inline_data strictly */
-	if (f2fs_has_inline_data(inode)) {
-		err = f2fs_convert_inline_inode(inode);
-		if (err)
-			return err;
-	}
-
-	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
-		return 0;
-
 	err = check_direct_IO(inode, iter, offset);
 	if (err)
 		return err;
 
-	trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
+	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+		return 0;
+	if (test_opt(F2FS_I_SB(inode), LFS))
+		return 0;
 
 	if (trace_android_fs_dataread_start_enabled() &&
 	    (iov_iter_rw(iter) == READ)) {
@@ -1616,18 +1781,18 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 					current->pid, path, current->comm);
 	}
 
-	if (iov_iter_rw(iter) == WRITE) {
-		__allocate_data_blocks(inode, offset, count);
-		if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) {
-			err = -EIO;
-			goto out;
-		}
-	}
+	trace_f2fs_direct_IO_enter(inode, offset, count, rw);
 
+	down_read(&F2FS_I(inode)->dio_rwsem[rw]);
 	err = blockdev_direct_IO(iocb, inode, iter, offset,
 					get_data_block_dio);
-out:
-	if (err < 0 && iov_iter_rw(iter) == WRITE)
-		f2fs_write_failed(mapping, offset + count);
+	up_read(&F2FS_I(inode)->dio_rwsem[rw]);
+
+	if (rw == WRITE) {
+		if (err > 0)
+			set_inode_flag(inode, FI_UPDATE_WRITE);
+		else if (err < 0)
+			f2fs_write_failed(mapping, offset + count);
+	}
 
 	if (trace_android_fs_dataread_start_enabled() &&
 	    (iov_iter_rw(iter) == READ))
@@ -1636,7 +1801,7 @@ out:
 		(iov_iter_rw(iter) == WRITE))
 		trace_android_fs_datawrite_end(inode, offset, count);
 
-	trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err);
+	trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
 
 	return err;
 }
@@ -1648,7 +1813,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
 	if (inode->i_ino >= F2FS_ROOT_INO(sbi) &&
-		(offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE))
+		(offset % PAGE_SIZE || length != PAGE_SIZE))
 		return;
 
 	if (PageDirty(page)) {
@@ -1664,6 +1829,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
 	if (IS_ATOMIC_WRITTEN_PAGE(page))
 		return;
 
+	set_page_private(page, 0);
ClearPagePrivate(page); } @@ -1677,10 +1843,42 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (IS_ATOMIC_WRITTEN_PAGE(page)) return 0; + set_page_private(page, 0); ClearPagePrivate(page); return 1; } +/* + * This was copied from __set_page_dirty_buffers which gives higher performance + * in very high speed storages. (e.g., pmem) + */ +void f2fs_set_page_dirty_nobuffers(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct mem_cgroup *memcg; + unsigned long flags; + + if (unlikely(!mapping)) + return; + + spin_lock(&mapping->private_lock); + memcg = mem_cgroup_begin_page_stat(page); + SetPageDirty(page); + spin_unlock(&mapping->private_lock); + + spin_lock_irqsave(&mapping->tree_lock, flags); + WARN_ON_ONCE(!PageUptodate(page)); + account_page_dirtied(page, mapping, memcg); + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + + mem_cgroup_end_page_stat(memcg); + + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + return; +} + static int f2fs_set_data_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; @@ -1688,7 +1886,8 @@ static int f2fs_set_data_page_dirty(struct page *page) trace_f2fs_set_page_dirty(page, DATA); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (f2fs_is_atomic_file(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { @@ -1703,7 +1902,7 @@ static int f2fs_set_data_page_dirty(struct page *page) } if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); update_dirty_page(inode, page); return 1; } @@ -1724,6 +1923,58 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, get_data_block_bmap); } +#ifdef CONFIG_MIGRATION +#include + +int f2fs_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + int rc, extra_count; + struct f2fs_inode_info *fi = F2FS_I(mapping->host); + bool atomic_written = IS_ATOMIC_WRITTEN_PAGE(page); + + BUG_ON(PageWriteback(page)); + + /* migrating an atomic written page is safe with the inmem_lock hold */ + if (atomic_written && !mutex_trylock(&fi->inmem_lock)) + return -EAGAIN; + + /* + * A reference is expected if PagePrivate set when move mapping, + * however F2FS breaks this for maintaining dirty page counts when + * truncating pages. So here adjusting the 'extra_count' make it work. + */ + extra_count = (atomic_written ? 
1 : 0) - page_has_private(page); + rc = migrate_page_move_mapping(mapping, newpage, + page, NULL, mode, extra_count); + if (rc != MIGRATEPAGE_SUCCESS) { + if (atomic_written) + mutex_unlock(&fi->inmem_lock); + return rc; + } + + if (atomic_written) { + struct inmem_pages *cur; + list_for_each_entry(cur, &fi->inmem_pages, list) + if (cur->page == page) { + cur->page = newpage; + break; + } + mutex_unlock(&fi->inmem_lock); + put_page(page); + get_page(newpage); + } + + if (PagePrivate(page)) + SetPagePrivate(newpage); + set_page_private(newpage, page_private(page)); + + migrate_page_copy(newpage, page); + + return MIGRATEPAGE_SUCCESS; +} +#endif + const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, .readpages = f2fs_read_data_pages, @@ -1736,4 +1987,7 @@ const struct address_space_operations f2fs_dblock_aops = { .releasepage = f2fs_release_page, .direct_IO = f2fs_direct_IO, .bmap = f2fs_bmap, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 24d6a51b48d1..1c35e80732e0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -38,23 +38,30 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; si->total_ext = atomic64_read(&sbi->total_hit_ext); - si->ext_tree = sbi->total_ext_tree; + si->ext_tree = atomic_read(&sbi->total_ext_tree); + si->zombie_tree = atomic_read(&sbi->total_zombie_tree); si->ext_node = atomic_read(&sbi->total_ext_node); si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); - si->ndirty_dirs = sbi->n_dirty_dirs; si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); + si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); + si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; + si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_pages = get_pages(sbi, F2FS_WRITEBACK); + si->wb_bios = atomic_read(&sbi->nr_wb_bios); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); si->valid_count = valid_user_blocks(sbi); + si->discard_blks = discard_blocks(sbi); si->valid_node_count = valid_node_count(sbi); si->valid_inode_count = valid_inode_count(sbi); si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->orphans = sbi->im[ORPHAN_INO].ino_num; si->utilization = utilization(sbi); si->free_segs = free_segments(sbi); @@ -105,7 +112,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); + blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); @@ -140,6 +147,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); + si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; /* build sm */ si->base_mem += sizeof(struct f2fs_sm_info); @@ 
-148,7 +156,9 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct sit_info); si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); - si->base_mem += 3 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + if (f2fs_discard_en(sbi)) + si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); si->base_mem += SIT_VBLOCK_MAP_SIZE; if (sbi->segs_per_sec > 1) si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); @@ -161,7 +171,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build curseg */ si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; - si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE; + si->base_mem += PAGE_SIZE * NR_CURSEG_TYPE; /* build dirty segmap */ si->base_mem += sizeof(struct dirty_seglist_info); @@ -189,18 +199,18 @@ get_cache: si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); - si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry); - for (i = 0; i <= UPDATE_INO; i++) + for (i = 0; i <= ORPHAN_INO; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); - si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree); + si->cache_mem += atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree); si->cache_mem += atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node); si->page_mem = 0; npages = NODE_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; npages = META_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } static int stat_show(struct seq_file *s, void *v) @@ -211,20 +221,24 @@ static int stat_show(struct seq_file *s, void *v) mutex_lock(&f2fs_stat_mutex); list_for_each_entry(si, &f2fs_stat_list, stat_list) { - char devname[BDEVNAME_SIZE]; - update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n", - bdevname(si->sbi->sb->s_bdev, devname), i++); + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n", + si->sbi->sb->s_bdev, i++, + f2fs_readonly(si->sbi->sb) ? 
"RO": "RW"); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", si->ssa_area_segs, si->main_area_segs); seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", si->overp_segs, si->rsvd_segs); - seq_printf(s, "Utilization: %d%% (%d valid blocks)\n", - si->utilization, si->valid_count); + if (test_opt(si->sbi, DISCARD)) + seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n", + si->utilization, si->valid_count, si->discard_blks); + else + seq_printf(s, "Utilization: %u%% (%u valid blocks)\n", + si->utilization, si->valid_count); + seq_printf(s, " - Node: %u (Inode: %u, ", si->valid_node_count, si->valid_inode_count); seq_printf(s, "Other: %u)\n - Data: %u\n", @@ -236,6 +250,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); + seq_printf(s, " - Orphan Inode: %u\n", + si->orphans); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -269,7 +285,8 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); - seq_printf(s, "CP calls: %d\n", si->cp_count); + seq_printf(s, "CP calls: %d (BG: %d)\n", + si->cp_count, si->bg_cp_count); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", @@ -290,17 +307,21 @@ static int stat_show(struct seq_file *s, void *v) !si->total_ext ? 0 : div64_u64(si->hit_total * 100, si->total_ext), si->hit_total, si->total_ext); - seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n", - si->ext_tree, si->ext_node); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb: %4d\n", - si->inmem_pages, si->wb_pages); - seq_printf(s, " - nodes: %4d in %4d\n", + seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n", + si->inmem_pages, si->wb_bios); + seq_printf(s, " - nodes: %4lld in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents: %4d in dirs:%4d\n", - si->ndirty_dent, si->ndirty_dirs); - seq_printf(s, " - meta: %4d in %4d\n", + seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n", + si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); + seq_printf(s, " - datas: %4lld in files:%4d\n", + si->ndirty_data, si->ndirty_files); + seq_printf(s, " - meta: %4lld in %4d\n", si->ndirty_meta, si->meta_pages); + seq_printf(s, " - imeta: %4lld\n", + si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); seq_printf(s, " - free_nids: %9d\n", @@ -407,20 +428,23 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) kfree(si); } -void __init f2fs_create_root_stats(void) +int __init f2fs_create_root_stats(void) { struct dentry *file; f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); if (!f2fs_debugfs_root) - return; + return -ENOMEM; file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, &stat_fops); if (!file) { debugfs_remove(f2fs_debugfs_root); f2fs_debugfs_root = NULL; + return -ENOMEM; } + + return 0; } void f2fs_destroy_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 60972a559685..e634a637c443 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -17,8 +17,8 @@ static unsigned long dir_blocks(struct inode *inode) { - return 
((unsigned long long) (i_size_read(inode) + PAGE_CACHE_SIZE - 1)) - >> PAGE_CACHE_SHIFT; + return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1)) + >> PAGE_SHIFT; } static unsigned int dir_buckets(unsigned int level, int dir_level) @@ -37,7 +37,7 @@ static unsigned int bucket_blocks(unsigned int level) return 4; } -unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { +static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { [F2FS_FT_UNKNOWN] = DT_UNKNOWN, [F2FS_FT_REG_FILE] = DT_REG, [F2FS_FT_DIR] = DT_DIR, @@ -48,7 +48,6 @@ unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { [F2FS_FT_SYMLINK] = DT_LNK, }; -#define S_SHIFT 12 static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR, @@ -64,6 +63,13 @@ void set_de_type(struct f2fs_dir_entry *de, umode_t mode) de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } +unsigned char get_de_type(struct f2fs_dir_entry *de) +{ + if (de->file_type < F2FS_FT_MAX) + return f2fs_filetype_table[de->file_type]; + return DT_UNKNOWN; +} + static unsigned long dir_block_index(unsigned int level, int dir_level, unsigned int idx) { @@ -77,7 +83,7 @@ static unsigned long dir_block_index(unsigned int level, } static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, - struct f2fs_filename *fname, + struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct page **res_page) @@ -95,23 +101,18 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, else kunmap(dentry_page); - /* - * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs has occurred. - */ - f2fs_bug_on(F2FS_P_SB(dentry_page), d.max < 0); return de; } -struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct f2fs_dentry_ptr *d) { struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; - struct f2fs_str de_name = FSTR_INIT(NULL, 0); - struct f2fs_str *name = &fname->disk_name; + struct fscrypt_str de_name = FSTR_INIT(NULL, 0); + struct fscrypt_str *name = &fname->disk_name; if (max_slots) *max_slots = 0; @@ -124,37 +125,28 @@ struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, de = &d->dentry[bit_pos]; - if (de->hash_code != namehash) - goto not_match; + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + /* encrypted case */ de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); -#ifdef CONFIG_F2FS_FS_ENCRYPTION - if (unlikely(!name->name)) { - if (fname->usr_fname->name[0] == '_') { - if (de_name.len > 32 && - !memcmp(de_name.name + ((de_name.len - 17) & ~15), - fname->crypto_buf.name + 8, 16)) - goto found; - goto not_match; - } - name->name = fname->crypto_buf.name; - name->len = fname->crypto_buf.len; - } -#endif - if (de_name.len == name->len && - !memcmp(de_name.name, name->name, name->len)) + /* show encrypted name */ + if (fname->hash) { + if (de->hash_code == fname->hash) + goto found; + } else if (de_name.len == name->len && + de->hash_code == namehash && + !memcmp(de_name.name, name->name, name->len)) goto found; -not_match: + if (max_slots && max_len > *max_slots) *max_slots = max_len; max_len = 0; - /* remain bug on condition */ - if (unlikely(!de->name_len)) - d->max = -1; - bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } @@ -167,7 +159,7 @@ found: static struct f2fs_dir_entry 
*find_in_level(struct inode *dir, unsigned int level, - struct f2fs_filename *fname, + struct fscrypt_name *fname, struct page **res_page) { struct qstr name = FSTR_TO_QSTR(&fname->disk_name); @@ -180,9 +172,10 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, int max_slots; f2fs_hash_t namehash; - namehash = f2fs_dentry_hash(&name, fname); - - f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH); + if(fname->hash) + namehash = cpu_to_le32(fname->hash); + else + namehash = f2fs_dentry_hash(&name); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -195,8 +188,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, /* no need to allocate new dentry pages to all the indices */ dentry_page = find_data_page(dir, bidx); if (IS_ERR(dentry_page)) { - room = true; - continue; + if (PTR_ERR(dentry_page) == -ENOENT) { + room = true; + continue; + } else { + *res_page = dentry_page; + break; + } } de = find_in_block(dentry_page, fname, namehash, &max_slots, @@ -217,6 +215,44 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, return de; } +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page) +{ + unsigned long npages = dir_blocks(dir); + struct f2fs_dir_entry *de = NULL; + unsigned int max_depth; + unsigned int level; + + if (f2fs_has_inline_dentry(dir)) { + *res_page = NULL; + de = find_in_inline_dir(dir, fname, res_page); + goto out; + } + + if (npages == 0) { + *res_page = NULL; + goto out; + } + + max_depth = F2FS_I(dir)->i_current_depth; + if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { + f2fs_msg(F2FS_I_SB(dir)->sb, KERN_WARNING, + "Corrupted max_depth of %lu: %u", + dir->i_ino, max_depth); + max_depth = MAX_DIR_HASH_DEPTH; + f2fs_i_depth_write(dir, max_depth); + } + + for (level = 0; level < max_depth; level++) { + *res_page = NULL; + de = find_in_level(dir, level, fname, res_page); + if (de || IS_ERR(*res_page)) + break; + } +out: + return de; +} + /* * Find an entry in the specified directory with the wanted name. * It returns the page where the entry was found (as a parameter - res_page), @@ -224,72 +260,42 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, * Entry is guaranteed to be valid. 
*/ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, - struct qstr *child, struct page **res_page) + const struct qstr *child, struct page **res_page) { - unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; - unsigned int max_depth; - unsigned int level; - struct f2fs_filename fname; + struct fscrypt_name fname; int err; - *res_page = NULL; - - err = f2fs_fname_setup_filename(dir, child, 1, &fname); - if (err) + err = fscrypt_setup_filename(dir, child, 1, &fname); + if (err) { + *res_page = ERR_PTR(err); return NULL; - - if (f2fs_has_inline_dentry(dir)) { - de = find_in_inline_dir(dir, &fname, res_page); - goto out; } - if (npages == 0) - goto out; + de = __f2fs_find_entry(dir, &fname, res_page); - max_depth = F2FS_I(dir)->i_current_depth; - - for (level = 0; level < max_depth; level++) { - de = find_in_level(dir, level, &fname, res_page); - if (de) - break; - } -out: - f2fs_fname_free_filename(&fname); + fscrypt_free_filename(&fname); return de; } struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) { - struct page *page; - struct f2fs_dir_entry *de; - struct f2fs_dentry_block *dentry_blk; + struct qstr dotdot = QSTR_INIT("..", 2); - if (f2fs_has_inline_dentry(dir)) - return f2fs_parent_inline_dir(dir, p); - - page = get_lock_data_page(dir, 0, false); - if (IS_ERR(page)) - return NULL; - - dentry_blk = kmap(page); - de = &dentry_blk->dentry[1]; - *p = page; - unlock_page(page); - return de; + return f2fs_find_entry(dir, &dotdot, p); } -ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr) +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, + struct page **page) { ino_t res = 0; struct f2fs_dir_entry *de; - struct page *page; - de = f2fs_find_entry(dir, qstr, &page); + de = f2fs_find_entry(dir, qstr, page); if (de) { res = le32_to_cpu(de->ino); - f2fs_dentry_kunmap(dir, page); - f2fs_put_page(page, 0); + f2fs_dentry_kunmap(dir, *page); + f2fs_put_page(*page, 0); } return res; @@ -300,14 +306,14 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, { enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; lock_page(page); - f2fs_wait_on_page_writeback(page, type); + f2fs_wait_on_page_writeback(page, type, true); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode->i_mode); f2fs_dentry_kunmap(dir, page); set_page_dirty(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; - mark_inode_dirty(dir); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); f2fs_put_page(page, 1); } @@ -315,7 +321,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) { struct f2fs_inode *ri; - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); /* copy name info. to this inode page */ ri = F2FS_INODE(ipage); @@ -345,24 +351,14 @@ int update_dent_inode(struct inode *inode, struct inode *to, void do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { - struct f2fs_dir_entry *de; + struct qstr dot = QSTR_INIT(".", 1); + struct qstr dotdot = QSTR_INIT("..", 2); - de = &d->dentry[0]; - de->name_len = cpu_to_le16(1); - de->hash_code = 0; - de->ino = cpu_to_le32(inode->i_ino); - memcpy(d->filename[0], ".", 1); - set_de_type(de, inode->i_mode); + /* update dirent of "." 
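
/*
 * Note the error-path change above: f2fs_find_entry() now reports the
 * errno through *res_page as an ERR_PTR instead of silently returning
 * NULL. A self-contained model of that pointer encoding (the kernel's
 * real macros live in <linux/err.h>; lookup() here is hypothetical):
 */

#include <stdio.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* small negative errnos land in the top page of the address space */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *lookup(int fail)
{
	static int obj = 42;

	return fail ? ERR_PTR(-2 /* -ENOENT */) : &obj;
}

int main(void)
{
	void *p = lookup(1);

	if (IS_ERR(p))
		printf("lookup failed: %ld\n", PTR_ERR(p));
	return 0;
}
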
*/ + f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0); - de = &d->dentry[1]; - de->hash_code = 0; - de->name_len = cpu_to_le16(2); - de->ino = cpu_to_le32(parent->i_ino); - memcpy(d->filename[1], "..", 2); - set_de_type(de, parent->i_mode); - - test_and_set_bit_le(0, (void *)d->bitmap); - test_and_set_bit_le(1, (void *)d->bitmap); + /* update dirent of ".." */ + f2fs_update_dentry(parent->i_ino, parent->i_mode, d, &dotdot, 0, 1); } static int make_empty_dir(struct inode *inode, @@ -392,32 +388,38 @@ static int make_empty_dir(struct inode *inode, } struct page *init_inode_metadata(struct inode *inode, struct inode *dir, - const struct qstr *name, struct page *dpage) + const struct qstr *new_name, const struct qstr *orig_name, + struct page *dpage) { struct page *page; int err; - if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { + if (is_inode_flag_set(inode, FI_NEW_INODE)) { page = new_inode_page(inode); if (IS_ERR(page)) return page; if (S_ISDIR(inode->i_mode)) { + /* in order to handle error case */ + get_page(page); err = make_empty_dir(inode, dir, page); - if (err) - goto error; + if (err) { + lock_page(page); + goto put_error; + } + put_page(page); } err = f2fs_init_acl(inode, dir, page, dpage); if (err) goto put_error; - err = f2fs_init_security(inode, dir, name, page); + err = f2fs_init_security(inode, dir, orig_name, page); if (err) goto put_error; if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) { - err = f2fs_inherit_context(dir, inode, page); + err = fscrypt_inherit_context(dir, inode, page, false); if (err) goto put_error; } @@ -429,14 +431,14 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, set_cold_node(inode, page); } - if (name) - init_dent_inode(name, page); + if (new_name) + init_dent_inode(new_name, page); /* * This file should be checkpointed during fsync. * We lost i_pino from now on. 
*/ - if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { + if (is_inode_flag_set(inode, FI_INC_LINK)) { file_lost_pino(inode); /* * If link the tmpfile to alias through linkat path, @@ -444,41 +446,33 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, */ if (inode->i_nlink == 0) remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); - inc_nlink(inode); + f2fs_i_links_write(inode, true); } return page; put_error: + clear_nlink(inode); + update_inode(inode, page); f2fs_put_page(page, 1); -error: - /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ - truncate_inode_pages(&inode->i_data, 0); - truncate_blocks(inode, 0, false); - remove_dirty_dir_inode(inode); - remove_inode_page(inode); return ERR_PTR(err); } void update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { - if (inode && is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { - if (S_ISDIR(inode->i_mode)) { - inc_nlink(dir); - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } - clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); + if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) { + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, true); + clear_inode_flag(inode, FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = CURRENT_TIME; - mark_inode_dirty(dir); + f2fs_mark_inode_dirty_sync(dir); - if (F2FS_I(dir)->i_current_depth != current_depth) { - F2FS_I(dir)->i_current_depth = current_depth; - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } + if (F2FS_I(dir)->i_current_depth != current_depth) + f2fs_i_depth_write(dir, current_depth); - if (inode && is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + if (inode && is_inode_flag_set(inode, FI_INC_LINK)) + clear_inode_flag(inode, FI_INC_LINK); } int room_for_filename(const void *bitmap, int slots, int max_slots) @@ -515,15 +509,16 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, memcpy(d->filename[bit_pos], name->name, name->len); de->ino = cpu_to_le32(ino); set_de_type(de, mode); - for (i = 0; i < slots; i++) - test_and_set_bit_le(bit_pos + i, (void *)d->bitmap); + for (i = 0; i < slots; i++) { + __set_bit_le(bit_pos + i, (void *)d->bitmap); + /* avoid wrong garbage data for readdir */ + if (i) + (de + i)->name_len = 0; + } } -/* - * Caller should grab and release a rwsem by calling f2fs_lock_op() and - * f2fs_unlock_op(). 
- */ -int __f2fs_add_link(struct inode *dir, const struct qstr *name, +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, struct inode *inode, nid_t ino, umode_t mode) { unsigned int bit_pos; @@ -536,28 +531,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dentry_ptr d; struct page *page = NULL; - struct f2fs_filename fname; - struct qstr new_name; - int slots, err; - - err = f2fs_fname_setup_filename(dir, name, 0, &fname); - if (err) - return err; - - new_name.name = fname_name(&fname); - new_name.len = fname_len(&fname); - - if (f2fs_has_inline_dentry(dir)) { - err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode); - if (!err || err != -EAGAIN) - goto out; - else - err = 0; - } + int slots, err = 0; level = 0; - slots = GET_DENTRY_SLOTS(new_name.len); - dentry_hash = f2fs_dentry_hash(&new_name, NULL); + slots = GET_DENTRY_SLOTS(new_name->len); + dentry_hash = f2fs_dentry_hash(new_name); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -566,10 +544,12 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, } start: - if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) { - err = -ENOSPC; - goto out; - } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) + return -ENOSPC; +#endif + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) + return -ENOSPC; /* Increase the depth, if required */ if (level == current_depth) @@ -583,10 +563,8 @@ start: for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = get_new_data_page(dir, NULL, block, true); - if (IS_ERR(dentry_page)) { - err = PTR_ERR(dentry_page); - goto out; - } + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); dentry_blk = kmap(dentry_page); bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, @@ -602,11 +580,12 @@ start: ++level; goto start; add_dentry: - f2fs_wait_on_page_writeback(dentry_page, DATA); + f2fs_wait_on_page_writeback(dentry_page, DATA, true); if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, &new_name, NULL); + page = init_inode_metadata(inode, dir, new_name, + orig_name, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -616,14 +595,12 @@ add_dentry: } make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); - f2fs_update_dentry(ino, mode, &d, &new_name, dentry_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); if (inode) { - /* we don't need to mark_inode_dirty now */ - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); + f2fs_i_pino_write(inode, dir->i_ino); f2fs_put_page(page, 1); } @@ -632,14 +609,49 @@ fail: if (inode) up_write(&F2FS_I(inode)->i_sem); - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { - update_inode_page(dir); - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); -out: - f2fs_fname_free_filename(&fname); + + return err; +} + +int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct qstr new_name; + int err = -EAGAIN; + + new_name.name = fname_name(fname); + new_name.len = fname_len(fname); + + if (f2fs_has_inline_dentry(dir)) + err = f2fs_add_inline_entry(dir, &new_name, fname->usr_fname, + inode, ino, mode); + if (err == -EAGAIN) + err = f2fs_add_regular_entry(dir, &new_name, fname->usr_fname, + 
inode, ino, mode); + + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + return err; +} + +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + */ +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct fscrypt_name fname; + int err; + + err = fscrypt_setup_filename(dir, name, 0, &fname); + if (err) + return err; + + err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); + + fscrypt_free_filename(&fname); return err; } @@ -649,46 +661,39 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) int err = 0; down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, NULL, NULL); + page = init_inode_metadata(inode, dir, NULL, NULL, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; } - /* we don't need to mark_inode_dirty now */ - update_inode(inode, page); f2fs_put_page(page, 1); - clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); + clear_inode_flag(inode, FI_NEW_INODE); fail: up_write(&F2FS_I(inode)->i_sem); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return err; } -void f2fs_drop_nlink(struct inode *dir, struct inode *inode, struct page *page) +void f2fs_drop_nlink(struct inode *dir, struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); down_write(&F2FS_I(inode)->i_sem); - if (S_ISDIR(inode->i_mode)) { - drop_nlink(dir); - if (page) - update_inode(dir, page); - else - update_inode_page(dir); - } + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, false); inode->i_ctime = CURRENT_TIME; - drop_nlink(inode); + f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { - drop_nlink(inode); - i_size_write(inode, 0); + f2fs_i_links_write(inode, false); + f2fs_i_size_write(inode, 0); } up_write(&F2FS_I(inode)->i_sem); - update_inode_page(inode); if (inode->i_nlink == 0) - add_orphan_inode(sbi, inode->i_ino); + add_orphan_inode(inode); else release_orphan_inode(sbi); } @@ -705,11 +710,13 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); int i; + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); lock_page(page); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); dentry_blk = page_address(page); bit_pos = dentry - dentry_blk->dentry; @@ -724,9 +731,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); dir->i_ctime = dir->i_mtime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); if (inode) - f2fs_drop_nlink(dir, inode, NULL); + f2fs_drop_nlink(dir, inode); if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { @@ -777,12 +785,12 @@ bool f2fs_empty_dir(struct inode *dir) } bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, - unsigned int start_pos, struct f2fs_str *fstr) + unsigned int start_pos, struct fscrypt_str *fstr) { unsigned char d_type = DT_UNKNOWN; unsigned int bit_pos; struct f2fs_dir_entry *de = NULL; - struct f2fs_str de_name = FSTR_INIT(NULL, 0); + struct fscrypt_str de_name = FSTR_INIT(NULL, 0); bit_pos = ((unsigned long)ctx->pos % d->max); @@ -792,10 +800,13 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, break; de = &d->dentry[bit_pos]; - if (de->file_type < F2FS_FT_MAX) - d_type = f2fs_filetype_table[de->file_type]; - else - d_type = DT_UNKNOWN; + if (de->name_len == 0) { 
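
/*
 * __f2fs_do_add_link() above tries the inline directory area first and
 * falls back to a regular dentry block when it reports -EAGAIN. The
 * control flow in isolation; add_inline() and add_regular() are
 * hypothetical stand-ins for the two f2fs paths:
 */

#include <stdio.h>

#define EAGAIN 11

static int add_inline(int room) { return room ? 0 : -EAGAIN; }
static int add_regular(void) { return 0; }

static int do_add_link(int has_inline, int inline_room)
{
	int err = -EAGAIN;

	if (has_inline)
		err = add_inline(inline_room);	/* "no space here" => -EAGAIN */
	if (err == -EAGAIN)
		err = add_regular();		/* fall back to block form */
	return err;
}

int main(void)
{
	printf("%d %d\n", do_add_link(1, 1), do_add_link(1, 0));
	return 0;
}
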
+ bit_pos++; + ctx->pos = start_pos + bit_pos; + continue; + } + + d_type = get_de_type(de); de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -804,15 +815,9 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, int save_len = fstr->len; int ret; - de_name.name = kmalloc(de_name.len, GFP_NOFS); - if (!de_name.name) - return false; - - memcpy(de_name.name, d->filename[bit_pos], de_name.len); - - ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code, - &de_name, fstr); - kfree(de_name.name); + ret = fscrypt_fname_disk_to_usr(d->inode, + (u32)de->hash_code, 0, + &de_name, fstr); if (ret < 0) return true; @@ -839,16 +844,15 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) struct file_ra_state *ra = &file->f_ra; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); struct f2fs_dentry_ptr d; - struct f2fs_str fstr = FSTR_INIT(NULL, 0); + struct fscrypt_str fstr = FSTR_INIT(NULL, 0); int err = 0; if (f2fs_encrypted_inode(inode)) { - err = f2fs_get_encryption_info(inode); - if (err) + err = fscrypt_get_encryption_info(inode); + if (err && err != -ENOKEY) return err; - err = f2fs_fname_crypto_alloc_buffer(inode, F2FS_NAME_LEN, - &fstr); + err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr); if (err < 0) return err; } @@ -865,36 +869,47 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) for (; n < npages; n++) { dentry_page = get_lock_data_page(inode, n, false); - if (IS_ERR(dentry_page)) - continue; + if (IS_ERR(dentry_page)) { + err = PTR_ERR(dentry_page); + if (err == -ENOENT) + continue; + else + goto out; + } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) - goto stop; + if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + kunmap(dentry_page); + f2fs_put_page(dentry_page, 1); + break; + } ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); - dentry_page = NULL; - } -stop: - if (dentry_page && !IS_ERR(dentry_page)) { - kunmap(dentry_page); - f2fs_put_page(dentry_page, 1); } + err = 0; out: - f2fs_fname_crypto_free_buffer(&fstr); + fscrypt_fname_free_buffer(&fstr); return err; } +static int f2fs_dir_open(struct inode *inode, struct file *filp) +{ + if (f2fs_encrypted_inode(inode)) + return fscrypt_get_encryption_info(inode) ? 
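
/*
 * The comment in __release_extent_node() above fixes a strict order:
 * unlink from the global LRU under the spinlock, then detach from the
 * per-inode tree, then free. A userspace model of why the LRU unlink
 * must come first (pthreads; the list here is a bare self-linked node):
 */

#include <pthread.h>
#include <stdlib.h>

struct node {
	struct node *prev, *next;	/* global LRU linkage */
	int in_tree;			/* stands in for the rb-tree */
};

static pthread_spinlock_t lru_lock;

static void list_del_init(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	n->prev = n->next = n;
}

/* Freeing before the unlink would let a concurrent LRU scan, which only
 * holds lru_lock, walk into freed memory. */
static void release_node(struct node *n)
{
	pthread_spin_lock(&lru_lock);
	list_del_init(n);
	pthread_spin_unlock(&lru_lock);

	n->in_tree = 0;			/* detach from the owner structure */
	free(n);
}

int main(void)
{
	struct node *n = malloc(sizeof(*n));

	if (!n)
		return 1;
	pthread_spin_init(&lru_lock, PTHREAD_PROCESS_PRIVATE);
	n->prev = n->next = n;
	n->in_tree = 1;
	release_node(n);
	return 0;
}
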
-EACCES : 0; + return 0; +} + const struct file_operations f2fs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate = f2fs_readdir, .fsync = f2fs_sync_file, + .open = f2fs_dir_open, .unlocked_ioctl = f2fs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = f2fs_compat_ioctl, diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 7ddba812e11b..2b06d4fcd954 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -33,10 +33,11 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, en->ei = *ei; INIT_LIST_HEAD(&en->list); + en->et = et; rb_link_node(&en->rb_node, parent, p); rb_insert_color(&en->rb_node, &et->root); - et->count++; + atomic_inc(&et->node_cnt); atomic_inc(&sbi->total_ext_node); return en; } @@ -45,11 +46,29 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { rb_erase(&en->rb_node, &et->root); - et->count--; + atomic_dec(&et->node_cnt); atomic_dec(&sbi->total_ext_node); if (et->cached_en == en) et->cached_en = NULL; + kmem_cache_free(extent_node_slab, en); +} + +/* + * Flow to release an extent_node: + * 1. list_del_init + * 2. __detach_extent_node + * 3. kmem_cache_free. + */ +static void __release_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + spin_lock(&sbi->extent_lock); + f2fs_bug_on(sbi, list_empty(&en->list)); + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + + __detach_extent_node(sbi, et, en); } static struct extent_tree *__grab_extent_tree(struct inode *inode) @@ -68,11 +87,13 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) et->root = RB_ROOT; et->cached_en = NULL; rwlock_init(&et->lock); - atomic_set(&et->refcount, 0); - et->count = 0; - sbi->total_ext_tree++; + INIT_LIST_HEAD(&et->list); + atomic_set(&et->node_cnt, 0); + atomic_inc(&sbi->total_ext_tree); + } else { + atomic_dec(&sbi->total_zombie_tree); + list_del_init(&et->list); } - atomic_inc(&et->refcount); up_write(&sbi->extent_tree_lock); /* never died until evict_inode */ @@ -127,32 +148,21 @@ static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, } static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, bool free_all) + struct extent_tree *et) { struct rb_node *node, *next; struct extent_node *en; - unsigned int count = et->count; + unsigned int count = atomic_read(&et->node_cnt); node = rb_first(&et->root); while (node) { next = rb_next(node); en = rb_entry(node, struct extent_node, rb_node); - - if (free_all) { - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) - list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); - } - - if (free_all || list_empty(&en->list)) { - __detach_extent_node(sbi, et, en); - kmem_cache_free(extent_node_slab, en); - } + __release_extent_node(sbi, et, en); node = next; } - return count - et->count; + return count - atomic_read(&et->node_cnt); } static void __drop_largest_extent(struct inode *inode, @@ -160,38 +170,38 @@ static void __drop_largest_extent(struct inode *inode, { struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; - if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) + if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { largest->len = 0; + f2fs_mark_inode_dirty_sync(inode); + } } -void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) -{ - if (!f2fs_may_extent_tree(inode)) - return; - - __drop_largest_extent(inode, fofs, 1); -} 
- -void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +/* return true, if inode page is changed */ +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!f2fs_may_extent_tree(inode)) - return; + if (!f2fs_may_extent_tree(inode)) { + /* drop largest extent */ + if (i_ext && i_ext->len) { + i_ext->len = 0; + return true; + } + return false; + } et = __grab_extent_tree(inode); - if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) - return; + if (!i_ext || !i_ext->len) + return false; - set_extent_info(&ei, le32_to_cpu(i_ext->fofs), - le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); + get_extent_info(&ei, i_ext); write_lock(&et->lock); - if (et->count) + if (atomic_read(&et->node_cnt)) goto out; en = __init_extent_tree(sbi, et, &ei); @@ -202,6 +212,7 @@ void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) } out: write_unlock(&et->lock); + return false; } static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, @@ -230,9 +241,10 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, if (en) { *ei = en->ei; spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) + if (!list_empty(&en->list)) { list_move_tail(&en->list, &sbi->extent_list); - et->cached_en = en; + et->cached_en = en; + } spin_unlock(&sbi->extent_lock); ret = true; } @@ -325,12 +337,12 @@ lookup_neighbors: return en; } -static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, +static struct extent_node *__try_merge_extent_node(struct inode *inode, struct extent_tree *et, struct extent_info *ei, - struct extent_node **den, struct extent_node *prev_ex, struct extent_node *next_ex) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_node *en = NULL; if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { @@ -340,28 +352,34 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, } if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { - if (en) { - __detach_extent_node(sbi, et, prev_ex); - *den = prev_ex; - } + if (en) + __release_extent_node(sbi, et, prev_ex); next_ex->ei.fofs = ei->fofs; next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; en = next_ex; } - if (en) { - __try_update_largest_extent(et, en); + if (!en) + return NULL; + + __try_update_largest_extent(inode, et, en); + + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &sbi->extent_list); et->cached_en = en; } + spin_unlock(&sbi->extent_lock); return en; } -static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, +static struct extent_node *__insert_extent_tree(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct rb_node **insert_p, struct rb_node *insert_parent) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct rb_node **p = &et->root.rb_node; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -388,8 +406,13 @@ do_insert: if (!en) return NULL; - __try_update_largest_extent(et, en); + __try_update_largest_extent(inode, et, en); + + /* update in global extent list */ + spin_lock(&sbi->extent_lock); + list_add_tail(&en->list, &sbi->extent_list); et->cached_en = en; + spin_unlock(&sbi->extent_lock); return en; } @@ -412,7 +435,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, write_lock(&et->lock); - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) { + 
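
/*
 * __try_merge_extent_node() above grows an adjacent extent rather than
 * inserting a new node when the new range lines up with a neighbour on
 * both the file-offset and block axes. The interval test standalone
 * (fofs/blk/len mirror struct extent_info; the rb-tree is omitted):
 */

#include <stdio.h>

struct extent { unsigned int fofs, blk, len; };

/* Back-mergeable: @cur starts exactly where @prev ends, on disk too. */
static int back_mergeable(const struct extent *cur, const struct extent *prev)
{
	return prev->fofs + prev->len == cur->fofs &&
	       prev->blk + prev->len == cur->blk;
}

int main(void)
{
	struct extent prev = { .fofs = 0, .blk = 100, .len = 4 };
	struct extent cur  = { .fofs = 4, .blk = 104, .len = 2 };

	if (back_mergeable(&cur, &prev)) {
		prev.len += cur.len;	/* absorb instead of inserting */
		printf("merged: fofs %u len %u blk %u\n",
		       prev.fofs, prev.len, prev.blk);
	}
	return 0;
}
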
if (is_inode_flag_set(inode, FI_NO_EXTENT)) { write_unlock(&et->lock); return false; } @@ -454,7 +477,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, set_extent_info(&ei, end, end - dei.fofs + dei.blk, org_end - end); - en1 = __insert_extent_tree(sbi, et, &ei, + en1 = __insert_extent_tree(inode, et, &ei, NULL, NULL); next_en = en1; } else { @@ -475,9 +498,9 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, } if (parts) - __try_update_largest_extent(et, en); + __try_update_largest_extent(inode, et, en); else - __detach_extent_node(sbi, et, en); + __release_extent_node(sbi, et, en); /* * if original extent is split into zero or two parts, extent @@ -488,58 +511,28 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, insert_p = NULL; insert_parent = NULL; } - - /* update in global extent list */ - spin_lock(&sbi->extent_lock); - if (!parts && !list_empty(&en->list)) - list_del(&en->list); - if (en1) - list_add_tail(&en1->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); - - /* release extent node */ - if (!parts) - kmem_cache_free(extent_node_slab, en); - en = next_en; } /* 3. update extent in extent cache */ if (blkaddr) { - struct extent_node *den = NULL; set_extent_info(&ei, fofs, blkaddr, len); - en1 = __try_merge_extent_node(sbi, et, &ei, &den, - prev_en, next_en); - if (!en1) - en1 = __insert_extent_tree(sbi, et, &ei, + if (!__try_merge_extent_node(inode, et, &ei, prev_en, next_en)) + __insert_extent_tree(inode, et, &ei, insert_p, insert_parent); /* give up extent_cache, if split and small updates happen */ if (dei.len >= 1 && prev.len < F2FS_MIN_EXTENT_LEN && et->largest.len < F2FS_MIN_EXTENT_LEN) { - et->largest.len = 0; - set_inode_flag(F2FS_I(inode), FI_NO_EXTENT); + __drop_largest_extent(inode, 0, UINT_MAX); + set_inode_flag(inode, FI_NO_EXTENT); } - - spin_lock(&sbi->extent_lock); - if (en1) { - if (list_empty(&en1->list)) - list_add_tail(&en1->list, &sbi->extent_list); - else - list_move_tail(&en1->list, &sbi->extent_list); - } - if (den && !list_empty(&den->list)) - list_del(&den->list); - spin_unlock(&sbi->extent_lock); - - if (den) - kmem_cache_free(extent_node_slab, den); } - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) - __free_extent_tree(sbi, et, true); + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + __free_extent_tree(sbi, et); write_unlock(&et->lock); @@ -548,46 +541,42 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) { - struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; - struct extent_node *en, *tmp; - unsigned long ino = F2FS_ROOT_INO(sbi); - struct radix_tree_root *root = &sbi->extent_tree_root; - unsigned int found; + struct extent_tree *et, *next; + struct extent_node *en; unsigned int node_cnt = 0, tree_cnt = 0; int remained; if (!test_opt(sbi, EXTENT_CACHE)) return 0; + if (!atomic_read(&sbi->total_zombie_tree)) + goto free_node; + if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; /* 1. 
remove unreferenced extent tree */ - while ((found = radix_tree_gang_lookup(root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - if (!atomic_read(&et->refcount)) { - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, true); - write_unlock(&et->lock); - - radix_tree_delete(root, et->ino); - kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; - tree_cnt++; - - if (node_cnt + tree_cnt >= nr_shrink) - goto unlock_out; - } + list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + if (atomic_read(&et->node_cnt)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et); + write_unlock(&et->lock); } + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); + list_del_init(&et->list); + radix_tree_delete(&sbi->extent_tree_root, et->ino); + kmem_cache_free(extent_tree_slab, et); + atomic_dec(&sbi->total_ext_tree); + atomic_dec(&sbi->total_zombie_tree); + tree_cnt++; + + if (node_cnt + tree_cnt >= nr_shrink) + goto unlock_out; + cond_resched(); } up_write(&sbi->extent_tree_lock); +free_node: /* 2. remove LRU extent entries */ if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; @@ -595,34 +584,29 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) remained = nr_shrink - (node_cnt + tree_cnt); spin_lock(&sbi->extent_lock); - list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { - if (!remained--) + for (; remained > 0; remained--) { + if (list_empty(&sbi->extent_list)) break; + en = list_first_entry(&sbi->extent_list, + struct extent_node, list); + et = en->et; + if (!write_trylock(&et->lock)) { + /* refresh this extent node's position in extent list */ + list_move_tail(&en->list, &sbi->extent_list); + continue; + } + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + + __detach_extent_node(sbi, et, en); + + write_unlock(&et->lock); + node_cnt++; + spin_lock(&sbi->extent_lock); } spin_unlock(&sbi->extent_lock); - /* - * reset ino for searching victims from beginning of global extent tree. 
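
/*
 * The LRU pass above only *try*-acquires each victim's tree lock while
 * the global spinlock is held; on contention the node is rotated to the
 * list tail and skipped, which sidesteps a lock-order inversion against
 * writers that take the tree lock first. A compact pthreads model of
 * that trylock-or-rotate step (the data structures are elided):
 */

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Returns 1 if the victim was evicted, 0 if it was rotated and skipped. */
static int shrink_one(void)
{
	int evicted = 0;

	pthread_mutex_lock(&list_lock);
	if (pthread_rwlock_trywrlock(&tree_lock) == 0) {
		/* unlink from the list, then drop the list lock early */
		pthread_mutex_unlock(&list_lock);
		/* ... detach the node from its tree here ... */
		pthread_rwlock_unlock(&tree_lock);
		evicted = 1;
	} else {
		/* contended: rotate the victim to the tail, try the next */
		pthread_mutex_unlock(&list_lock);
	}
	return evicted;
}

int main(void)
{
	printf("evicted: %d\n", shrink_one());
	return 0;
}
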
- */ - ino = F2FS_ROOT_INO(sbi); - - while ((found = radix_tree_gang_lookup(root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, false); - write_unlock(&et->lock); - - if (node_cnt + tree_cnt >= nr_shrink) - goto unlock_out; - } - } unlock_out: up_write(&sbi->extent_tree_lock); out: @@ -637,16 +621,29 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) struct extent_tree *et = F2FS_I(inode)->extent_tree; unsigned int node_cnt = 0; - if (!et) + if (!et || !atomic_read(&et->node_cnt)) return 0; write_lock(&et->lock); - node_cnt = __free_extent_tree(sbi, et, true); + node_cnt = __free_extent_tree(sbi, et); write_unlock(&et->lock); return node_cnt; } +void f2fs_drop_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + + set_inode_flag(inode, FI_NO_EXTENT); + + write_lock(&et->lock); + __free_extent_tree(sbi, et); + __drop_largest_extent(inode, 0, UINT_MAX); + write_unlock(&et->lock); +} + void f2fs_destroy_extent_tree(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -656,8 +653,12 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (!et) return; - if (inode->i_nlink && !is_bad_inode(inode) && et->count) { - atomic_dec(&et->refcount); + if (inode->i_nlink && !is_bad_inode(inode) && + atomic_read(&et->node_cnt)) { + down_write(&sbi->extent_tree_lock); + list_add_tail(&et->list, &sbi->zombie_list); + atomic_inc(&sbi->total_zombie_tree); + up_write(&sbi->extent_tree_lock); return; } @@ -666,11 +667,10 @@ void f2fs_destroy_extent_tree(struct inode *inode) /* delete extent tree entry in radix tree */ down_write(&sbi->extent_tree_lock); - atomic_dec(&et->refcount); - f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; + atomic_dec(&sbi->total_ext_tree); up_write(&sbi->extent_tree_lock); F2FS_I(inode)->extent_tree = NULL; @@ -689,20 +689,20 @@ bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, void f2fs_update_extent_cache(struct dnode_of_data *dn) { - struct f2fs_inode_info *fi = F2FS_I(dn->inode); pgoff_t fofs; + block_t blkaddr; if (!f2fs_may_extent_tree(dn->inode)) return; - f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); + if (dn->data_blkaddr == NEW_ADDR) + blkaddr = NULL_ADDR; + else + blkaddr = dn->data_blkaddr; - - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + - dn->ofs_in_node; - - if (f2fs_update_extent_tree_range(dn->inode, fofs, dn->data_blkaddr, 1)) - sync_inode_page(dn); + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + dn->ofs_in_node; + f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); } void f2fs_update_extent_cache_range(struct dnode_of_data *dn, @@ -712,8 +712,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, if (!f2fs_may_extent_tree(dn->inode)) return; - if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len)) - sync_inode_page(dn); + f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); } void init_extent_cache_info(struct f2fs_sb_info *sbi) @@ -722,7 +721,9 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi) init_rwsem(&sbi->extent_tree_lock); INIT_LIST_HEAD(&sbi->extent_list); 
spin_lock_init(&sbi->extent_lock); - sbi->total_ext_tree = 0; + atomic_set(&sbi->total_ext_tree, 0); + INIT_LIST_HEAD(&sbi->zombie_list); + atomic_set(&sbi->total_zombie_tree, 0); atomic_set(&sbi->total_ext_node, 0); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2871576fbca4..af293e84e5cd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -21,10 +21,12 @@ #include #include #include +#include +#include +#include #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) -#define f2fs_down_write(x, y) down_write_nest_lock(x, y) #else #define f2fs_bug_on(sbi, condition) \ do { \ @@ -33,7 +35,30 @@ set_sbi_flag(sbi, SBI_NEED_FSCK); \ } \ } while (0) -#define f2fs_down_write(x, y) down_write(x) +#endif + +#ifdef CONFIG_F2FS_FAULT_INJECTION +enum { + FAULT_KMALLOC, + FAULT_PAGE_ALLOC, + FAULT_ALLOC_NID, + FAULT_ORPHAN, + FAULT_BLOCK, + FAULT_DIR_DEPTH, + FAULT_EVICT_INODE, + FAULT_IO, + FAULT_CHECKPOINT, + FAULT_MAX, +}; + +struct f2fs_fault_info { + atomic_t inject_ops; + unsigned int inject_rate; + unsigned int inject_type; +}; + +extern char *fault_name[FAULT_MAX]; +#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type))) #endif /* @@ -54,6 +79,10 @@ #define F2FS_MOUNT_FASTBOOT 0x00001000 #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_FORCE_FG_GC 0x00004000 +#define F2FS_MOUNT_DATA_FLUSH 0x00008000 +#define F2FS_MOUNT_FAULT_INJECTION 0x00010000 +#define F2FS_MOUNT_ADAPTIVE 0x00020000 +#define F2FS_MOUNT_LFS 0x00040000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -74,6 +103,7 @@ struct f2fs_mount_info { }; #define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_HMSMR 0x0002 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -82,25 +112,30 @@ struct f2fs_mount_info { #define F2FS_CLEAR_FEATURE(sb, mask) \ F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) -#define CRCPOLY_LE 0xedb88320 - -static inline __u32 f2fs_crc32(void *buf, size_t len) +/** + * wq_has_sleeper - check if there are any waiting processes + * @wq: wait queue head + * + * Returns true if wq has waiting processes + * + * Please refer to the comment for waitqueue_active. + */ +static inline bool wq_has_sleeper(wait_queue_head_t *wq) { - unsigned char *p = (unsigned char *)buf; - __u32 crc = F2FS_SUPER_MAGIC; - int i; - - while (len--) { - crc ^= *p++; - for (i = 0; i < 8; i++) - crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0); - } - return crc; + /* + * We need to be sure we are in sync with the + * add_wait_queue modifications to the wait queue. + * + * This memory barrier should be paired with one on the + * waiting side. 
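
/*
 * The helper deleted above is the classic bit-serial CRC32 over the
 * reflected polynomial 0xedb88320, seeded with F2FS_SUPER_MAGIC; the
 * replacement computes the same checksum through the kernel crypto
 * API's crc32 shash. The removed algorithm, runnable standalone
 * (0xF2F52010 assumed to be F2FS_SUPER_MAGIC):
 */

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define CRCPOLY_LE	0xedb88320u
#define SUPER_MAGIC	0xF2F52010u	/* F2FS_SUPER_MAGIC */

static uint32_t f2fs_crc32(const void *buf, size_t len)
{
	const unsigned char *p = buf;
	uint32_t crc = SUPER_MAGIC;
	int i;

	while (len--) {
		crc ^= *p++;
		for (i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
	}
	return crc;
}

int main(void)
{
	const char msg[] = "f2fs checkpoint";

	printf("crc = 0x%08x\n", f2fs_crc32(msg, sizeof(msg) - 1));
	return 0;
}
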
+ */ + smp_mb(); + return waitqueue_active(wq); } -static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size) +static inline void inode_nohighmem(struct inode *inode) { - return f2fs_crc32(buf, buf_size) == blk_crc; + mapping_set_gfp_mask(inode->i_mapping, GFP_USER); } /* @@ -119,12 +154,13 @@ enum { CP_DISCARD, }; -#define DEF_BATCHED_TRIM_SECTIONS 32 +#define DEF_BATCHED_TRIM_SECTIONS 2 #define BATCHED_TRIM_SEGMENTS(sbi) \ (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define DEF_CP_INTERVAL 60 /* 60 secs */ +#define DEF_IDLE_INTERVAL 5 /* 5 secs */ struct cp_control { int reason; @@ -158,13 +194,7 @@ struct ino_entry { nid_t ino; /* inode number */ }; -/* - * for the list of directory inodes or gc inodes. - * NOTE: there are two slab users for this structure, if we add/modify/delete - * fields in structure for one of slab users, it may affect fields or size of - * other one, in this condition, it's better to split both of slab and related - * data structure. - */ +/* for the list of inodes to be GCed */ struct inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ @@ -177,46 +207,52 @@ struct discard_entry { int len; /* # of consecutive blocks of the discard */ }; +struct bio_entry { + struct list_head list; + struct bio *bio; + struct completion event; + int error; +}; + /* for the list of fsync inodes, used only during recovery */ struct fsync_inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ block_t blkaddr; /* block address locating the last fsync */ block_t last_dentry; /* block address locating the last dentry */ - block_t last_inode; /* block address locating the last inode */ }; -#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats)) -#define sits_in_cursum(sum) (le16_to_cpu(sum->n_sits)) +#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats)) +#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits)) -#define nat_in_journal(sum, i) (sum->nat_j.entries[i].ne) -#define nid_in_journal(sum, i) (sum->nat_j.entries[i].nid) -#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) -#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) +#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne) +#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid) +#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se) +#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno) -#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum)) -#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum)) +#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) +#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) -static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) +static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { - int before = nats_in_cursum(rs); - rs->n_nats = cpu_to_le16(before + i); + int before = nats_in_cursum(journal); + journal->n_nats = cpu_to_le16(before + i); return before; } -static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) +static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) { - int before = sits_in_cursum(rs); - rs->n_sits = cpu_to_le16(before + i); + int before = sits_in_cursum(journal); + journal->n_sits = cpu_to_le16(before + i); return before; } -static inline bool __has_cursum_space(struct 
f2fs_summary_block *sum, int size, - int type) +static inline bool __has_cursum_space(struct f2fs_journal *journal, + int size, int type) { if (type == NAT_JOURNAL) - return size <= MAX_NAT_JENTRIES(sum); - return size <= MAX_SIT_JENTRIES(sum); + return size <= MAX_NAT_JENTRIES(journal); + return size <= MAX_SIT_JENTRIES(journal); } /* @@ -234,13 +270,13 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) #define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) +#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) +#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ + struct f2fs_move_range) -#define F2FS_IOC_SET_ENCRYPTION_POLICY \ - _IOR('f', 19, struct f2fs_encryption_policy) -#define F2FS_IOC_GET_ENCRYPTION_PWSALT \ - _IOW('f', 20, __u8[16]) -#define F2FS_IOC_GET_ENCRYPTION_POLICY \ - _IOW('f', 21, struct f2fs_encryption_policy) +#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY +#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY +#define F2FS_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT /* * should be same as XFS_IOC_GOINGDOWN. @@ -256,33 +292,27 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, /* * ioctl commands in 32 bit emulation */ -#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +struct f2fs_defragment { + u64 start; + u64 len; +}; + +struct f2fs_move_range { + u32 dst_fd; /* destination fd */ + u64 pos_in; /* start position in src_fd */ + u64 pos_out; /* start position in dst_fd */ + u64 len; /* size to move */ +}; + /* * For INODE and NODE manager */ /* for directory operations */ -struct f2fs_str { - unsigned char *name; - u32 len; -}; - -struct f2fs_filename { - const struct qstr *usr_fname; - struct f2fs_str disk_name; - f2fs_hash_t hash; -#ifdef CONFIG_F2FS_FS_ENCRYPTION - struct f2fs_str crypto_buf; -#endif -}; - -#define FSTR_INIT(n, l) { .name = n, .len = l } -#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) -#define fname_name(p) ((p)->disk_name.name) -#define fname_len(p) ((p)->disk_name.len) - struct f2fs_dentry_ptr { struct inode *inode; const void *bitmap; @@ -350,6 +380,7 @@ struct extent_node { struct rb_node rb_node; /* rb node located in rb-tree */ struct list_head list; /* node in global extent list of sbi */ struct extent_info ei; /* extent info */ + struct extent_tree *et; /* extent tree pointer */ }; struct extent_tree { @@ -357,9 +388,9 @@ struct extent_tree { struct rb_root root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ struct extent_info largest; /* largested extent info */ + struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ - atomic_t refcount; /* reference count of rb-tree */ - unsigned int count; /* # of extent node in rb-tree*/ + atomic_t node_cnt; /* # of extent node in rb-tree*/ }; /* @@ -378,6 +409,7 @@ struct f2fs_map_blocks { block_t m_lblk; unsigned int m_len; unsigned int m_flags; + pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ }; /* for flag in get_data_block */ @@ -385,6 +417,8 @@ struct f2fs_map_blocks { #define F2FS_GET_BLOCK_DIO 1 #define F2FS_GET_BLOCK_FIEMAP 2 #define 
F2FS_GET_BLOCK_BMAP 3 +#define F2FS_GET_BLOCK_PRE_DIO 4 +#define F2FS_GET_BLOCK_PRE_AIO 5 /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. @@ -406,15 +440,6 @@ struct f2fs_map_blocks { #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) -/* Encryption algorithms */ -#define F2FS_ENCRYPTION_MODE_INVALID 0 -#define F2FS_ENCRYPTION_MODE_AES_256_XTS 1 -#define F2FS_ENCRYPTION_MODE_AES_256_GCM 2 -#define F2FS_ENCRYPTION_MODE_AES_256_CBC 3 -#define F2FS_ENCRYPTION_MODE_AES_256_CTS 4 - -#include "f2fs_crypto.h" - #define DEF_DIR_LEVEL 0 struct f2fs_inode_info { @@ -429,30 +454,27 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - atomic_t dirty_pages; /* # of dirty pages */ + struct percpu_counter dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ - struct inode_entry *dirty_dir; /* the pointer of dirty dir */ + loff_t last_disk_size; /* lastly written file size */ + struct list_head dirty_list; /* dirty list for dirs and files */ + struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ - struct extent_tree *extent_tree; /* cached extent_tree entry */ - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - /* Encryption params */ - struct f2fs_crypt_info *i_crypt_info; -#endif + struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ }; static inline void get_extent_info(struct extent_info *ext, - struct f2fs_extent i_ext) + struct f2fs_extent *i_ext) { - ext->fofs = le32_to_cpu(i_ext.fofs); - ext->blk = le32_to_cpu(i_ext.blk); - ext->len = le32_to_cpu(i_ext.len); + ext->fofs = le32_to_cpu(i_ext->fofs); + ext->blk = le32_to_cpu(i_ext->blk); + ext->len = le32_to_cpu(i_ext->len); } static inline void set_raw_extent(struct extent_info *ext, @@ -497,11 +519,14 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -static inline void __try_update_largest_extent(struct extent_tree *et, - struct extent_node *en) +extern void f2fs_mark_inode_dirty_sync(struct inode *); +static inline void __try_update_largest_extent(struct inode *inode, + struct extent_tree *et, struct extent_node *en) { - if (en->ei.len > et->largest.len) + if (en->ei.len > et->largest.len) { et->largest = en->ei; + f2fs_mark_inode_dirty_sync(inode); + } } struct f2fs_nm_info { @@ -511,6 +536,7 @@ struct f2fs_nm_info { nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ + unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */ /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ @@ -544,6 +570,9 @@ struct dnode_of_data { nid_t nid; /* node id of the direct node block */ unsigned int ofs_in_node; /* data offset in the node page */ bool inode_page_locked; /* inode page is locked or not */ + bool node_changed; /* is node block changed */ + char cur_level; /* level of hole node page */ + char max_level; /* level of current page located */ 
block_t data_blkaddr; /* block address of the node block */ }; @@ -594,6 +623,7 @@ struct flush_cmd { struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + atomic_t submit_flush; /* # of issued flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -618,6 +648,7 @@ struct f2fs_sm_info { /* for small discard management */ struct list_head discard_list; /* 4KB discard list */ + struct list_head wait_list; /* linked with issued discard bio */ int nr_discards; /* # of discards in the list */ int max_discards; /* max. discards to be issued */ @@ -645,11 +676,12 @@ struct f2fs_sm_info { * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ enum count_type { - F2FS_WRITEBACK, F2FS_DIRTY_DENTS, + F2FS_DIRTY_DATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_INMEM_PAGES, + F2FS_DIRTY_IMETA, NR_COUNT_TYPE, }; @@ -673,6 +705,7 @@ enum page_type { META_FLUSH, INMEM, /* the below types are used by tracepoints only. */ INMEM_DROP, + INMEM_REVOKE, IPU, OPU, }; @@ -681,7 +714,8 @@ struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ - block_t blk_addr; /* block address to be written */ + block_t new_blkaddr; /* new block address to be written */ + block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ }; @@ -695,6 +729,13 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +enum inode_type { + DIR_INODE, /* for dirty dir inode */ + FILE_INODE, /* for dirty regular/symlink inode */ + DIRTY_META, /* for all dirtied inode metadata */ + NR_INODE_TYPE, +}; + /* for inner inode cache management */ struct inode_management { struct radix_tree_root ino_root; /* ino entry array */ @@ -709,15 +750,31 @@ enum { SBI_IS_CLOSE, /* specify unmounting */ SBI_NEED_FSCK, /* need fsck.f2fs to fix */ SBI_POR_DOING, /* recovery is doing or not */ + SBI_NEED_SB_WRITE, /* need to recover superblock */ + SBI_NEED_CP, /* need to checkpoint */ }; +enum { + CP_TIME, + REQ_TIME, + MAX_TIME, +}; + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#define F2FS_KEY_DESC_PREFIX "f2fs:" +#define F2FS_KEY_DESC_PREFIX_SIZE 5 +#endif struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ - struct buffer_head *raw_super_buf; /* buffer head of raw sb */ struct f2fs_super_block *raw_super; /* raw super block pointer */ - int s_flag; /* flags for sbi */ + int valid_super_block; /* valid super block no */ + unsigned long s_flag; /* flags for sbi */ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; + u8 key_prefix_size; +#endif /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ @@ -728,32 +785,36 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ + struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* 
checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ - struct mutex writepages; /* mutex for writepages() */ wait_queue_head_t cp_wait; - long cp_expires, cp_interval; /* next expected periodic cp */ + unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ + long interval_time[MAX_TIME]; /* to store thresholds */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ /* for orphan inode, use 0'th array */ unsigned int max_orphans; /* max orphan inodes */ - /* for directory inode management */ - struct list_head dir_inode_list; /* dir inode list */ - spinlock_t dir_inode_lock; /* for dir inode list lock */ + /* for inode management */ + struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */ + spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */ /* for extent tree cache */ struct radix_tree_root extent_tree_root;/* cache extent cache entries */ struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ - int total_ext_tree; /* extent tree count */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ atomic_t total_ext_node; /* extent info count */ /* basic filesystem units */ @@ -770,17 +831,24 @@ struct f2fs_sb_info { unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ - unsigned int total_valid_inode_count; /* valid inode count */ + loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ - block_t alloc_valid_block_count; /* # of allocated blocks */ block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ - atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */ + atomic_t nr_wb_bios; /* # of writeback bios */ + + /* # of pages, see count_type */ + struct percpu_counter nr_pages[NR_COUNT_TYPE]; + /* # of allocated blocks */ + struct percpu_counter alloc_valid_block_count; + + /* valid inode count */ + struct percpu_counter total_valid_inode_count; struct f2fs_mount_info mount_opt; /* mount options */ @@ -809,7 +877,7 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ int bg_gc; /* background gc calls */ - unsigned int n_dirty_dirs; /* # of dir inodes */ + unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ @@ -822,11 +890,102 @@ struct f2fs_sb_info { struct list_head s_list; struct mutex umount_mutex; unsigned int shrinker_run_no; + + /* For write statistics */ + u64 sectors_written_start; + u64 kbytes_written; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* For fault injection */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info fault_info; +#endif }; +#ifdef CONFIG_F2FS_FAULT_INJECTION +static inline bool 
time_to_inject(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_fault_info *ffi = &sbi->fault_info; + + if (!ffi->inject_rate) + return false; + + if (!IS_FAULT_SET(ffi, type)) + return false; + + atomic_inc(&ffi->inject_ops); + if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { + atomic_set(&ffi->inject_ops, 0); + printk("%sF2FS-fs : inject %s in %pF\n", + KERN_INFO, + fault_name[type], + __builtin_return_address(0)); + return true; + } + return false; +} +#endif + +/* For write statistics. Suppose sector size is 512 bytes, + * and the return value is in kbytes. s is of struct f2fs_sb_info. + */ +#define BD_PART_WRITTEN(s) \ +(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \ + s->sectors_written_start) >> 1) + +static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) +{ + sbi->last_time[type] = jiffies; +} + +static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) +{ + struct timespec ts = {sbi->interval_time[type], 0}; + unsigned long interval = timespec_to_jiffies(&ts); + + return time_after(jiffies, sbi->last_time[type] + interval); +} + +static inline bool is_idle(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct request_queue *q = bdev_get_queue(bdev); + struct request_list *rl = &q->root_rl; + + if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC]) + return 0; + + return f2fs_time_over(sbi, REQ_TIME); +} + /* * Inline functions */ +static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, + unsigned int length) +{ + SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver); + u32 *ctx = (u32 *)shash_desc_ctx(shash); + int err; + + shash->tfm = sbi->s_chksum_driver; + shash->flags = 0; + *ctx = F2FS_SUPER_MAGIC; + + err = crypto_shash_update(shash, address, length); + BUG_ON(err); + + return *ctx; +} + +static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, + void *buf, size_t buf_size) +{ + return f2fs_crc32(sbi, buf, buf_size) == blk_crc; +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -909,17 +1068,17 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type) { - return sbi->s_flag & (0x01 << type); + return test_bit(type, &sbi->s_flag); } static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { - sbi->s_flag |= (0x01 << type); + set_bit(type, &sbi->s_flag); } static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { - sbi->s_flag &= ~(0x01 << type); + clear_bit(type, &sbi->s_flag); } static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) @@ -927,26 +1086,57 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } -static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return ckpt_flags & f; } -static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline bool is_set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return __is_set_ckpt_flags(F2FS_CKPT(sbi), f); +} + +static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = 
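
/*
 * time_to_inject() above fires once every inject_rate calls of an
 * enabled fault type. Its counting scheme without the sbi plumbing --
 * plain single-threaded C, where the kernel uses an atomic_t:
 */

#include <stdio.h>

struct fault_info {
	unsigned int inject_ops;	/* calls since the last injection */
	unsigned int inject_rate;	/* inject every Nth call; 0 = off */
	unsigned int inject_type;	/* bitmask of enabled fault types */
};

static int time_to_inject(struct fault_info *ffi, int type)
{
	if (!ffi->inject_rate || !(ffi->inject_type & (1u << type)))
		return 0;
	if (++ffi->inject_ops >= ffi->inject_rate) {
		ffi->inject_ops = 0;
		return 1;		/* simulate the failure this time */
	}
	return 0;
}

int main(void)
{
	struct fault_info ffi = { 0, 3, 1u << 0 };
	int i;

	for (i = 0; i < 7; i++)
		printf("call %d: %s\n", i,
		       time_to_inject(&ffi, 0) ? "inject" : "pass");
	return 0;
}
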
le32_to_cpu(cp->ckpt_flags); ckpt_flags |= f; cp->ckpt_flags = cpu_to_le32(ckpt_flags); } -static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + spin_lock(&sbi->cp_lock); + __set_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock(&sbi->cp_lock); +} + +static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags &= (~f); cp->ckpt_flags = cpu_to_le32(ckpt_flags); } +static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) +{ + spin_lock(&sbi->cp_lock); + __clear_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock(&sbi->cp_lock); +} + +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +{ + struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + + return blk_queue_discard(q); +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -959,7 +1149,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex); + down_write(&sbi->cp_rwsem); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) @@ -985,8 +1175,8 @@ static inline bool __remain_node_summaries(int reason) static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) { - return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) || - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG)); + return (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG) || + is_set_ckpt_flags(sbi, CP_FASTBOOT_FLAG)); } /* @@ -1019,22 +1209,37 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } +static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool); static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, - struct inode *inode, blkcnt_t count) + struct inode *inode, blkcnt_t *count) { - block_t valid_block_count; + blkcnt_t diff; + +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_BLOCK)) + return false; +#endif + /* + * let's increase this in prior to actual block count change in order + * for f2fs_sync_file to avoid data races when deciding checkpoint. 
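
set_ckpt_flags()/clear_ckpt_flags() now take the sbi and serialize the little-endian read-modify-write under cp_lock, so concurrent updaters cannot drop each other's bits. The pattern in isolation, with hypothetical demo names:

#include <linux/spinlock.h>
#include <linux/types.h>
#include <asm/byteorder.h>

static DEFINE_SPINLOCK(demo_lock);
static __le32 demo_flags;               /* on-disk, little-endian */

static void demo_set_flag(u32 f)
{
        u32 v;

        spin_lock(&demo_lock);
        v = le32_to_cpu(demo_flags);    /* RMW must not interleave */
        v |= f;
        demo_flags = cpu_to_le32(v);
        spin_unlock(&demo_lock);
}
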
+ */ + percpu_counter_add(&sbi->alloc_valid_block_count, (*count)); spin_lock(&sbi->stat_lock); - valid_block_count = - sbi->total_valid_block_count + (block_t)count; - if (unlikely(valid_block_count > sbi->user_block_count)) { - spin_unlock(&sbi->stat_lock); - return false; + sbi->total_valid_block_count += (block_t)(*count); + if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) { + diff = sbi->total_valid_block_count - sbi->user_block_count; + *count -= diff; + sbi->total_valid_block_count = sbi->user_block_count; + if (!*count) { + spin_unlock(&sbi->stat_lock); + percpu_counter_sub(&sbi->alloc_valid_block_count, diff); + return false; + } } - inode->i_blocks += count; - sbi->total_valid_block_count = valid_block_count; - sbi->alloc_valid_block_count += (block_t)count; spin_unlock(&sbi->stat_lock); + + f2fs_i_blocks_write(inode, *count, true); return true; } @@ -1045,27 +1250,31 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); f2fs_bug_on(sbi, inode->i_blocks < count); - inode->i_blocks -= count; sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); + f2fs_i_blocks_write(inode, count, false); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { - atomic_inc(&sbi->nr_pages[count_type]); + percpu_counter_inc(&sbi->nr_pages[count_type]); + + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) + return; + set_sbi_flag(sbi, SBI_IS_DIRTY); } static inline void inode_inc_dirty_pages(struct inode *inode) { - atomic_inc(&F2FS_I(inode)->dirty_pages); - if (S_ISDIR(inode->i_mode)) - inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + percpu_counter_inc(&F2FS_I(inode)->dirty_pages); + inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { - atomic_dec(&sbi->nr_pages[count_type]); + percpu_counter_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) @@ -1074,28 +1283,28 @@ static inline void inode_dec_dirty_pages(struct inode *inode) !S_ISLNK(inode->i_mode)) return; - atomic_dec(&F2FS_I(inode)->dirty_pages); - - if (S_ISDIR(inode->i_mode)) - dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + percpu_counter_dec(&F2FS_I(inode)->dirty_pages); + dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? 
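
inc_valid_block_count() now charges the percpu allocation counter first, then clamps the request against user_block_count under stat_lock and hands back the excess, granting a partial allocation when only some blocks fit. The clamping arithmetic on its own, as plain userspace C with hypothetical names:

#include <stdbool.h>
#include <stdint.h>

/* clamp a block reservation against the user-visible maximum */
static bool reserve_blocks(uint64_t *total_valid, uint64_t user_max,
                           uint64_t *count)
{
        *total_valid += *count;
        if (*total_valid > user_max) {
                uint64_t diff = *total_valid - user_max;

                *count -= diff;                 /* grant what fits */
                *total_valid = user_max;
                if (*count == 0)
                        return false;           /* nothing granted */
        }
        return true;
}
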
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } -static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) +static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { - return atomic_read(&sbi->nr_pages[count_type]); + return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); } -static inline int get_dirty_pages(struct inode *inode) +static inline s64 get_dirty_pages(struct inode *inode) { - return atomic_read(&F2FS_I(inode)->dirty_pages); + return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { - unsigned int pages_per_sec = sbi->segs_per_sec * - (1 << sbi->log_blocks_per_seg); - return ((get_pages(sbi, block_type) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; + unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; + unsigned int segs = (get_pages(sbi, block_type) + pages_per_sec - 1) >> + sbi->log_blocks_per_seg; + + return segs / sbi->segs_per_sec; } static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) @@ -1103,6 +1312,11 @@ static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) return sbi->total_valid_block_count; } +static inline block_t discard_blocks(struct f2fs_sb_info *sbi) +{ + return sbi->discard_blks; +} + static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1182,13 +1396,13 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, } if (inode) - inode->i_blocks++; + f2fs_i_blocks_write(inode, 1, true); - sbi->alloc_valid_block_count++; sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); + percpu_counter_inc(&sbi->alloc_valid_block_count); return true; } @@ -1201,7 +1415,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, !sbi->total_valid_node_count); f2fs_bug_on(sbi, !inode->i_blocks); - inode->i_blocks--; + f2fs_i_blocks_write(inode, 1, false); sbi->total_valid_node_count--; sbi->total_valid_block_count--; @@ -1215,28 +1429,30 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count); - sbi->total_valid_inode_count++; - spin_unlock(&sbi->stat_lock); + percpu_counter_inc(&sbi->total_valid_inode_count); } static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, !sbi->total_valid_inode_count); - sbi->total_valid_inode_count--; - spin_unlock(&sbi->stat_lock); + percpu_counter_dec(&sbi->total_valid_inode_count); } -static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) +static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) { - return sbi->total_valid_inode_count; + return percpu_counter_sum_positive(&sbi->total_valid_inode_count); } static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, pgoff_t index, bool for_write) { +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct page *page = find_lock_page(mapping, index); + if (page) + return page; + + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) + return NULL; +#endif if (!for_write) return grab_cache_page(mapping, index); return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); @@ -1261,7 +1477,7 @@ static inline void f2fs_put_page(struct page *page, int unlock) 
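
get_blocktype_secs() above first rounds the dirty-page count up to whole segments, then divides by segments per section. Worked through with hypothetical geometry (512 blocks per segment, one segment per section):

#include <stdio.h>

int main(void)
{
        unsigned int log_blocks_per_seg = 9;    /* 512 blocks/segment */
        unsigned int segs_per_sec = 1;
        unsigned int pages_per_sec = segs_per_sec << log_blocks_per_seg;
        long long pages = 1000;                 /* dirty pages */
        unsigned int segs;

        segs = (pages + pages_per_sec - 1) >> log_blocks_per_seg;
        printf("%u segments -> %u sections\n", segs, segs / segs_per_sec);
        return 0;                               /* 2 segments, 2 sections */
}
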
f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page)); unlock_page(page); } - page_cache_release(page); + put_page(page); } static inline void f2fs_put_dnode(struct dnode_of_data *dn) @@ -1396,13 +1612,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) enum { FI_NEW_INODE, /* indicate newly allocated inode */ FI_DIRTY_INODE, /* indicate inode is dirty or not */ + FI_AUTO_RECOVER, /* indicate inode is recoverable */ FI_DIRTY_DIR, /* indicate directory has dirty pages */ FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ FI_FREE_NID, /* free allocated nide */ - FI_UPDATE_DIR, /* should update inode block for consistency */ - FI_DELAY_IPUT, /* used for the recovery */ FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ FI_INLINE_DATA, /* used for inline data*/ @@ -1416,71 +1631,152 @@ enum { FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ FI_INLINE_DOTS, /* indicate inline dot dentries */ + FI_DO_DEFRAG, /* indicate defragment is running */ + FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ }; -static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) +static inline void __mark_inode_dirty_flag(struct inode *inode, + int flag, bool set) { - if (!test_bit(flag, &fi->flags)) - set_bit(flag, &fi->flags); + switch (flag) { + case FI_INLINE_XATTR: + case FI_INLINE_DATA: + case FI_INLINE_DENTRY: + if (set) + return; + case FI_DATA_EXIST: + case FI_INLINE_DOTS: + f2fs_mark_inode_dirty_sync(inode); + } } -static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) +static inline void set_inode_flag(struct inode *inode, int flag) { - return test_bit(flag, &fi->flags); + if (!test_bit(flag, &F2FS_I(inode)->flags)) + set_bit(flag, &F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, true); } -static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag) +static inline int is_inode_flag_set(struct inode *inode, int flag) { - if (test_bit(flag, &fi->flags)) - clear_bit(flag, &fi->flags); + return test_bit(flag, &F2FS_I(inode)->flags); } -static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) +static inline void clear_inode_flag(struct inode *inode, int flag) { - fi->i_acl_mode = mode; - set_inode_flag(fi, FI_ACL_MODE); + if (test_bit(flag, &F2FS_I(inode)->flags)) + clear_bit(flag, &F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, false); } -static inline void get_inline_info(struct f2fs_inode_info *fi, - struct f2fs_inode *ri) +static inline void set_acl_inode(struct inode *inode, umode_t mode) { + F2FS_I(inode)->i_acl_mode = mode; + set_inode_flag(inode, FI_ACL_MODE); + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_links_write(struct inode *inode, bool inc) +{ + if (inc) + inc_nlink(inode); + else + drop_nlink(inode); + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_blocks_write(struct inode *inode, + blkcnt_t diff, bool add) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + inode->i_blocks = add ? 
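
__mark_inode_dirty_flag() above leans on switch fall-through: setting one of the inline flags returns early, while clearing one, or touching FI_DATA_EXIST/FI_INLINE_DOTS in either direction, dirties the inode so the change is written back. The control flow reduced to a runnable sketch (demo names hypothetical):

#include <stdbool.h>
#include <stdio.h>

enum { DEMO_FI_INLINE_DATA, DEMO_FI_DATA_EXIST };

static void demo_mark_dirty(void)
{
        puts("inode dirtied");
}

static void demo_dirty_on_flag(int flag, bool set)
{
        switch (flag) {
        case DEMO_FI_INLINE_DATA:
                if (set)
                        return;         /* setting: nothing to persist yet */
                /* fall through: clearing must reach disk */
        case DEMO_FI_DATA_EXIST:
                demo_mark_dirty();
        }
}

int main(void)
{
        demo_dirty_on_flag(DEMO_FI_INLINE_DATA, false); /* prints */
        demo_dirty_on_flag(DEMO_FI_INLINE_DATA, true);  /* silent */
        return 0;
}
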
inode->i_blocks + diff : + inode->i_blocks - diff; + f2fs_mark_inode_dirty_sync(inode); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + if (i_size_read(inode) == i_size) + return; + + i_size_write(inode, i_size); + f2fs_mark_inode_dirty_sync(inode); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline bool f2fs_skip_inode_update(struct inode *inode) +{ + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) + return false; + return F2FS_I(inode)->last_disk_size == i_size_read(inode); +} + +static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) +{ + F2FS_I(inode)->i_current_depth = depth; + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) +{ + F2FS_I(inode)->i_xattr_nid = xnid; + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) +{ + F2FS_I(inode)->i_pino = pino; + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + if (ri->i_inline & F2FS_INLINE_XATTR) - set_inode_flag(fi, FI_INLINE_XATTR); + set_bit(FI_INLINE_XATTR, &fi->flags); if (ri->i_inline & F2FS_INLINE_DATA) - set_inode_flag(fi, FI_INLINE_DATA); + set_bit(FI_INLINE_DATA, &fi->flags); if (ri->i_inline & F2FS_INLINE_DENTRY) - set_inode_flag(fi, FI_INLINE_DENTRY); + set_bit(FI_INLINE_DENTRY, &fi->flags); if (ri->i_inline & F2FS_DATA_EXIST) - set_inode_flag(fi, FI_DATA_EXIST); + set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) - set_inode_flag(fi, FI_INLINE_DOTS); + set_bit(FI_INLINE_DOTS, &fi->flags); } -static inline void set_raw_inline(struct f2fs_inode_info *fi, - struct f2fs_inode *ri) +static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) { ri->i_inline = 0; - if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + if (is_inode_flag_set(inode, FI_INLINE_XATTR)) ri->i_inline |= F2FS_INLINE_XATTR; - if (is_inode_flag_set(fi, FI_INLINE_DATA)) + if (is_inode_flag_set(inode, FI_INLINE_DATA)) ri->i_inline |= F2FS_INLINE_DATA; - if (is_inode_flag_set(fi, FI_INLINE_DENTRY)) + if (is_inode_flag_set(inode, FI_INLINE_DENTRY)) ri->i_inline |= F2FS_INLINE_DENTRY; - if (is_inode_flag_set(fi, FI_DATA_EXIST)) + if (is_inode_flag_set(inode, FI_DATA_EXIST)) ri->i_inline |= F2FS_DATA_EXIST; - if (is_inode_flag_set(fi, FI_INLINE_DOTS)) + if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; } static inline int f2fs_has_inline_xattr(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR); + return is_inode_flag_set(inode, FI_INLINE_XATTR); } -static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) +static inline unsigned int addrs_per_inode(struct inode *inode) { - if (f2fs_has_inline_xattr(&fi->vfs_inode)) + if (f2fs_has_inline_xattr(inode)) return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; return DEF_ADDRS_PER_INODE; } @@ -1502,43 +1798,43 @@ static inline int inline_xattr_size(struct inode *inode) static inline int f2fs_has_inline_data(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); + return is_inode_flag_set(inode, FI_INLINE_DATA); } static inline void f2fs_clear_inline_inode(struct inode *inode) { - 
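
f2fs_i_blocks_write()/f2fs_i_size_write() above raise FI_AUTO_RECOVER when the inode was clean, and f2fs_skip_inode_update() then lets fsync skip a full inode write if the on-disk size still matches, because roll-forward recovery can reconstruct the rest. The decision in isolation, with a hypothetical struct:

#include <stdbool.h>

struct demo_inode {
        bool auto_recover;              /* FI_AUTO_RECOVER */
        long long last_disk_size;
        long long i_size;
};

/* skippable only when recovery could rebuild the change anyway */
static bool skip_inode_update(const struct demo_inode *d)
{
        if (!d->auto_recover)
                return false;
        return d->last_disk_size == d->i_size;
}
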
clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - clear_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + clear_inode_flag(inode, FI_INLINE_DATA); + clear_inode_flag(inode, FI_DATA_EXIST); } static inline int f2fs_exist_data(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST); + return is_inode_flag_set(inode, FI_DATA_EXIST); } static inline int f2fs_has_inline_dots(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DOTS); + return is_inode_flag_set(inode, FI_INLINE_DOTS); } static inline bool f2fs_is_atomic_file(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE); + return is_inode_flag_set(inode, FI_ATOMIC_FILE); } static inline bool f2fs_is_volatile_file(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE); + return is_inode_flag_set(inode, FI_VOLATILE_FILE); } static inline bool f2fs_is_first_block_written(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); } static inline bool f2fs_is_drop_cache(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE); + return is_inode_flag_set(inode, FI_DROP_CACHE); } static inline void *inline_data_addr(struct page *page) @@ -1549,7 +1845,7 @@ static inline void *inline_data_addr(struct page *page) static inline int f2fs_has_inline_dentry(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY); + return is_inode_flag_set(inode, FI_INLINE_DENTRY); } static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page) @@ -1566,11 +1862,13 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise |= type; + f2fs_mark_inode_dirty_sync(inode); } static inline void clear_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise &= ~type; + f2fs_mark_inode_dirty_sync(inode); } static inline int f2fs_readonly(struct super_block *sb) @@ -1580,13 +1878,7 @@ static inline int f2fs_readonly(struct super_block *sb) static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) { - return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); -} - -static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) -{ - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; + return is_set_ckpt_flags(sbi, CP_ERROR_FLAG); } static inline bool is_dot_dotdot(const struct qstr *str) @@ -1602,13 +1894,21 @@ static inline bool is_dot_dotdot(const struct qstr *str) static inline bool f2fs_may_extent_tree(struct inode *inode) { - mode_t mode = inode->i_mode; - if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) || - is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + is_inode_flag_set(inode, FI_NO_EXTENT)) return false; - return S_ISREG(mode); + return S_ISREG(inode->i_mode); +} + +static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_KMALLOC)) + return NULL; +#endif + return kmalloc(size, flags); } static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) @@ -1632,14 +1932,14 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) } #define get_inode_mode(i) \ - ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ + ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) /* get offset of first page in next direct node */ -#define PGOFS_OF_NEXT_DNODE(pgofs, fi) \ - ((pgofs < ADDRS_PER_INODE(fi)) ? 
ADDRS_PER_INODE(fi) : \ - (pgofs - ADDRS_PER_INODE(fi) + ADDRS_PER_BLOCK) / \ - ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi)) +#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \ + ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \ + (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \ + ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode)) /* * file.c @@ -1647,7 +1947,7 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); int truncate_blocks(struct inode *, u64, bool); -int f2fs_truncate(struct inode *, bool); +int f2fs_truncate(struct inode *); int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); int truncate_hole(struct inode *, pgoff_t, pgoff_t); @@ -1660,9 +1960,10 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); */ void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); +struct inode *f2fs_iget_retry(struct super_block *, unsigned long); int try_to_free_nats(struct f2fs_sb_info *, int); -void update_inode(struct inode *, struct page *); -void update_inode_page(struct inode *); +int update_inode(struct inode *, struct page *); +int update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); void handle_failed_inode(struct inode *); @@ -1675,29 +1976,34 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; void set_de_type(struct f2fs_dir_entry *, umode_t); - -struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *, +unsigned char get_de_type(struct f2fs_dir_entry *); +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, - unsigned int, struct f2fs_str *); + unsigned int, struct fscrypt_str *); void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); struct page *init_inode_metadata(struct inode *, struct inode *, - const struct qstr *, struct page *); + const struct qstr *, const struct qstr *, struct page *); void update_parent_metadata(struct inode *, struct inode *, unsigned int); int room_for_filename(const void *, int, int); -void f2fs_drop_nlink(struct inode *, struct inode *, struct page *); -struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *, +void f2fs_drop_nlink(struct inode *, struct inode *); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *, + struct page **); +struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *, struct page **); struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); -ino_t f2fs_inode_by_name(struct inode *, struct qstr *); +ino_t f2fs_inode_by_name(struct inode *, const struct qstr *, struct page **); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); int update_dent_inode(struct inode *, struct inode *, const struct qstr *); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, const struct qstr *, f2fs_hash_t , unsigned int); +int f2fs_add_regular_entry(struct inode *, const struct qstr *, + const struct qstr *, struct inode *, nid_t, umode_t); +int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode 
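
PGOFS_OF_NEXT_DNODE() jumps a page offset to the first slot covered by the next direct node. With the geometry f2fs.h defines elsewhere (923 direct pointers in the inode, 1018 per dnode block), an offset inside the first dnode lands at 923 + 1018 = 1941:

#include <stdio.h>

#define ADDRS_PER_INODE 923     /* direct pointers in the inode */
#define ADDRS_PER_BLOCK 1018    /* direct pointers per dnode block */

static unsigned long next_dnode_pgofs(unsigned long pgofs)
{
        if (pgofs < ADDRS_PER_INODE)
                return ADDRS_PER_INODE;
        return (pgofs - ADDRS_PER_INODE + ADDRS_PER_BLOCK) /
                ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE;
}

int main(void)
{
        printf("%lu\n", next_dnode_pgofs(1000));        /* 1941 */
        return 0;
}
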
*, + nid_t, umode_t); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, @@ -1714,16 +2020,18 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ +int f2fs_inode_dirtied(struct inode *); +void f2fs_inode_synced(struct inode *); int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); extern __printf(3, 4) void f2fs_msg(struct super_block *, const char *, const char *, ...); +int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, - struct f2fs_filename *fname); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *); /* * node.c @@ -1736,6 +2044,7 @@ int need_dentry_mark(struct f2fs_sb_info *, nid_t); bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); bool need_inode_block_update(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); +pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int truncate_xattr_node(struct inode *, struct page *); @@ -1746,8 +2055,11 @@ struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); struct page *get_node_page_ra(struct page *, int); -void sync_inode_page(struct dnode_of_data *); -int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); +void move_node_page(struct page *, int); +int fsync_node_pages(struct f2fs_sb_info *, struct inode *, + struct writeback_control *, bool); +int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); +void build_free_nids(struct f2fs_sb_info *); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); @@ -1767,8 +2079,9 @@ void destroy_node_manager_caches(void); * segment.c */ void register_inmem_page(struct inode *, struct page *); -int commit_inmem_pages(struct inode *, bool); -void f2fs_balance_fs(struct f2fs_sb_info *); +void drop_inmem_pages(struct inode *); +int commit_inmem_pages(struct inode *); +void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); @@ -1778,7 +2091,6 @@ bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); -bool discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); @@ -1788,16 +2100,17 @@ void write_meta_page(struct f2fs_sb_info *, struct page *); void write_node_page(unsigned int, struct f2fs_io_info *); void write_data_page(struct dnode_of_data *, struct f2fs_io_info *); void rewrite_data_page(struct f2fs_io_info *); +void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *, + block_t, block_t, bool, bool); void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, - block_t, 
block_t, unsigned char, bool); + block_t, block_t, unsigned char, bool, bool); void allocate_data_block(struct f2fs_sb_info *, struct page *, block_t, block_t *, struct f2fs_summary *, int); -void f2fs_wait_on_page_writeback(struct page *, enum page_type); +void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); void write_data_summaries(struct f2fs_sb_info *, block_t); void write_node_summaries(struct f2fs_sb_info *, block_t); -int lookup_journal_in_cursum(struct f2fs_summary_block *, - int, unsigned int, int); +int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int); void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); int build_segment_manager(struct f2fs_sb_info *); void destroy_segment_manager(struct f2fs_sb_info *); @@ -1807,6 +2120,7 @@ void destroy_segment_manager_caches(void); /* * checkpoint.c */ +void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool); struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); @@ -1814,21 +2128,21 @@ bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void release_dirty_inode(struct f2fs_sb_info *); +void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void release_ino_entry(struct f2fs_sb_info *, bool); bool exist_written_data(struct f2fs_sb_info *, nid_t, int); +int f2fs_sync_inode_meta(struct f2fs_sb_info *); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); -void add_orphan_inode(struct f2fs_sb_info *, nid_t); +void add_orphan_inode(struct inode *); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void update_dirty_page(struct inode *, struct page *); -void add_dirty_dir_inode(struct inode *); -void remove_dirty_dir_inode(struct inode *); -void sync_dirty_dir_inodes(struct f2fs_sb_info *); -void write_checkpoint(struct f2fs_sb_info *, struct cp_control *); +void remove_dirty_inode(struct inode *); +int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); +int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); void init_ino_entry_info(struct f2fs_sb_info *); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); @@ -1837,34 +2151,46 @@ void destroy_checkpoint_caches(void); * data.c */ void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); +void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, + struct page *, nid_t, enum page_type, int); +void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_io_info *); void set_data_blkaddr(struct dnode_of_data *); +void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); +int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); +ssize_t 
f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); struct page *get_lock_data_page(struct inode *, pgoff_t, bool); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct f2fs_io_info *); +int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); +void f2fs_set_page_dirty_nobuffers(struct page *); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); +#ifdef CONFIG_MIGRATION +int f2fs_migrate_page(struct address_space *, struct page *, struct page *, + enum migrate_mode); +#endif /* * gc.c */ int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); -block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *); +block_t start_bidx_of_node(unsigned int, struct inode *); int f2fs_gc(struct f2fs_sb_info *, bool); void build_gc_manager(struct f2fs_sb_info *); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *, bool); bool space_for_roll_forward(struct f2fs_sb_info *); /* @@ -1878,18 +2204,20 @@ struct f2fs_stat_info { int main_area_segs, main_area_sections, main_area_zones; unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; - int ext_tree, ext_node; - int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; + int ext_tree, zombie_tree, ext_node; + s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + s64 inmem_pages; + unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; - int bg_gc, inmem_pages, wb_pages; - int inline_xattr, inline_inode, inline_dir; - unsigned int valid_count, valid_node_count, valid_inode_count; + int bg_gc, wb_bios; + int inline_xattr, inline_inode, inline_dir, orphans; + unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages; - int prefree_count, call_count, cp_count; + int prefree_count, call_count, cp_count, bg_cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; @@ -1910,10 +2238,11 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } #define stat_inc_cp_count(si) ((si)->cp_count++) +#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) -#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) -#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) +#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) +#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) #define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) #define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) @@ -1988,14 +2317,15 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) int f2fs_build_stats(struct f2fs_sb_info *); void f2fs_destroy_stats(struct 
f2fs_sb_info *); -void __init f2fs_create_root_stats(void); +int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else #define stat_inc_cp_count(si) +#define stat_inc_bg_cp_count(si) #define stat_inc_call_count(si) #define stat_inc_bggc_count(si) -#define stat_inc_dirty_dir(sbi) -#define stat_dec_dirty_dir(sbi) +#define stat_inc_dirty_inode(sbi, type) +#define stat_dec_dirty_inode(sbi, type) #define stat_inc_total_hit(sb) #define stat_inc_rbtree_node_hit(sb) #define stat_inc_largest_node_hit(sbi) @@ -2016,7 +2346,7 @@ void f2fs_destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline void __init f2fs_create_root_stats(void) { } +static inline int __init f2fs_create_root_stats(void) { return 0; } static inline void f2fs_destroy_root_stats(void) { } #endif @@ -2045,16 +2375,15 @@ int f2fs_convert_inline_inode(struct inode *); int f2fs_write_inline_data(struct inode *, struct page *); bool recover_inline_data(struct inode *, struct page *); struct f2fs_dir_entry *find_in_inline_dir(struct inode *, - struct f2fs_filename *, struct page **); -struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **); + struct fscrypt_name *, struct page **); int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); -int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *, - nid_t, umode_t); +int f2fs_add_inline_entry(struct inode *, const struct qstr *, + const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, struct inode *, struct inode *); bool f2fs_empty_inline_dir(struct inode *); int f2fs_read_inline_dir(struct file *, struct dir_context *, - struct f2fs_str *); + struct fscrypt_str *); int f2fs_inline_data_fiemap(struct inode *, struct fiemap_extent_info *, __u64, __u64); @@ -2070,8 +2399,8 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *); * extent_cache.c */ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -void f2fs_drop_largest_extent(struct inode *, pgoff_t); -void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +void f2fs_drop_extent_tree(struct inode *); unsigned int f2fs_destroy_extent_node(struct inode *); void f2fs_destroy_extent_tree(struct inode *); bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); @@ -2085,13 +2414,9 @@ void destroy_extent_cache(void); /* * crypto support */ -static inline int f2fs_encrypted_inode(struct inode *inode) +static inline bool f2fs_encrypted_inode(struct inode *inode) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION return file_is_encrypt(inode); -#else - return 0; -#endif } static inline void f2fs_set_encrypted_inode(struct inode *inode) @@ -2103,26 +2428,38 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) static inline bool f2fs_bio_encrypted(struct bio *bio) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION - return unlikely(bio->bi_private != NULL); -#else - return false; -#endif + return bio->bi_private != NULL; } static inline int f2fs_sb_has_crypto(struct super_block *sb) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); -#else - return 0; -#endif +} + +static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR); +} + +static inline void set_opt_mode(struct f2fs_sb_info 
*sbi, unsigned int mt) +{ + clear_opt(sbi, ADAPTIVE); + clear_opt(sbi, LFS); + + switch (mt) { + case F2FS_MOUNT_ADAPTIVE: + set_opt(sbi, ADAPTIVE); + break; + case F2FS_MOUNT_LFS: + set_opt(sbi, LFS); + break; + } } static inline bool f2fs_may_encrypt(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); #else @@ -2130,74 +2467,28 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } -/* crypto_policy.c */ -int f2fs_is_child_context_consistent_with_parent(struct inode *, - struct inode *); -int f2fs_inherit_context(struct inode *, struct inode *, struct page *); -int f2fs_process_policy(const struct f2fs_encryption_policy *, struct inode *); -int f2fs_get_policy(struct inode *, struct f2fs_encryption_policy *); - -/* crypt.c */ -extern struct kmem_cache *f2fs_crypt_info_cachep; -bool f2fs_valid_contents_enc_mode(uint32_t); -uint32_t f2fs_validate_encryption_key_size(uint32_t, uint32_t); -struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *); -void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *); -struct page *f2fs_encrypt(struct inode *, struct page *); -int f2fs_decrypt(struct f2fs_crypto_ctx *, struct page *); -int f2fs_decrypt_one(struct inode *, struct page *); -void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *, struct bio *); - -/* crypto_key.c */ -void f2fs_free_encryption_info(struct inode *, struct f2fs_crypt_info *); - -/* crypto_fname.c */ -bool f2fs_valid_filenames_enc_mode(uint32_t); -u32 f2fs_fname_crypto_round_up(u32, u32); -int f2fs_fname_crypto_alloc_buffer(struct inode *, u32, struct f2fs_str *); -int f2fs_fname_disk_to_usr(struct inode *, f2fs_hash_t *, - const struct f2fs_str *, struct f2fs_str *); -int f2fs_fname_usr_to_disk(struct inode *, const struct qstr *, - struct f2fs_str *); - -#ifdef CONFIG_F2FS_FS_ENCRYPTION -void f2fs_restore_and_release_control_page(struct page **); -void f2fs_restore_control_page(struct page *); - -int __init f2fs_init_crypto(void); -int f2fs_crypto_initialize(void); -void f2fs_exit_crypto(void); - -int f2fs_has_encryption_key(struct inode *); - -int f2fs_get_encryption_info(struct inode *inode); - -void f2fs_fname_crypto_free_buffer(struct f2fs_str *); -int f2fs_fname_setup_filename(struct inode *, const struct qstr *, - int lookup, struct f2fs_filename *); -void f2fs_fname_free_filename(struct f2fs_filename *); -#else -static inline void f2fs_restore_and_release_control_page(struct page **p) { } -static inline void f2fs_restore_control_page(struct page *p) { } - -static inline int __init f2fs_init_crypto(void) { return 0; } -static inline void f2fs_exit_crypto(void) { } - -static inline int f2fs_has_encryption_key(struct inode *i) { return 0; } -static inline int f2fs_get_encryption_info(struct inode *i) { return 0; } -static inline void f2fs_fname_crypto_free_buffer(struct f2fs_str *p) { } - -static inline int f2fs_fname_setup_filename(struct inode *dir, - const struct qstr *iname, - int lookup, struct f2fs_filename *fname) -{ - memset(fname, 0, sizeof(struct f2fs_filename)); - fname->usr_fname = iname; - fname->disk_name.name = (unsigned char *)iname->name; - fname->disk_name.len = iname->len; - return 0; -} - -static inline void f2fs_fname_free_filename(struct f2fs_filename *fname) { } +#ifndef CONFIG_F2FS_FS_ENCRYPTION +#define fscrypt_set_d_op(i) +#define fscrypt_get_ctx fscrypt_notsupp_get_ctx +#define fscrypt_release_ctx fscrypt_notsupp_release_ctx +#define fscrypt_encrypt_page 
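
set_opt_mode() clears both mode bits before setting one, keeping ADAPTIVE and LFS mutually exclusive. A hypothetical call site, assuming the f2fs.h context above plus <linux/string.h> and <linux/errno.h> (f2fs's real parse_options() matches tokens rather than raw strings):

static int demo_parse_mode(struct f2fs_sb_info *sbi, const char *arg)
{
        if (!strcmp(arg, "adaptive"))
                set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
        else if (!strcmp(arg, "lfs"))
                set_opt_mode(sbi, F2FS_MOUNT_LFS);
        else
                return -EINVAL;         /* unknown mode= value */
        return 0;
}
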
fscrypt_notsupp_encrypt_page +#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page +#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages +#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page +#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page +#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range +#define fscrypt_process_policy fscrypt_notsupp_process_policy +#define fscrypt_get_policy fscrypt_notsupp_get_policy +#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context +#define fscrypt_inherit_context fscrypt_notsupp_inherit_context +#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info +#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info +#define fscrypt_setup_filename fscrypt_notsupp_setup_filename +#define fscrypt_free_filename fscrypt_notsupp_free_filename +#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size +#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer +#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer +#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr +#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk #endif #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4b449d263333..c6e33258fabf 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include "f2fs.h" #include "node.h" @@ -40,8 +42,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct dnode_of_data dn; int err; - f2fs_balance_fs(sbi); - sb_start_pagefault(inode->i_sb); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); @@ -57,6 +57,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, dn.node_changed); + file_update_time(vma->vm_file); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || @@ -74,19 +76,20 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, goto mapped; /* page is wholly or partially inside EOF */ - if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) > + if (((loff_t)(page->index + 1) << PAGE_SHIFT) > i_size_read(inode)) { unsigned offset; - offset = i_size_read(inode) & ~PAGE_CACHE_MASK; - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + offset = i_size_read(inode) & ~PAGE_MASK; + zero_user_segment(page, offset, PAGE_SIZE); } set_page_dirty(page); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) @@ -96,6 +99,7 @@ mapped: clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); + f2fs_update_time(sbi, REQ_TIME); return block_page_mkwrite_return(err); } @@ -132,7 +136,7 @@ static inline bool need_do_checkpoint(struct inode *inode) if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) need_cp = true; - else if (file_enc_name(inode) && need_dentry_mark(sbi, inode->i_ino)) + else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) need_cp = true; else if (file_wrong_pino(inode)) need_cp = true; @@ -170,21 +174,16 @@ static void try_to_fix_pino(struct inode *inode) fi->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { - fi->i_pino = pino; + f2fs_i_pino_write(inode, pino); file_got_pino(inode); - 
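
The block above maps every fscrypt_* entry point to its *_notsupp stub when encryption is compiled out, so call sites build unchanged either way. The pattern reduced to one hypothetical pair:

#include <errno.h>

struct inode;

#ifdef CONFIG_DEMO_CRYPTO
int demo_get_key(struct inode *inode);          /* real implementation */
#else
static inline int demo_get_key_notsupp(struct inode *inode)
{
        return -EOPNOTSUPP;                     /* feature compiled out */
}
#define demo_get_key demo_get_key_notsupp
#endif
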
up_write(&fi->i_sem); - - mark_inode_dirty_sync(inode); - f2fs_write_inode(inode, NULL); - } else { - up_write(&fi->i_sem); } + up_write(&fi->i_sem); } -int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, + int datasync, bool atomic) { struct inode *inode = file->f_mapping->host; - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t ino = inode->i_ino; int ret = 0; @@ -201,10 +200,10 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_f2fs_sync_file_enter(inode); /* if fdatasync is triggered, let's do in-place-update */ - if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) - set_inode_flag(fi, FI_NEED_IPU); + if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) + set_inode_flag(inode, FI_NEED_IPU); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - clear_inode_flag(fi, FI_NEED_IPU); + clear_inode_flag(inode, FI_NEED_IPU); if (ret) { trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); @@ -212,7 +211,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } /* if the inode is dirty, let's recover all the time */ - if (!datasync) { + if (!datasync && !f2fs_skip_inode_update(inode)) { f2fs_write_inode(inode, NULL); goto go_write; } @@ -220,29 +219,26 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* * if there is no written data, don't waste time to write recovery info. */ - if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && + if (!is_inode_flag_set(inode, FI_APPEND_WRITE) && !exist_written_data(sbi, ino, APPEND_INO)) { /* it may call write_inode just prior to fsync */ if (need_inode_page_update(sbi, ino)) goto go_write; - if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || + if (is_inode_flag_set(inode, FI_UPDATE_WRITE) || exist_written_data(sbi, ino, UPDATE_INO)) goto flush_out; goto out; } go_write: - /* guarantee free sections for fsync */ - f2fs_balance_fs(sbi); - /* * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. */ - down_read(&fi->i_sem); + down_read(&F2FS_I(inode)->i_sem); need_cp = need_do_checkpoint(inode); - up_read(&fi->i_sem); + up_read(&F2FS_I(inode)->i_sem); if (need_cp) { /* all the dirty node pages should be flushed for POR */ @@ -253,19 +249,23 @@ go_write: * will be used only for fsynced inodes after checkpoint. 
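
With the change above, fdatasync() (or a small dirty set) forces in-place updates via FI_NEED_IPU, so the data flush does not allocate new blocks that would in turn require extra node writes. From userspace the distinction is just the syscall choice:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("demo.dat", O_WRONLY | O_CREAT, 0644);

        if (fd < 0)
                return 1;
        if (write(fd, "x", 1) != 1)
                return 1;
        /* data-only sync: the filesystem may rewrite in place and
         * skip flushing metadata that recovery can reconstruct */
        fdatasync(fd);
        close(fd);
        return 0;
}
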
*/ try_to_fix_pino(inode); - clear_inode_flag(fi, FI_APPEND_WRITE); - clear_inode_flag(fi, FI_UPDATE_WRITE); + clear_inode_flag(inode, FI_APPEND_WRITE); + clear_inode_flag(inode, FI_UPDATE_WRITE); goto out; } sync_nodes: - sync_node_pages(sbi, ino, &wbc); - - /* if cp_error was enabled, we should avoid infinite loop */ - if (unlikely(f2fs_cp_error(sbi))) + ret = fsync_node_pages(sbi, inode, &wbc, atomic); + if (ret) goto out; + /* if cp_error was enabled, we should avoid infinite loop */ + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; + goto out; + } + if (need_inode_block_update(sbi, ino)) { - mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode); f2fs_write_inode(inode, NULL); goto sync_nodes; } @@ -275,18 +275,24 @@ sync_nodes: goto out; /* once recovery info is written, don't need to tack this */ - remove_dirty_inode(sbi, ino, APPEND_INO); - clear_inode_flag(fi, FI_APPEND_WRITE); + remove_ino_entry(sbi, ino, APPEND_INO); + clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - remove_dirty_inode(sbi, ino, UPDATE_INO); - clear_inode_flag(fi, FI_UPDATE_WRITE); + remove_ino_entry(sbi, ino, UPDATE_INO); + clear_inode_flag(inode, FI_UPDATE_WRITE); ret = f2fs_issue_flush(sbi); + f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); f2fs_trace_ios(NULL, 1); return ret; } +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + return f2fs_do_sync_file(file, start, end, datasync, false); +} + static pgoff_t __get_first_dirty_index(struct address_space *mapping, pgoff_t pgofs, int whence) { @@ -300,7 +306,7 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping, pagevec_init(&pvec, 0); nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1); - pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX; + pgofs = nr_pages ? 
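
The sync_nodes path above loops because flushing node pages can redirty the inode block; it rewrites the inode and flushes again until nothing is pending. The retry-until-stable shape as a self-contained sketch (demo state hypothetical):

/* each flush may redirty the inode block once more; loop until
 * a flush completes with nothing left pending */
struct demo_inode {
        int pending;            /* pending inode-block updates */
};

static int flush_nodes(struct demo_inode *di)
{
        if (di->pending > 0)
                di->pending--;
        return 0;
}

static int sync_until_stable(struct demo_inode *di)
{
        int err;

        do {
                err = flush_nodes(di);
                if (err)
                        return err;
        } while (di->pending > 0);
        return 0;
}
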
pvec.pages[0]->index : ULONG_MAX; pagevec_release(&pvec); return pgofs; } @@ -332,7 +338,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) loff_t isize; int err = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) @@ -345,32 +351,31 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) goto found; } - pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT); + pgofs = (pgoff_t)(offset >> PAGE_SHIFT); dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); - for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { + for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); if (err && err != -ENOENT) { goto fail; } else if (err == -ENOENT) { /* direct node does not exists */ if (whence == SEEK_DATA) { - pgofs = PGOFS_OF_NEXT_DNODE(pgofs, - F2FS_I(inode)); + pgofs = get_next_page_offset(&dn, pgofs); continue; } else { goto found; } } - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; dn.ofs_in_node++, pgofs++, - data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { + data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); @@ -387,10 +392,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) found: if (whence == SEEK_HOLE && data_ofs > isize) data_ofs = isize; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return vfs_setpos(file, data_ofs, maxbytes); fail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -418,19 +423,20 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); + int err; if (f2fs_encrypted_inode(inode)) { - int err = f2fs_get_encryption_info(inode); + err = fscrypt_get_encryption_info(inode); if (err) return 0; + if (!f2fs_encrypted_inode(inode)) + return -ENOKEY; } /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - int err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } + err = f2fs_convert_inline_inode(inode); + if (err) + return err; file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; @@ -440,12 +446,22 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) static int f2fs_file_open(struct inode *inode, struct file *filp) { int ret = generic_file_open(inode, filp); + struct dentry *dir; if (!ret && f2fs_encrypted_inode(inode)) { - ret = f2fs_get_encryption_info(inode); + ret = fscrypt_get_encryption_info(inode); if (ret) - ret = -EACCES; + return -EACCES; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; } + dir = dget_parent(file_dentry(filp)); + if (f2fs_encrypted_inode(d_inode(dir)) && + !fscrypt_has_permitted_context(d_inode(dir), inode)) { + dput(dir); + return -EPERM; + } + dput(dir); return ret; } @@ -468,8 +484,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) set_data_blkaddr(dn); invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) - clear_inode_flag(F2FS_I(dn->inode), - FI_FIRST_BLOCK_WRITTEN); + clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); nr_free++; } @@ -480,14 
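
f2fs_seek_block() walks dnode blocks to answer SEEK_DATA/SEEK_HOLE, and the ULONG_MAX fix above keeps the no-dirty-page sentinel from truncating on 64-bit. The userspace view of the same operation:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("sparse.dat", O_RDONLY);
        off_t data, hole;

        if (fd < 0)
                return 1;
        data = lseek(fd, 0, SEEK_DATA);         /* first data extent */
        hole = lseek(fd, 0, SEEK_HOLE);         /* first hole (or EOF) */
        printf("data at %lld, hole at %lld\n",
               (long long)data, (long long)hole);
        close(fd);
        return 0;
}
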
+495,13 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) * we will invalidate all blkaddr in the whole range. */ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), - F2FS_I(dn->inode)) + ofs; + dn->inode) + ofs; f2fs_update_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); - set_page_dirty(dn->node_page); - sync_inode_page(dn); } dn->ofs_in_node = ofs; + f2fs_update_time(sbi, REQ_TIME); trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, dn->ofs_in_node, nr_free); return nr_free; @@ -501,8 +515,8 @@ void truncate_data_blocks(struct dnode_of_data *dn) static int truncate_partial_data_page(struct inode *inode, u64 from, bool cache_only) { - unsigned offset = from & (PAGE_CACHE_SIZE - 1); - pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_SIZE - 1); + pgoff_t index = from >> PAGE_SHIFT; struct address_space *mapping = inode->i_mapping; struct page *page; @@ -510,7 +524,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; if (cache_only) { - page = f2fs_grab_cache_page(mapping, index, false); + page = find_lock_page(mapping, index); if (page && PageUptodate(page)) goto truncate_out; f2fs_put_page(page, 1); @@ -521,9 +535,10 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, if (IS_ERR(page)) return 0; truncate_out: - f2fs_wait_on_page_writeback(page, DATA); - zero_user(page, offset, PAGE_CACHE_SIZE - offset); - if (!cache_only || !f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) + f2fs_wait_on_page_writeback(page, DATA, true); + zero_user(page, offset, PAGE_SIZE - offset); + if (!cache_only || !f2fs_encrypted_inode(inode) || + !S_ISREG(inode->i_mode)) set_page_dirty(page); f2fs_put_page(page, 1); return 0; @@ -543,6 +558,9 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1); + if (free_from >= sbi->max_file_blocks) + goto free_partial; + if (lock) f2fs_lock_op(sbi); @@ -561,14 +579,14 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } set_new_dnode(&dn, inode, ipage, NULL, 0); - err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); + err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); if (err) { if (err == -ENOENT) goto free_next; goto out; } - count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + count = ADDRS_PER_PAGE(dn.node_page, inode); count -= dn.ofs_in_node; f2fs_bug_on(sbi, count < 0); @@ -584,7 +602,7 @@ free_next: out: if (lock) f2fs_unlock_op(sbi); - +free_partial: /* lastly zero out the first data page */ if (!err) err = truncate_partial_data_page(inode, from, truncate_page); @@ -593,7 +611,7 @@ out: return err; } -int f2fs_truncate(struct inode *inode, bool lock) +int f2fs_truncate(struct inode *inode) { int err; @@ -604,18 +622,18 @@ int f2fs_truncate(struct inode *inode, bool lock) trace_f2fs_truncate(inode); /* we should check inline_data size */ - if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) { + if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; } - err = truncate_blocks(inode, i_size_read(inode), lock); + err = truncate_blocks(inode, i_size_read(inode), true); if (err) return err; inode->i_mtime = inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -631,7 +649,6 @@ int f2fs_getattr(struct vfsmount *mnt, #ifdef CONFIG_F2FS_FS_POSIX_ACL static void __setattr_copy(struct inode *inode, const struct iattr *attr) { 
- struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int ia_valid = attr->ia_valid; if (ia_valid & ATTR_UID) @@ -652,7 +669,7 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; - set_acl_inode(fi, mode); + set_acl_inode(inode, mode); } } #else @@ -662,7 +679,6 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); - struct f2fs_inode_info *fi = F2FS_I(inode); int err; err = inode_change_ok(inode, attr); @@ -671,21 +687,28 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_SIZE) { if (f2fs_encrypted_inode(inode) && - f2fs_get_encryption_info(inode)) + fscrypt_get_encryption_info(inode)) return -EACCES; if (attr->ia_size <= i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); - err = f2fs_truncate(inode, true); + err = f2fs_truncate(inode); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode)); + f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is * larger than i_size. */ truncate_setsize(inode, attr->ia_size); + + /* should convert inline inode here */ + if (!f2fs_may_inline_data(inode)) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } inode->i_mtime = inode->i_ctime = CURRENT_TIME; } } @@ -694,13 +717,13 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) { err = posix_acl_chmod(inode, get_inode_mode(inode)); - if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { - inode->i_mode = fi->i_acl_mode; - clear_inode_flag(fi, FI_ACL_MODE); + if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + clear_inode_flag(inode, FI_ACL_MODE); } } - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return err; } @@ -727,7 +750,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, if (!len) return 0; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); page = get_new_data_page(inode, NULL, index, false); @@ -736,7 +759,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, if (IS_ERR(page)) return PTR_ERR(page); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, start, len); set_page_dirty(page); f2fs_put_page(page, 1); @@ -761,7 +784,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) return err; } - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); @@ -778,19 +801,17 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t pg_start, pg_end; loff_t off_start, off_end; - int ret = 0; + int ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + off_start = offset & (PAGE_SIZE 
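
punch_hole() above zeroes the partial edge pages with fill_zero() and drops only whole pages in between; it services the standard fallocate request. A minimal caller:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <unistd.h>

int main(void)
{
        int fd = open("demo.dat", O_WRONLY);

        if (fd < 0)
                return 1;
        /* drop 64KiB starting at 4KiB; any partially covered edge
         * page is zeroed rather than freed */
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      4096, 65536))
                return 1;
        close(fd);
        return 0;
}
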
- 1); + off_end = (offset + len) & (PAGE_SIZE - 1); if (pg_start == pg_end) { ret = fill_zero(inode, pg_start, off_start, @@ -800,7 +821,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) } else { if (off_start) { ret = fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); + PAGE_SIZE - off_start); if (ret) return ret; } @@ -815,10 +836,10 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); - blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT; - blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT; + blk_start = (loff_t)pg_start << PAGE_SHIFT; + blk_end = (loff_t)pg_end << PAGE_SHIFT; truncate_inode_pages_range(mapping, blk_start, blk_end - 1); @@ -831,83 +852,199 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) return ret; } -static int __exchange_data_block(struct inode *inode, pgoff_t src, - pgoff_t dst, bool full) +static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, pgoff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; - block_t new_addr; - bool do_replace = false; - int ret; + int ret, done, i; +next_dnode: set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, src, LOOKUP_NODE_RA); + ret = get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); if (ret && ret != -ENOENT) { return ret; } else if (ret == -ENOENT) { - new_addr = NULL_ADDR; - } else { - new_addr = dn.data_blkaddr; - if (!is_checkpointed_data(sbi, new_addr)) { - dn.data_blkaddr = NULL_ADDR; + if (dn.max_level == 0) + return -ENOENT; + done = min((pgoff_t)ADDRS_PER_BLOCK - dn.ofs_in_node, len); + blkaddr += done; + do_replace += done; + goto next; + } + + done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - + dn.ofs_in_node, len); + for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { + *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (!is_checkpointed_data(sbi, *blkaddr)) { + + if (test_opt(sbi, LFS)) { + f2fs_put_dnode(&dn); + return -ENOTSUPP; + } + /* do not invalidate this block address */ - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); - do_replace = true; + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + *do_replace = 1; + } + } + f2fs_put_dnode(&dn); +next: + len -= done; + off += done; + if (len) + goto next_dnode; + return 0; +} + +static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, int len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + int ret, i; + + for (i = 0; i < len; i++, do_replace++, blkaddr++) { + if (*do_replace == 0) + continue; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA); + if (ret) { + dec_valid_block_count(sbi, inode, 1); + invalidate_blocks(sbi, *blkaddr); + } else { + f2fs_update_data_blkaddr(&dn, *blkaddr); } f2fs_put_dnode(&dn); } + return 0; +} - if (new_addr == NULL_ADDR) - return full ? 
truncate_hole(inode, dst, dst + 1) : 0; +static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, + block_t *blkaddr, int *do_replace, + pgoff_t src, pgoff_t dst, pgoff_t len, bool full) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(src_inode); + pgoff_t i = 0; + int ret; - if (do_replace) { - struct page *ipage = get_node_page(sbi, inode->i_ino); - struct node_info ni; - - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - goto err_out; + while (i < len) { + if (blkaddr[i] == NULL_ADDR && !full) { + i++; + continue; } - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, dst); - if (ret) - goto err_out; + if (do_replace[i] || blkaddr[i] == NULL_ADDR) { + struct dnode_of_data dn; + struct node_info ni; + size_t new_size; + pgoff_t ilen; - truncate_data_blocks_range(&dn, 1); + set_new_dnode(&dn, dst_inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, dst + i, ALLOC_NODE); + if (ret) + return ret; - get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, - ni.version, true); - f2fs_put_dnode(&dn); - } else { - struct page *psrc, *pdst; + get_node_info(sbi, dn.nid, &ni); + ilen = min((pgoff_t) + ADDRS_PER_PAGE(dn.node_page, dst_inode) - + dn.ofs_in_node, len - i); + do { + dn.data_blkaddr = datablock_addr(dn.node_page, + dn.ofs_in_node); + truncate_data_blocks_range(&dn, 1); - psrc = get_lock_data_page(inode, src, true); - if (IS_ERR(psrc)) - return PTR_ERR(psrc); - pdst = get_new_data_page(inode, NULL, dst, false); - if (IS_ERR(pdst)) { + if (do_replace[i]) { + f2fs_i_blocks_write(src_inode, + 1, false); + f2fs_i_blocks_write(dst_inode, + 1, true); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + blkaddr[i], ni.version, true, false); + + do_replace[i] = 0; + } + dn.ofs_in_node++; + i++; + new_size = (dst + i) << PAGE_SHIFT; + if (dst_inode->i_size < new_size) + f2fs_i_size_write(dst_inode, new_size); + } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen); + + f2fs_put_dnode(&dn); + } else { + struct page *psrc, *pdst; + + psrc = get_lock_data_page(src_inode, src + i, true); + if (IS_ERR(psrc)) + return PTR_ERR(psrc); + pdst = get_new_data_page(dst_inode, NULL, dst + i, + true); + if (IS_ERR(pdst)) { + f2fs_put_page(psrc, 1); + return PTR_ERR(pdst); + } + f2fs_copy_page(psrc, pdst); + set_page_dirty(pdst); + f2fs_put_page(pdst, 1); f2fs_put_page(psrc, 1); - return PTR_ERR(pdst); - } - f2fs_copy_page(psrc, pdst); - set_page_dirty(pdst); - f2fs_put_page(pdst, 1); - f2fs_put_page(psrc, 1); - return truncate_hole(inode, src, src + 1); + ret = truncate_hole(src_inode, src + i, src + i + 1); + if (ret) + return ret; + i++; + } + } + return 0; +} + +static int __exchange_data_block(struct inode *src_inode, + struct inode *dst_inode, pgoff_t src, pgoff_t dst, + pgoff_t len, bool full) +{ + block_t *src_blkaddr; + int *do_replace; + pgoff_t olen; + int ret; + + while (len) { + olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); + + src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + if (!src_blkaddr) + return -ENOMEM; + + do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + if (!do_replace) { + kvfree(src_blkaddr); + return -ENOMEM; + } + + ret = __read_out_blkaddrs(src_inode, src_blkaddr, + do_replace, src, olen); + if (ret) + goto roll_back; + + ret = __clone_blkaddrs(src_inode, dst_inode, src_blkaddr, + do_replace, src, dst, olen, full); + if (ret) + goto roll_back; + + src += olen; + dst += olen; + len -= olen; + + kvfree(src_blkaddr); + kvfree(do_replace); } return 0; -err_out: - if 
(!get_dnode_of_data(&dn, src, LOOKUP_NODE)) { - dn.data_blkaddr = new_addr; - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); - f2fs_put_dnode(&dn); - } +roll_back: + __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, len); + kvfree(src_blkaddr); + kvfree(do_replace); return ret; } @@ -915,16 +1052,15 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; - int ret = 0; + int ret; - for (; end < nrpages; start++, end++) { - f2fs_balance_fs(sbi); - f2fs_lock_op(sbi); - ret = __exchange_data_block(inode, end, start, true); - f2fs_unlock_op(sbi); - if (ret) - break; - } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + + f2fs_drop_extent_tree(inode); + + ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); + f2fs_unlock_op(sbi); return ret; } @@ -941,16 +1077,12 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(F2FS_I_SB(inode)); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } - - pg_start = offset >> PAGE_CACHE_SHIFT; - pg_end = (offset + len) >> PAGE_CACHE_SHIFT; + pg_start = offset >> PAGE_SHIFT; + pg_end = (offset + len) >> PAGE_SHIFT; /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -972,7 +1104,50 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = truncate_blocks(inode, new_size, true); if (!ret) - i_size_write(inode, new_size); + f2fs_i_size_write(inode, new_size); + + return ret; +} + +static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, + pgoff_t end) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + pgoff_t index = start; + unsigned int ofs_in_node = dn->ofs_in_node; + blkcnt_t count = 0; + int ret; + + for (; index < end; index++, dn->ofs_in_node++) { + if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + count++; + } + + dn->ofs_in_node = ofs_in_node; + ret = reserve_new_blocks(dn, count); + if (ret) + return ret; + + dn->ofs_in_node = ofs_in_node; + for (index = start; index < end; index++, dn->ofs_in_node++) { + dn->data_blkaddr = + datablock_addr(dn->node_page, dn->ofs_in_node); + /* + * reserve_new_blocks will not guarantee entire block + * allocation. 
+ */ + if (dn->data_blkaddr == NULL_ADDR) { + ret = -ENOSPC; + break; + } + if (dn->data_blkaddr != NEW_ADDR) { + invalidate_blocks(sbi, dn->data_blkaddr); + dn->data_blkaddr = NEW_ADDR; + set_data_blkaddr(dn); + } + } + + f2fs_update_extent_cache_range(dn, start, 0, index - start); return ret; } @@ -991,13 +1166,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - f2fs_balance_fs(sbi); - - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) @@ -1005,11 +1176,11 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, truncate_pagecache_range(inode, offset, offset + len - 1); - pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + off_start = offset & (PAGE_SIZE - 1); + off_end = (offset + len) & (PAGE_SIZE - 1); if (pg_start == pg_end) { ret = fill_zero(inode, pg_start, off_start, @@ -1023,48 +1194,40 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } else { if (off_start) { ret = fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); + PAGE_SIZE - off_start); if (ret) return ret; new_size = max_t(loff_t, new_size, - (loff_t)pg_start << PAGE_CACHE_SHIFT); + (loff_t)pg_start << PAGE_SHIFT); } - for (index = pg_start; index < pg_end; index++) { + for (index = pg_start; index < pg_end;) { struct dnode_of_data dn; - struct page *ipage; + unsigned int end_offset; + pgoff_t end; f2fs_lock_op(sbi); - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - f2fs_unlock_op(sbi); - goto out; - } - - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, index); + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); goto out; } - if (dn.data_blkaddr != NEW_ADDR) { - invalidate_blocks(sbi, dn.data_blkaddr); + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + end = min(pg_end, end_offset - dn.ofs_in_node + index); - dn.data_blkaddr = NEW_ADDR; - set_data_blkaddr(&dn); - - dn.data_blkaddr = NULL_ADDR; - f2fs_update_extent_cache(&dn); - } + ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + if (ret) + goto out; + index = end; new_size = max_t(loff_t, new_size, - (loff_t)(index + 1) << PAGE_CACHE_SHIFT); + (loff_t)index << PAGE_SHIFT); } if (off_end) { @@ -1077,11 +1240,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } out: - if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) { - i_size_write(inode, new_size); - mark_inode_dirty(inode); - update_inode_page(inode); - } + if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) + f2fs_i_size_write(inode, new_size); return ret; } @@ -1089,7 +1249,7 @@ out: static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - pgoff_t pg_start, pg_end, delta, nrpages, idx; + pgoff_t nr, pg_start, pg_end, delta, idx; loff_t new_size; int ret = 0; @@ -1104,13 +1264,11 @@ static int 
f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(sbi); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + f2fs_balance_fs(sbi, true); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) @@ -1123,17 +1281,23 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, offset); - pg_start = offset >> PAGE_CACHE_SHIFT; - pg_end = (offset + len) >> PAGE_CACHE_SHIFT; + pg_start = offset >> PAGE_SHIFT; + pg_end = (offset + len) >> PAGE_SHIFT; delta = pg_end - pg_start; - nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + + while (!ret && idx > pg_start) { + nr = idx - pg_start; + if (nr > delta) + nr = delta; + idx -= nr; - for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) { f2fs_lock_op(sbi); - ret = __exchange_data_block(inode, idx, idx + delta, false); + f2fs_drop_extent_tree(inode); + + ret = __exchange_data_block(inode, inode, idx, + idx + delta, nr, false); f2fs_unlock_op(sbi); - if (ret) - break; } /* write out all moved pages, if possible */ @@ -1141,7 +1305,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, offset); if (!ret) - i_size_write(inode, new_size); + f2fs_i_size_write(inode, new_size); return ret; } @@ -1149,60 +1313,48 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t len, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - pgoff_t index, pg_start, pg_end; + struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + pgoff_t pg_end; loff_t new_size = i_size_read(inode); - loff_t off_start, off_end; - int ret = 0; - - f2fs_balance_fs(sbi); + loff_t off_end; + int ret; ret = inode_newsize_ok(inode, (len + offset)); if (ret) return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + f2fs_balance_fs(sbi, true); + + pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT; + off_end = (offset + len) & (PAGE_SIZE - 1); + + map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT; + map.m_len = pg_end - map.m_lblk; + if (off_end) + map.m_len++; + + ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (ret) { + pgoff_t last_off; + + if (!map.m_len) return ret; + + last_off = map.m_lblk + map.m_len - 1; + + /* update new size to the failed position */ + new_size = (last_off == pg_end) ? 
offset + len:
+			(loff_t)(last_off + 1) << PAGE_SHIFT;
+	} else {
+		new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end;
 	}
 
-	pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
-	pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
-
-	off_start = offset & (PAGE_CACHE_SIZE - 1);
-	off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
-
-	f2fs_lock_op(sbi);
-
-	for (index = pg_start; index <= pg_end; index++) {
-		struct dnode_of_data dn;
-
-		if (index == pg_end && !off_end)
-			goto noalloc;
-
-		set_new_dnode(&dn, inode, NULL, NULL, 0);
-		ret = f2fs_reserve_block(&dn, index);
-		if (ret)
-			break;
-noalloc:
-		if (pg_start == pg_end)
-			new_size = offset + len;
-		else if (index == pg_start && off_start)
-			new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT;
-		else if (index == pg_end)
-			new_size = ((loff_t)index << PAGE_CACHE_SHIFT) +
-								off_end;
-		else
-			new_size += PAGE_CACHE_SIZE;
-	}
-
-	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-		i_size_read(inode) < new_size) {
-		i_size_write(inode, new_size);
-		mark_inode_dirty(inode);
-		update_inode_page(inode);
-	}
-	f2fs_unlock_op(sbi);
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)
+		f2fs_i_size_write(inode, new_size);
 
 	return ret;
 }
@@ -1226,7 +1378,7 @@ static long f2fs_fallocate(struct file *file, int mode,
 			FALLOC_FL_INSERT_RANGE))
 		return -EOPNOTSUPP;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
 		if (offset >= inode->i_size)
@@ -1245,11 +1397,12 @@ static long f2fs_fallocate(struct file *file, int mode,
 
 	if (!ret) {
 		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		mark_inode_dirty(inode);
+		f2fs_mark_inode_dirty_sync(inode);
+		f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 	}
 
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	trace_f2fs_fallocate(inode, mode, offset, len, ret);
 	return ret;
@@ -1257,13 +1410,22 @@ out:
 
 static int f2fs_release_file(struct inode *inode, struct file *filp)
 {
+	/*
+	 * f2fs_release_file is called on every close call, so we should
+	 * not drop any in-memory pages on a close issued by another
+	 * process.
+	 */
+	if (!(filp->f_mode & FMODE_WRITE) ||
+			atomic_read(&inode->i_writecount) != 1)
+		return 0;
+
 	/* some remained atomic pages should discarded */
 	if (f2fs_is_atomic_file(inode))
-		commit_inmem_pages(inode, true);
+		drop_inmem_pages(inode);
 	if (f2fs_is_volatile_file(inode)) {
-		set_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
+		clear_inode_flag(inode, FI_VOLATILE_FILE);
+		set_inode_flag(inode, FI_DROP_CACHE);
 		filemap_fdatawrite(inode->i_mapping);
-		clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
+		clear_inode_flag(inode, FI_DROP_CACHE);
 	}
 	return 0;
 }
@@ -1293,33 +1455,29 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
-	unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
+	unsigned int flags;
 	unsigned int oldflags;
 	int ret;
 
+	if (!inode_owner_or_capable(inode))
+		return -EACCES;
+
+	if (get_user(flags, (int __user *)arg))
+		return -EFAULT;
+
 	ret = mnt_want_write_file(filp);
 	if (ret)
 		return ret;
 
-	if (!inode_owner_or_capable(inode)) {
-		ret = -EACCES;
-		goto out;
-	}
-
-	if (get_user(flags, (int __user *)arg)) {
-		ret = -EFAULT;
-		goto out;
-	}
-
 	flags = f2fs_mask_flags(inode->i_mode, flags);
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	oldflags = fi->i_flags;
 
 	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
 		if (!capable(CAP_LINUX_IMMUTABLE)) {
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 			ret = -EPERM;
 			goto out;
 		}
@@ -1328,11 +1486,10 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
 	flags = flags & FS_FL_USER_MODIFIABLE;
 	flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
 	fi->i_flags = flags;
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
-	f2fs_set_inode_flags(inode);
 	inode->i_ctime = CURRENT_TIME;
-	mark_inode_dirty(inode);
+	f2fs_set_inode_flags(inode);
 out:
 	mnt_drop_write_file(filp);
 	return ret;
@@ -1353,17 +1510,35 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
-	f2fs_balance_fs(F2FS_I_SB(inode));
-
-	if (f2fs_is_atomic_file(inode))
-		return 0;
-
-	ret = f2fs_convert_inline_inode(inode);
+	ret = mnt_want_write_file(filp);
 	if (ret)
 		return ret;
 
-	set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
-	return 0;
+	inode_lock(inode);
+
+	if (f2fs_is_atomic_file(inode))
+		goto out;
+
+	ret = f2fs_convert_inline_inode(inode);
+	if (ret)
+		goto out;
+
+	set_inode_flag(inode, FI_ATOMIC_FILE);
+	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+
+	if (!get_dirty_pages(inode))
+		goto out;
+
+	f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
		"Unexpected flush for atomic writes: ino=%lu, npages=%lld",
+					inode->i_ino, get_dirty_pages(inode));
+	ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
+	if (ret)
+		clear_inode_flag(inode, FI_ATOMIC_FILE);
+out:
+	inode_unlock(inode);
+	mnt_drop_write_file(filp);
+	return ret;
 }
 
 static int f2fs_ioc_commit_atomic_write(struct file *filp)
@@ -1374,22 +1549,27 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
-	if (f2fs_is_volatile_file(inode))
-		return 0;
-
 	ret = mnt_want_write_file(filp);
 	if (ret)
 		return ret;
 
+	inode_lock(inode);
+
+	if (f2fs_is_volatile_file(inode))
+		goto err_out;
+
 	if (f2fs_is_atomic_file(inode)) {
-		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
-		ret = commit_inmem_pages(inode, false);
-		if (ret)
+		clear_inode_flag(inode, FI_ATOMIC_FILE);
+		ret = commit_inmem_pages(inode);
+		if (ret) {
+			set_inode_flag(inode, FI_ATOMIC_FILE);
 			goto err_out;
+
} } - ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); err_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } @@ -1402,31 +1582,54 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - if (f2fs_is_volatile_file(inode)) - return 0; - - ret = f2fs_convert_inline_inode(inode); + ret = mnt_want_write_file(filp); if (ret) return ret; - set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - return 0; + inode_lock(inode); + + if (f2fs_is_volatile_file(inode)) + goto out; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + set_inode_flag(inode, FI_VOLATILE_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_release_volatile_write(struct file *filp) { struct inode *inode = file_inode(filp); + int ret; if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + if (!f2fs_is_volatile_file(inode)) - return 0; + goto out; - if (!f2fs_is_first_block_written(inode)) - return truncate_partial_data_page(inode, 0, true); + if (!f2fs_is_first_block_written(inode)) { + ret = truncate_partial_data_page(inode, 0, true); + goto out; + } - return punch_hole(inode, 0, F2FS_BLKSIZE); + ret = punch_hole(inode, 0, F2FS_BLKSIZE); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_abort_volatile_write(struct file *filp) @@ -1441,13 +1644,19 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) if (ret) return ret; - f2fs_balance_fs(F2FS_I_SB(inode)); + inode_lock(inode); - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - commit_inmem_pages(inode, true); + if (f2fs_is_atomic_file(inode)) + drop_inmem_pages(inode); + if (f2fs_is_volatile_file(inode)) { + clear_inode_flag(inode, FI_VOLATILE_FILE); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + } + + inode_unlock(inode); mnt_drop_write_file(filp); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } @@ -1457,6 +1666,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct super_block *sb = sbi->sb; __u32 in; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1464,30 +1674,38 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (get_user(in, (__u32 __user *)arg)) return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + switch (in) { case F2FS_GOING_DOWN_FULLSYNC: sb = freeze_bdev(sb->s_bdev); if (sb && !IS_ERR(sb)) { - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); thaw_bdev(sb->s_bdev, sb); } break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ f2fs_sync_fs(sb, 1); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_NOSYNC: - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: sync_meta_pages(sbi, META, LONG_MAX); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; default: - return -EINVAL; + ret = -EINVAL; + goto out; } - return 0; + f2fs_update_time(sbi, REQ_TIME); +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) @@ -1508,15 +1726,21 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) sizeof(range))) 
return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + range.minlen = max((unsigned int)range.minlen, q->limits.discard_granularity); ret = f2fs_trim_fs(F2FS_SB(sb), &range); + mnt_drop_write_file(filp); if (ret < 0) return ret; if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return 0; } @@ -1532,45 +1756,31 @@ static bool uuid_is_nonzero(__u8 u[16]) static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION - struct f2fs_encryption_policy policy; + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - int err; - if (copy_from_user(&policy, (struct f2fs_encryption_policy __user *)arg, - sizeof(policy))) + if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, + sizeof(policy))) return -EFAULT; - mutex_lock(&inode->i_mutex); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - err = f2fs_process_policy(&policy, inode); - - mutex_unlock(&inode->i_mutex); - - return err; -#else - return -EOPNOTSUPP; -#endif + return fscrypt_process_policy(filp, &policy); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION - struct f2fs_encryption_policy policy; + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int err; - err = f2fs_get_policy(inode, &policy); + err = fscrypt_get_policy(inode, &policy); if (err) return err; - if (copy_to_user((struct f2fs_encryption_policy __user *)arg, &policy, - sizeof(policy))) + if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy))) return -EFAULT; return 0; -#else - return -EOPNOTSUPP; -#endif } static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) @@ -1593,13 +1803,13 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) generate_random_uuid(sbi->raw_super->encrypt_pw_salt); err = f2fs_commit_super(sbi, false); - - mnt_drop_write_file(filp); if (err) { /* undo new data */ memset(sbi->raw_super->encrypt_pw_salt, 0, 16); + mnt_drop_write_file(filp); return err; } + mnt_drop_write_file(filp); got_it: if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, 16)) @@ -1612,6 +1822,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); __u32 sync; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1622,21 +1833,30 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (!sync) { - if (!mutex_trylock(&sbi->gc_mutex)) - return -EBUSY; + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } } else { mutex_lock(&sbi->gc_mutex); } - return f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync); +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct cp_control cpc; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1644,13 +1864,343 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; - cpc.reason = __get_cp_reason(sbi); + ret = mnt_want_write_file(filp); + if (ret) + return ret; - mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); - 
mutex_unlock(&sbi->gc_mutex); + ret = f2fs_sync_fs(sbi->sb, 1); - return 0; + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_defragment_range(struct f2fs_sb_info *sbi, + struct file *filp, + struct f2fs_defragment *range) +{ + struct inode *inode = file_inode(filp); + struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + struct extent_info ei; + pgoff_t pg_start, pg_end; + unsigned int blk_per_seg = sbi->blocks_per_seg; + unsigned int total = 0, sec_num; + unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; + block_t blk_end = 0; + bool fragmented = false; + int err; + + /* if in-place-update policy is enabled, don't waste time here */ + if (need_inplace_update(inode)) + return -EINVAL; + + pg_start = range->start >> PAGE_SHIFT; + pg_end = (range->start + range->len) >> PAGE_SHIFT; + + f2fs_balance_fs(sbi, true); + + inode_lock(inode); + + /* writeback all dirty pages in the range */ + err = filemap_write_and_wait_range(inode->i_mapping, range->start, + range->start + range->len - 1); + if (err) + goto out; + + /* + * lookup mapping info in extent cache, skip defragmenting if physical + * block addresses are continuous. + */ + if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (ei.fofs + ei.len >= pg_end) + goto out; + } + + map.m_lblk = pg_start; + + /* + * lookup mapping info in dnode page cache, skip defragmenting if all + * physical block addresses are continuous even if there are hole(s) + * in logical blocks. + */ + while (map.m_lblk < pg_end) { + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } + + if (blk_end && blk_end != map.m_pblk) { + fragmented = true; + break; + } + blk_end = map.m_pblk + map.m_len; + + map.m_lblk += map.m_len; + } + + if (!fragmented) + goto out; + + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; + + sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + + /* + * make sure there are enough free section for LFS allocation, this can + * avoid defragment running in SSR mode when free section are allocated + * intensively + */ + if (has_not_enough_free_secs(sbi, 0, sec_num)) { + err = -EAGAIN; + goto out; + } + + while (map.m_lblk < pg_end) { + pgoff_t idx; + int cnt = 0; + +do_map: + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto clear_out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } + + set_inode_flag(inode, FI_DO_DEFRAG); + + idx = map.m_lblk; + while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { + struct page *page; + + page = get_lock_data_page(inode, idx, true); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto clear_out; + } + + set_page_dirty(page); + f2fs_put_page(page, 1); + + idx++; + cnt++; + total++; + } + + map.m_lblk = idx; + + if (idx < pg_end && cnt < blk_per_seg) + goto do_map; + + clear_inode_flag(inode, FI_DO_DEFRAG); + + err = filemap_fdatawrite(inode->i_mapping); + if (err) + goto out; + } +clear_out: + clear_inode_flag(inode, FI_DO_DEFRAG); +out: + inode_unlock(inode); + if (!err) + range->len = (u64)total << PAGE_SHIFT; + return err; +} + +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_defragment range; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + 
+ err = mnt_want_write_file(filp); + if (err) + return err; + + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; + goto out; + } + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) { + err = -EFAULT; + goto out; + } + + /* verify alignment of offset & size */ + if (range.start & (F2FS_BLKSIZE - 1) || + range.len & (F2FS_BLKSIZE - 1)) { + err = -EINVAL; + goto out; + } + + err = f2fs_defragment_range(sbi, filp, &range); + f2fs_update_time(sbi, REQ_TIME); + if (err < 0) + goto out; + + if (copy_to_user((struct f2fs_defragment __user *)arg, &range, + sizeof(range))) + err = -EFAULT; +out: + mnt_drop_write_file(filp); + return err; +} + +static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, size_t len) +{ + struct inode *src = file_inode(file_in); + struct inode *dst = file_inode(file_out); + struct f2fs_sb_info *sbi = F2FS_I_SB(src); + size_t olen = len, dst_max_i_size = 0; + size_t dst_osize; + int ret; + + if (file_in->f_path.mnt != file_out->f_path.mnt || + src->i_sb != dst->i_sb) + return -EXDEV; + + if (unlikely(f2fs_readonly(src->i_sb))) + return -EROFS; + + if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode)) + return -EINVAL; + + if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst)) + return -EOPNOTSUPP; + + if (src == dst) { + if (pos_in == pos_out) + return 0; + if (pos_out > pos_in && pos_out < pos_in + len) + return -EINVAL; + } + + inode_lock(src); + if (src != dst) { + if (!inode_trylock(dst)) { + ret = -EBUSY; + goto out; + } + } + + ret = -EINVAL; + if (pos_in + len > src->i_size || pos_in + len < pos_in) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - pos_in; + if (pos_in + len == src->i_size) + len = ALIGN(src->i_size, F2FS_BLKSIZE) - pos_in; + if (len == 0) { + ret = 0; + goto out_unlock; + } + + dst_osize = dst->i_size; + if (pos_out + olen > dst->i_size) + dst_max_i_size = pos_out + olen; + + /* verify the end result is block aligned */ + if (!IS_ALIGNED(pos_in, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_in + len, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_out, F2FS_BLKSIZE)) + goto out_unlock; + + ret = f2fs_convert_inline_inode(src); + if (ret) + goto out_unlock; + + ret = f2fs_convert_inline_inode(dst); + if (ret) + goto out_unlock; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(src->i_mapping, + pos_in, pos_in + len); + if (ret) + goto out_unlock; + + ret = filemap_write_and_wait_range(dst->i_mapping, + pos_out, pos_out + len); + if (ret) + goto out_unlock; + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, + pos_out >> F2FS_BLKSIZE_BITS, + len >> F2FS_BLKSIZE_BITS, false); + + if (!ret) { + if (dst_max_i_size) + f2fs_i_size_write(dst, dst_max_i_size); + else if (dst_osize != dst->i_size) + f2fs_i_size_write(dst, dst_osize); + } + f2fs_unlock_op(sbi); +out_unlock: + if (src != dst) + inode_unlock(dst); +out: + inode_unlock(src); + return ret; +} + +static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) +{ + struct f2fs_move_range range; + struct fd dst; + int err; + + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&range, (struct f2fs_move_range __user *)arg, + sizeof(range))) + return -EFAULT; + + dst = fdget(range.dst_fd); + if (!dst.file) + return -EBADF; + + if (!(dst.file->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto err_out; + } + + err = mnt_want_write_file(filp); + if (err) + goto 
err_out; + + err = f2fs_move_file_range(filp, range.pos_in, dst.file, + range.pos_out, range.len); + + mnt_drop_write_file(filp); + + if (copy_to_user((struct f2fs_move_range __user *)arg, + &range, sizeof(range))) + err = -EFAULT; +err_out: + fdput(dst); + return err; } long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) @@ -1686,6 +2236,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_gc(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: return f2fs_ioc_write_checkpoint(filp, arg); + case F2FS_IOC_DEFRAGMENT: + return f2fs_ioc_defragment(filp, arg); + case F2FS_IOC_MOVE_RANGE: + return f2fs_ioc_move_range(filp, arg); default: return -ENOTTY; } @@ -1693,14 +2247,36 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct inode *inode = file_inode(iocb->ki_filp); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct blk_plug plug; + ssize_t ret; if (f2fs_encrypted_inode(inode) && - !f2fs_has_encryption_key(inode) && - f2fs_get_encryption_info(inode)) + !fscrypt_has_encryption_key(inode) && + fscrypt_get_encryption_info(inode)) return -EACCES; - return generic_file_write_iter(iocb, from); + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret > 0) { + ret = f2fs_preallocate_blocks(iocb, from); + if (!ret) { + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + blk_finish_plug(&plug); + } + } + inode_unlock(inode); + + if (ret > 0) { + ssize_t err; + + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) + ret = err; + } + return ret; } #ifdef CONFIG_COMPAT @@ -1713,6 +2289,24 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC32_SETFLAGS: cmd = F2FS_IOC_SETFLAGS; break; + case F2FS_IOC32_GETVERSION: + cmd = F2FS_IOC_GETVERSION; + break; + case F2FS_IOC_START_ATOMIC_WRITE: + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + case F2FS_IOC_START_VOLATILE_WRITE: + case F2FS_IOC_RELEASE_VOLATILE_WRITE: + case F2FS_IOC_ABORT_VOLATILE_WRITE: + case F2FS_IOC_SHUTDOWN: + case F2FS_IOC_SET_ENCRYPTION_POLICY: + case F2FS_IOC_GET_ENCRYPTION_PWSALT: + case F2FS_IOC_GET_ENCRYPTION_POLICY: + case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_WRITE_CHECKPOINT: + case F2FS_IOC_DEFRAGMENT: + break; + case F2FS_IOC_MOVE_RANGE: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index fedbf67a0842..0a0a1ad1fe1f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "f2fs.h" #include "node.h" @@ -48,6 +47,11 @@ static int gc_thread_func(void *data) continue; } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_CHECKPOINT)) + f2fs_stop_checkpoint(sbi, false); +#endif + /* * [GC triggering condition] * 0. GC is not conducted currently. 
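
The hunk above wires the background GC thread into f2fs's fault-injection framework: when CONFIG_F2FS_FAULT_INJECTION is set and the FAULT_CHECKPOINT type is armed, the GC loop deliberately stops checkpointing so that error-handling paths can be exercised. time_to_inject() itself is not part of this diff; the fragment below is a simplified, self-contained sketch of that rate-based trigger pattern. The structure layout and names here are assumptions modeled on fs/f2fs/f2fs.h of this era, not a verbatim copy of the kernel code:

	/* Sketch of a rate-based fault trigger: fire once every
	 * inject_rate calls for a fault type that is enabled in the
	 * inject_type bitmask. */
	struct fault_info {
		unsigned int inject_ops;	/* calls since the last injected fault */
		unsigned int inject_rate;	/* inject once every N calls */
		unsigned int inject_type;	/* bitmask of enabled fault types */
	};

	static int time_to_inject_sketch(struct fault_info *ffi, unsigned int type)
	{
		if (!ffi->inject_rate || !(ffi->inject_type & (1U << type)))
			return 0;		/* injection disabled for this type */
		if (++ffi->inject_ops < ffi->inject_rate)
			return 0;		/* not due yet */
		ffi->inject_ops = 0;		/* due: reset the counter and fire */
		return 1;
	}

The real helper additionally logs which fault fired; the point is only that injection is deterministic per call count, so a test run with a given rate reproduces the same failure sites.
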
@@ -97,7 +101,7 @@ int start_gc_thread(struct f2fs_sb_info *sbi) dev_t dev = sbi->sb->s_bdev->bd_dev; int err = 0; - gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); + gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); if (!gc_th) { err = -ENOMEM; goto out; @@ -173,9 +177,9 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, { /* SSR allocates in a segment unit */ if (p->alloc_mode == SSR) - return 1 << sbi->log_blocks_per_seg; + return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return (1 << sbi->log_blocks_per_seg) * p->ofs_unit; + return sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ @@ -246,6 +250,18 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, return get_cb_cost(sbi, segno); } +static unsigned int count_bits(const unsigned long *addr, + unsigned int offset, unsigned int len) +{ + unsigned int end = offset + len, sum = 0; + + while (offset < end) { + if (test_bit(offset++, addr)) + ++sum; + } + return sum; +} + /* * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. @@ -259,9 +275,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct victim_sel_policy p; - unsigned int secno, max_cost; + unsigned int secno, last_victim; unsigned int last_segment = MAIN_SEGS(sbi); - int nsearched = 0; + unsigned int nsearched = 0; mutex_lock(&dirty_i->seglist_lock); @@ -269,11 +285,12 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, select_policy(sbi, gc_type, type, &p); p.min_segno = NULL_SEGNO; - p.min_cost = max_cost = get_max_cost(sbi, &p); + p.min_cost = get_max_cost(sbi, &p); if (p.max_search == 0) goto out; + last_victim = sbi->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -296,27 +313,35 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } p.offset = segno + p.ofs_unit; - if (p.ofs_unit > 1) + if (p.ofs_unit > 1) { p.offset -= segno % p.ofs_unit; + nsearched += count_bits(p.dirty_segmap, + p.offset - p.ofs_unit, + p.ofs_unit); + } else { + nsearched++; + } + secno = GET_SECNO(sbi, segno); if (sec_usage_check(sbi, secno)) - continue; + goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) - continue; + goto next; cost = get_gc_cost(sbi, segno, &p); if (p.min_cost > cost) { p.min_segno = segno; p.min_cost = cost; - } else if (unlikely(cost == max_cost)) { - continue; } - - if (nsearched++ >= p.max_search) { - sbi->last_victim[p.gc_mode] = segno; +next: + if (nsearched >= p.max_search) { + if (!sbi->last_victim[p.gc_mode] && segno <= last_victim) + sbi->last_victim[p.gc_mode] = last_victim + 1; + else + sbi->last_victim[p.gc_mode] = segno + 1; break; } } @@ -400,13 +425,13 @@ static int check_valid_map(struct f2fs_sb_info *sbi, * On validity, copy that node with cold status, otherwise (invalid node) * ignore that. */ -static int gc_node_segment(struct f2fs_sb_info *sbi, +static void gc_node_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, unsigned int segno, int gc_type) { - bool initial = true; struct f2fs_summary *entry; block_t start_addr; int off; + int phase = 0; start_addr = START_BLOCK(sbi, segno); @@ -419,16 +444,24 @@ next_step: struct node_info ni; /* stop BG_GC if there is not enough free sections. 
*/ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return 0; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) + return; if (check_valid_map(sbi, segno, off) == 0) continue; - if (initial) { + if (phase == 0) { + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { ra_node_page(sbi, nid); continue; } + + /* phase == 2 */ node_page = get_node_page(sbi, nid); if (IS_ERR(node_page)) continue; @@ -445,36 +478,12 @@ next_step: continue; } - /* set page dirty and write it */ - if (gc_type == FG_GC) { - f2fs_wait_on_page_writeback(node_page, NODE); - set_page_dirty(node_page); - } else { - if (!PageWriteback(node_page)) - set_page_dirty(node_page); - } - f2fs_put_page(node_page, 1); + move_node_page(node_page, gc_type); stat_inc_node_blk_count(sbi, 1, gc_type); } - if (initial) { - initial = false; + if (++phase < 3) goto next_step; - } - - if (gc_type == FG_GC) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .for_reclaim = 0, - }; - sync_node_pages(sbi, 0, &wbc); - - /* return 1 only if FG_GC succefully reclaimed one */ - if (get_valid_blocks(sbi, segno, 1) == 0) - return 1; - } - return 0; } /* @@ -484,7 +493,7 @@ next_step: * as indirect or double indirect node blocks, are given, it must be a caller's * bug. */ -block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) +block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode) { unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; unsigned int bidx; @@ -501,7 +510,7 @@ block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); bidx = node_ofs - 5 - dec; } - return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi); + return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode); } static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -547,6 +556,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) struct f2fs_summary sum; struct node_info ni; struct page *page; + block_t newaddr; int err; /* do not read out */ @@ -568,21 +578,24 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) * don't cache encrypted data into meta inode until previous dirty * data were writebacked to avoid racing between GC and flush. 
*/ - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); get_node_info(fio.sbi, dn.nid, &ni); set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); /* read page */ fio.page = page; - fio.blk_addr = dn.data_blkaddr; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; - fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), - fio.blk_addr, - FGP_LOCK|FGP_CREAT, - GFP_NOFS); - if (!fio.encrypted_page) - goto put_out; + allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, + &sum, CURSEG_COLD_DATA); + + fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, + FGP_LOCK | FGP_CREAT, GFP_NOFS); + if (!fio.encrypted_page) { + err = -ENOMEM; + goto recover_block; + } err = f2fs_submit_page_bio(&fio); if (err) @@ -591,33 +604,39 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) /* write page */ lock_page(fio.encrypted_page); - if (unlikely(!PageUptodate(fio.encrypted_page))) + if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) { + err = -EIO; goto put_page_out; - if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) + } + if (unlikely(!PageUptodate(fio.encrypted_page))) { + err = -EIO; goto put_page_out; + } set_page_dirty(fio.encrypted_page); - f2fs_wait_on_page_writeback(fio.encrypted_page, DATA); + f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true); if (clear_page_dirty_for_io(fio.encrypted_page)) dec_page_count(fio.sbi, F2FS_DIRTY_META); set_page_writeback(fio.encrypted_page); /* allocate block address */ - f2fs_wait_on_page_writeback(dn.node_page, NODE); - allocate_data_block(fio.sbi, NULL, fio.blk_addr, - &fio.blk_addr, &sum, CURSEG_COLD_DATA); + f2fs_wait_on_page_writeback(dn.node_page, NODE, true); + fio.rw = WRITE_SYNC; + fio.new_blkaddr = newaddr; f2fs_submit_page_mbio(&fio); - dn.data_blkaddr = fio.blk_addr; - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + f2fs_update_data_blkaddr(&dn, newaddr); + set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); put_page_out: f2fs_put_page(fio.encrypted_page, 1); +recover_block: + if (err) + __f2fs_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, + true, true); put_out: f2fs_put_dnode(&dn); out: @@ -645,12 +664,23 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) .page = page, .encrypted_page = NULL, }; + bool is_dirty = PageDirty(page); + int err; + +retry: set_page_dirty(page); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page)) inode_dec_dirty_pages(inode); + set_cold_data(page); - do_write_data_page(&fio); + + err = do_write_data_page(&fio); + if (err == -ENOMEM && is_dirty) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + clear_cold_data(page); } out: @@ -664,7 +694,7 @@ out: * If the parent node is not valid or the data block address is different, * the victim data block is ignored. 
*/ -static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct gc_inode_list *gc_list, unsigned int segno, int gc_type) { struct super_block *sb = sbi->sb; @@ -684,16 +714,23 @@ next_step: struct node_info dni; /* dnode info for the data */ unsigned int ofs_in_node, nofs; block_t start_bidx; + nid_t nid = le32_to_cpu(entry->nid); /* stop BG_GC if there is not enough free sections. */ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return 0; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) + return; if (check_valid_map(sbi, segno, off) == 0) continue; if (phase == 0) { - ra_node_page(sbi, le32_to_cpu(entry->nid)); + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { + ra_node_page(sbi, nid); continue; } @@ -701,14 +738,14 @@ next_step: if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs)) continue; - if (phase == 1) { + if (phase == 2) { ra_node_page(sbi, dni.ino); continue; } ofs_in_node = le16_to_cpu(entry->ofs_in_node); - if (phase == 2) { + if (phase == 3) { inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode) || is_bad_inode(inode)) continue; @@ -720,7 +757,7 @@ next_step: continue; } - start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); + start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, start_bidx + ofs_in_node, READA, true); if (IS_ERR(data_page)) { @@ -733,30 +770,41 @@ next_step: continue; } - /* phase 3 */ + /* phase 4 */ inode = find_gc_inode(gc_list, dni.ino); if (inode) { - start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)) + struct f2fs_inode_info *fi = F2FS_I(inode); + bool locked = false; + + if (S_ISREG(inode->i_mode)) { + if (!down_write_trylock(&fi->dio_rwsem[READ])) + continue; + if (!down_write_trylock( + &fi->dio_rwsem[WRITE])) { + up_write(&fi->dio_rwsem[READ]); + continue; + } + locked = true; + } + + start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) move_encrypted_block(inode, start_bidx); else move_data_page(inode, start_bidx, gc_type); + + if (locked) { + up_write(&fi->dio_rwsem[WRITE]); + up_write(&fi->dio_rwsem[READ]); + } + stat_inc_data_blk_count(sbi, 1, gc_type); } } - if (++phase < 4) + if (++phase < 5) goto next_step; - - if (gc_type == FG_GC) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - - /* return 1 only if FG_GC succefully reclaimed one */ - if (get_valid_blocks(sbi, segno, 1) == 0) - return 1; - } - return 0; } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, @@ -772,51 +820,84 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, return ret; } -static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, +static int do_garbage_collect(struct f2fs_sb_info *sbi, + unsigned int start_segno, struct gc_inode_list *gc_list, int gc_type) { struct page *sum_page; struct f2fs_summary_block *sum; struct blk_plug plug; - int nfree = 0; + unsigned int segno = start_segno; + unsigned int end_segno = start_segno + sbi->segs_per_sec; + int sec_freed = 0; + unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? 
+ SUM_TYPE_DATA : SUM_TYPE_NODE; - /* read segment summary of victim */ - sum_page = get_sum_page(sbi, segno); + /* readahead multi ssa blocks those have contiguous address */ + if (sbi->segs_per_sec > 1) + ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), + sbi->segs_per_sec, META_SSA, true); + + /* reference all summary page */ + while (segno < end_segno) { + sum_page = get_sum_page(sbi, segno++); + unlock_page(sum_page); + } blk_start_plug(&plug); - sum = page_address(sum_page); + for (segno = start_segno; segno < end_segno; segno++) { - /* - * this is to avoid deadlock: - * - lock_page(sum_page) - f2fs_replace_block - * - check_valid_map() - mutex_lock(sentry_lock) - * - mutex_lock(sentry_lock) - change_curseg() - * - lock_page(sum_page) - */ - unlock_page(sum_page); + if (get_valid_blocks(sbi, segno, 1) == 0 || + unlikely(f2fs_cp_error(sbi))) + goto next; - switch (GET_SUM_TYPE((&sum->footer))) { - case SUM_TYPE_NODE: - nfree = gc_node_segment(sbi, sum->entries, segno, gc_type); - break; - case SUM_TYPE_DATA: - nfree = gc_data_segment(sbi, sum->entries, gc_list, - segno, gc_type); - break; + /* find segment summary of victim */ + sum_page = find_get_page(META_MAPPING(sbi), + GET_SUM_BLOCK(sbi, segno)); + f2fs_bug_on(sbi, !PageUptodate(sum_page)); + f2fs_put_page(sum_page, 0); + + sum = page_address(sum_page); + f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer))); + + /* + * this is to avoid deadlock: + * - lock_page(sum_page) - f2fs_replace_block + * - check_valid_map() - mutex_lock(sentry_lock) + * - mutex_lock(sentry_lock) - change_curseg() + * - lock_page(sum_page) + */ + + if (type == SUM_TYPE_NODE) + gc_node_segment(sbi, sum->entries, segno, gc_type); + else + gc_data_segment(sbi, sum->entries, gc_list, segno, + gc_type); + + stat_inc_seg_count(sbi, type, gc_type); +next: + f2fs_put_page(sum_page, 0); } + + if (gc_type == FG_GC) + f2fs_submit_merged_bio(sbi, + (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE); + blk_finish_plug(&plug); - stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type); + if (gc_type == FG_GC && + get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0) + sec_freed = 1; + stat_inc_call_count(sbi->stat_info); - f2fs_put_page(sum_page, 0); - return nfree; + return sec_freed; } int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) { - unsigned int segno, i; + unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; int sec_freed = 0; int ret = -EINVAL; @@ -832,46 +913,48 @@ gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; goto stop; + } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) { + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) { gc_type = FG_GC; - if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) - write_checkpoint(sbi, &cpc); + /* + * If there is no victim and no prefree segment but still not + * enough free sections, we should flush dent/node blocks and do + * garbage collections. 
+ */ + if (__get_victim(sbi, &segno, gc_type) || + prefree_segments(sbi)) { + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + segno = NULL_SEGNO; + } else if (has_not_enough_free_secs(sbi, 0, 0)) { + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } } if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) goto stop; ret = 0; - /* readahead multi ssa blocks those have contiguous address */ - if (sbi->segs_per_sec > 1) - ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, - META_SSA, true); - - for (i = 0; i < sbi->segs_per_sec; i++) { - /* - * for FG_GC case, halt gcing left segments once failed one - * of segments in selected section to avoid long latency. - */ - if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) && - gc_type == FG_GC) - break; - } - - if (i == sbi->segs_per_sec && gc_type == FG_GC) + if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && + gc_type == FG_GC) sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed)) + if (has_not_enough_free_secs(sbi, sec_freed, 0)) goto gc_more; if (gc_type == FG_GC) - write_checkpoint(sbi, &cpc); + ret = write_checkpoint(sbi, &cpc); } stop: mutex_unlock(&sbi->gc_mutex); diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index b4a65be9f7d3..a993967dcdb9 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -100,11 +100,3 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) return true; return false; } - -static inline int is_idle(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - struct request_list *rl = &q->root_rl; - return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]); -} diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index b238d2fec3e5..71b7206c431e 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -70,8 +70,7 @@ static void str2hashbuf(const unsigned char *msg, size_t len, *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, - struct f2fs_filename *fname) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) { __u32 hash; f2fs_hash_t f2fs_hash; @@ -80,10 +79,6 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, const unsigned char *name = name_info->name; size_t len = name_info->len; - /* encrypted bigname case */ - if (fname && !fname->disk_name.name) - return cpu_to_le32(fname->hash); - if (is_dot_dotdot(name_info)) return 0; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index f35f3eb3541f..b150d03beec1 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -17,9 +17,6 @@ bool f2fs_may_inline_data(struct inode *inode) { - if (!test_opt(F2FS_I_SB(inode), INLINE_DATA)) - return false; - if (f2fs_is_atomic_file(inode)) return false; @@ -55,7 +52,7 @@ void read_inline_data(struct page *page, struct page *ipage) f2fs_bug_on(F2FS_P_SB(page), page->index); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); /* Copy the whole inline data block */ src_addr = inline_data_addr(ipage); @@ -63,7 +60,8 @@ void read_inline_data(struct page *page, struct page *ipage) memcpy(dst_addr, src_addr, MAX_INLINE_DATA); flush_dcache_page(page); kunmap_atomic(dst_addr); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); } bool truncate_inline_inode(struct page *ipage, u64 from) @@ -75,9 +73,9 @@ bool truncate_inline_inode(struct page *ipage, u64 from) addr = inline_data_addr(ipage); - 
f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); memset(addr + from, 0, MAX_INLINE_DATA - from); - + set_page_dirty(ipage); return true; } @@ -112,11 +110,12 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) } if (page->index) - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); else read_inline_data(page, ipage); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); f2fs_put_page(ipage, 1); trace_android_fs_dataread_end(inode, page_offset(page), PAGE_SIZE); @@ -126,7 +125,6 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) { - void *src_addr, *dst_addr; struct f2fs_io_info fio = { .sbi = F2FS_I_SB(dn->inode), .type = DATA, @@ -136,8 +134,6 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) }; int dirty, err; - f2fs_bug_on(F2FS_I_SB(dn->inode), page->index); - if (!f2fs_exist_data(dn->inode)) goto clear_out; @@ -145,21 +141,9 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) if (err) return err; - f2fs_wait_on_page_writeback(page, DATA); + f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page)); - if (PageUptodate(page)) - goto no_update; - - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); - - /* Copy the whole inline data block */ - src_addr = inline_data_addr(dn->inode_page); - dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); - flush_dcache_page(page); - kunmap_atomic(dst_addr); - SetPageUptodate(page); -no_update: + read_inline_data(page, dn->inode_page); set_page_dirty(page); /* clear dirty state */ @@ -167,23 +151,21 @@ no_update: /* write data page to try to make data consistent */ set_page_writeback(page); - fio.blk_addr = dn->data_blkaddr; + fio.old_blkaddr = dn->data_blkaddr; write_data_page(dn, &fio); - set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); if (dirty) inode_dec_dirty_pages(dn->inode); /* this converted inline_data should be recovered. 
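A note on the conversion path above: f2fs_convert_inline_page() now reuses read_inline_data(), whose whole job is to copy the inline payload into the head of a page and zero the tail out to PAGE_SIZE, which is what the zero_user_segment() call expresses. A self-contained sketch of that fill follows; the MAX_INLINE_DATA value here is illustrative, the real one is derived from the on-disk inode layout.

#include <string.h>

#define PAGE_SIZE       4096
#define MAX_INLINE_DATA 3488    /* illustrative; layout-derived in f2fs */

/* Copy the inline payload to the page head, zero the rest of the page. */
static void fill_page_from_inline(unsigned char *page,
                                  const unsigned char *inline_addr)
{
    memcpy(page, inline_addr, MAX_INLINE_DATA);
    memset(page + MAX_INLINE_DATA, 0, PAGE_SIZE - MAX_INLINE_DATA);
}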
*/ - set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE); + set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ truncate_inline_inode(dn->inode_page, 0); + clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); f2fs_clear_inline_inode(dn->inode); - sync_inode_page(dn); f2fs_put_dnode(dn); return 0; } @@ -195,7 +177,10 @@ int f2fs_convert_inline_inode(struct inode *inode) struct page *ipage, *page; int err = 0; - page = grab_cache_page(inode->i_mapping, 0); + if (!f2fs_has_inline_data(inode)) + return 0; + + page = f2fs_grab_cache_page(inode->i_mapping, 0, false); if (!page) return -ENOMEM; @@ -217,6 +202,9 @@ out: f2fs_unlock_op(sbi); f2fs_put_page(page, 1); + + f2fs_balance_fs(sbi, dn.node_changed); + return err; } @@ -238,16 +226,17 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_bug_on(F2FS_I_SB(inode), page->index); - f2fs_wait_on_page_writeback(dn.inode_page, NODE); + f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); src_addr = kmap_atomic(page); dst_addr = inline_data_addr(dn.inode_page); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); kunmap_atomic(src_addr); + set_page_dirty(dn.inode_page); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + set_inode_flag(inode, FI_APPEND_WRITE); + set_inode_flag(inode, FI_DATA_EXIST); - sync_inode_page(&dn); + clear_inline_node(dn.inode_page); f2fs_put_dnode(&dn); return 0; } @@ -276,16 +265,16 @@ process_inline: ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); src_addr = inline_data_addr(npage); dst_addr = inline_data_addr(ipage); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + set_inode_flag(inode, FI_INLINE_DATA); + set_inode_flag(inode, FI_DATA_EXIST); - update_inode(inode, ipage); + set_page_dirty(ipage); f2fs_put_page(ipage, 1); return true; } @@ -296,7 +285,6 @@ process_inline: if (!truncate_inline_inode(ipage, 0)) return false; f2fs_clear_inline_inode(inode); - update_inode(inode, ipage); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { if (truncate_blocks(inode, 0, false)) @@ -307,7 +295,7 @@ process_inline: } struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, - struct f2fs_filename *fname, struct page **res_page) + struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct f2fs_inline_dentry *inline_dentry; @@ -318,10 +306,12 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) + if (IS_ERR(ipage)) { + *res_page = ipage; return NULL; + } - namehash = f2fs_dentry_hash(&name, fname); + namehash = f2fs_dentry_hash(&name); inline_dentry = inline_data_addr(ipage); @@ -333,30 +323,6 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, else f2fs_put_page(ipage, 0); - /* - * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs has occurred. 
- */ - f2fs_bug_on(sbi, d.max < 0); - return de; -} - -struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *dir, - struct page **p) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - struct page *ipage; - struct f2fs_dir_entry *de; - struct f2fs_inline_dentry *dentry_blk; - - ipage = get_node_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) - return NULL; - - dentry_blk = inline_data_addr(ipage); - de = &dentry_blk->dentry[1]; - *p = ipage; - unlock_page(ipage); return de; } @@ -374,10 +340,8 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if (i_size_read(inode) < MAX_INLINE_DATA) { - i_size_write(inode, MAX_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - } + if (i_size_read(inode) < MAX_INLINE_DATA) + f2fs_i_size_write(inode, MAX_INLINE_DATA); return 0; } @@ -385,7 +349,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * NOTE: ipage is grabbed by caller, but if any error occurs, we should * release ipage in this function. */ -static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, +static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, struct f2fs_inline_dentry *inline_dentry) { struct page *page; @@ -393,7 +357,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, struct f2fs_dentry_block *dentry_blk; int err; - page = grab_cache_page(dir->i_mapping, 0); + page = f2fs_grab_cache_page(dir->i_mapping, 0, false); if (!page) { f2fs_put_page(ipage, 1); return -ENOMEM; @@ -404,8 +368,8 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, if (err) goto out; - f2fs_wait_on_page_writeback(page, DATA); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + f2fs_wait_on_page_writeback(page, DATA, true); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); dentry_blk = kmap_atomic(page); @@ -426,37 +390,132 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, NR_INLINE_DENTRY * F2FS_SLOT_LEN); kunmap_atomic(dentry_blk); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); set_page_dirty(page); /* clear inline dir and flag after data writeback */ truncate_inline_inode(ipage, 0); stat_dec_inline_dir(dir); - clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); + clear_inode_flag(dir, FI_INLINE_DENTRY); - if (i_size_read(dir) < PAGE_CACHE_SIZE) { - i_size_write(dir, PAGE_CACHE_SIZE); - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } - - sync_inode_page(&dn); + f2fs_i_depth_write(dir, 1); + if (i_size_read(dir) < PAGE_SIZE) + f2fs_i_size_write(dir, PAGE_SIZE); out: f2fs_put_page(page, 1); return err; } -int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, - struct inode *inode, nid_t ino, umode_t mode) +static int f2fs_add_inline_entries(struct inode *dir, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_dentry_ptr d; + unsigned long bit_pos = 0; + int err = 0; + + make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + + while (bit_pos < d.max) { + struct f2fs_dir_entry *de; + struct qstr new_name; + nid_t ino; + umode_t fake_mode; + + if (!test_bit_le(bit_pos, d.bitmap)) { + bit_pos++; + continue; + } + + de = &d.dentry[bit_pos]; + + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + + new_name.name = d.filename[bit_pos]; + new_name.len = de->name_len; + + ino = le32_to_cpu(de->ino); + fake_mode = get_de_type(de) << S_SHIFT; + + err = f2fs_add_regular_entry(dir, &new_name, NULL, NULL, + ino, 
fake_mode); + if (err) + goto punch_dentry_pages; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + } + return 0; +punch_dentry_pages: + truncate_inode_pages(&dir->i_data, 0); + truncate_blocks(dir, 0, false); + remove_dirty_inode(dir); + return err; +} + +static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_inline_dentry *backup_dentry; + int err; + + backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), + sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); + if (!backup_dentry) { + f2fs_put_page(ipage, 1); + return -ENOMEM; + } + + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); + truncate_inline_inode(ipage, 0); + + unlock_page(ipage); + + err = f2fs_add_inline_entries(dir, backup_dentry); + if (err) + goto recover; + + lock_page(ipage); + + stat_dec_inline_dir(dir); + clear_inode_flag(dir, FI_INLINE_DENTRY); + kfree(backup_dentry); + return 0; +recover: + lock_page(ipage); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + f2fs_i_depth_write(dir, 0); + f2fs_i_size_write(dir, MAX_INLINE_DATA); + set_page_dirty(ipage); + f2fs_put_page(ipage, 1); + + kfree(backup_dentry); + return err; +} + +static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + if (!F2FS_I(dir)->i_dir_level) + return f2fs_move_inline_dirents(dir, ipage, inline_dentry); + else + return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry); +} + +int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - size_t namelen = name->len; struct f2fs_inline_dentry *dentry_blk = NULL; struct f2fs_dentry_ptr d; - int slots = GET_DENTRY_SLOTS(namelen); + int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; int err = 0; @@ -477,25 +536,27 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, name, ipage); + page = init_inode_metadata(inode, dir, new_name, + orig_name, ipage); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; } + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); } - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); - name_hash = f2fs_dentry_hash(name, NULL); + name_hash = f2fs_dentry_hash(new_name); make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); - f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); /* we don't need to mark_inode_dirty now */ if (inode) { - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); + f2fs_i_pino_write(inode, dir->i_ino); f2fs_put_page(page, 1); } @@ -503,11 +564,6 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, fail: if (inode) up_write(&F2FS_I(inode)->i_sem); - - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { - update_inode(dir, ipage); - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } out: f2fs_put_page(ipage, 1); return err; @@ -522,22 +578,22 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, int i; lock_page(page); - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); inline_dentry = inline_data_addr(page); bit_pos = dentry - 
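The f2fs_move_rehashed_dirents() function added above converts an inline directory with a backup-and-replay pattern: snapshot the inline dentry area, wipe it, replay every entry into regular dentry blocks, and restore the snapshot if the replay fails. A hedged user-space sketch of that shape, where the replay callback stands in for f2fs_add_inline_entries():

#include <stdlib.h>
#include <string.h>

static int move_entries(unsigned char *inline_area, size_t len,
                        int (*replay)(const unsigned char *, size_t))
{
    unsigned char *backup = malloc(len);
    int err;

    if (!backup)
        return -1;                        /* ~ -ENOMEM */

    memcpy(backup, inline_area, len);     /* keep a recovery copy */
    memset(inline_area, 0, len);          /* ~ truncate_inline_inode() */

    err = replay(backup, len);            /* ~ f2fs_add_inline_entries() */
    if (err)
        memcpy(inline_area, backup, len); /* recover the old contents */

    free(backup);
    return err;
}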
inline_dentry->dentry; for (i = 0; i < slots; i++) - test_and_clear_bit_le(bit_pos + i, + __clear_bit_le(bit_pos + i, &inline_dentry->dentry_bitmap); set_page_dirty(page); + f2fs_put_page(page, 1); dir->i_ctime = dir->i_mtime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); if (inode) - f2fs_drop_nlink(dir, inode, page); - - f2fs_put_page(page, 1); + f2fs_drop_nlink(dir, inode); } bool f2fs_empty_inline_dir(struct inode *dir) @@ -565,7 +621,7 @@ bool f2fs_empty_inline_dir(struct inode *dir) } int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, - struct f2fs_str *fstr) + struct fscrypt_str *fstr) { struct inode *inode = file_inode(file); struct f2fs_inline_dentry *inline_dentry = NULL; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 97e20decacb4..d7369895a78a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "f2fs.h" @@ -18,6 +19,13 @@ #include +void f2fs_mark_inode_dirty_sync(struct inode *inode) +{ + if (f2fs_inode_dirtied(inode)) + return; + mark_inode_dirty_sync(inode); +} + void f2fs_set_inode_flags(struct inode *inode) { unsigned int flags = F2FS_I(inode)->i_flags; @@ -35,6 +43,7 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + f2fs_mark_inode_dirty_sync(inode); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -83,10 +92,10 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) while (start < end) { if (*start++) { - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage)); + set_inode_flag(inode, FI_DATA_EXIST); + set_raw_inline(inode, F2FS_INODE(ipage)); set_page_dirty(ipage); return; } @@ -138,9 +147,10 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_tree(inode, &ri->i_ext); + if (f2fs_init_extent_tree(inode, &ri->i_ext)) + set_page_dirty(node_page); - get_inline_info(fi, ri); + get_inline_info(inode, ri); /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) @@ -150,7 +160,10 @@ static int do_read_inode(struct inode *inode) __get_inode_rdev(inode, ri); if (__written_first_block(ri)) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + + if (!need_inode_block_update(sbi, inode->i_ino)) + fi->last_disk_size = inode->i_size; f2fs_put_page(node_page, 1); @@ -202,6 +215,7 @@ make_now: inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { @@ -221,11 +235,27 @@ bad_inode: return ERR_PTR(ret); } -void update_inode(struct inode *inode, struct page *node_page) +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; +retry: + inode = f2fs_iget(sb, ino); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + } + return inode; +} + +int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; - f2fs_wait_on_page_writeback(node_page, NODE); + f2fs_inode_synced(inode); + 
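The new f2fs_mark_inode_dirty_sync() defined above makes dirty marking idempotent: judging from the call site, f2fs_inode_dirtied() reports that the inode is already being tracked, so the VFS-level mark_inode_dirty_sync() call runs only on the first transition. A toy model of that guard, with invented types:

#include <stdbool.h>

struct toy_inode { bool on_dirty_list; };

static int vfs_marks;                          /* counts the expensive calls */

static bool inode_dirtied(struct toy_inode *i) /* ~ f2fs_inode_dirtied() */
{
    bool was_dirty = i->on_dirty_list;
    i->on_dirty_list = true;
    return was_dirty;
}

static void mark_dirty_sync(struct toy_inode *i)
{
    if (inode_dirtied(i))
        return;                                /* already dirty: nothing to do */
    vfs_marks++;                               /* ~ mark_inode_dirty_sync() */
}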
+ f2fs_wait_on_page_writeback(node_page, NODE, true); ri = F2FS_INODE(node_page); @@ -242,7 +272,7 @@ void update_inode(struct inode *inode, struct page *node_page) &ri->i_ext); else memset(&ri->i_ext, 0, sizeof(ri->i_ext)); - set_raw_inline(F2FS_I(inode), ri); + set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); @@ -259,15 +289,19 @@ void update_inode(struct inode *inode, struct page *node_page) __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); - set_page_dirty(node_page); - clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + /* deleted inode */ + if (inode->i_nlink == 0) + clear_inline_node(node_page); + + return set_page_dirty(node_page); } -void update_inode_page(struct inode *inode) +int update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; + int ret = 0; retry: node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { @@ -276,12 +310,14 @@ retry: cond_resched(); goto retry; } else if (err != -ENOENT) { - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); } - return; + f2fs_inode_synced(inode); + return 0; } - update_inode(inode, node_page); + ret = update_inode(inode, node_page); f2fs_put_page(node_page, 1); + return ret; } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) @@ -292,16 +328,15 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) inode->i_ino == F2FS_META_INO(sbi)) return 0; - if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE)) + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; /* * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. 
*/ - update_inode_page(inode); - - f2fs_balance_fs(sbi); + if (update_inode_page(inode)) + f2fs_balance_fs(sbi, true); return 0; } @@ -311,13 +346,12 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - nid_t xnid = fi->i_xattr_nid; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - commit_inmem_pages(inode, true); + drop_inmem_pages(inode); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); @@ -327,19 +361,24 @@ void f2fs_evict_inode(struct inode *inode) goto out_clear; f2fs_bug_on(sbi, get_dirty_pages(inode)); - remove_dirty_dir_inode(inode); + remove_dirty_inode(inode); f2fs_destroy_extent_tree(inode); if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; - sb_start_intwrite(inode->i_sb); - set_inode_flag(fi, FI_NO_ALLOC); - i_size_write(inode, 0); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_EVICT_INODE)) + goto no_delete; +#endif + sb_start_intwrite(inode->i_sb); + set_inode_flag(inode, FI_NO_ALLOC); + i_size_write(inode, 0); +retry: if (F2FS_HAS_BLOCKS(inode)) - err = f2fs_truncate(inode, true); + err = f2fs_truncate(inode); if (!err) { f2fs_lock_op(sbi); @@ -347,6 +386,14 @@ void f2fs_evict_inode(struct inode *inode) f2fs_unlock_op(sbi); } + /* give more chances, if ENOMEM case */ + if (err == -ENOMEM) { + err = 0; + goto retry; + } + + if (err) + update_inode_page(inode); sb_end_intwrite(inode->i_sb); no_delete: stat_dec_inline_xattr(inode); @@ -356,36 +403,18 @@ no_delete: invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(fi, FI_APPEND_WRITE)) - add_dirty_inode(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(fi, FI_UPDATE_WRITE)) - add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); - if (is_inode_flag_set(fi, FI_FREE_NID)) { - if (err && err != -ENOENT) - alloc_nid_done(sbi, inode->i_ino); - else - alloc_nid_failed(sbi, inode->i_ino); - clear_inode_flag(fi, FI_FREE_NID); - } - - if (err && err != -ENOENT) { - if (!exist_written_data(sbi, inode->i_ino, ORPHAN_INO)) { - /* - * get here because we failed to release resource - * of inode previously, reminder our user to run fsck - * for fixing. - */ - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_msg(sbi->sb, KERN_WARNING, - "inode (ino:%lu) resource leak, run fsck " - "to fix this issue!", inode->i_ino); - } + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) + add_ino_entry(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + if (is_inode_flag_set(inode, FI_FREE_NID)) { + alloc_nid_failed(sbi, inode->i_ino); + clear_inode_flag(inode, FI_FREE_NID); } + f2fs_bug_on(sbi, err && + !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); out_clear: -#ifdef CONFIG_F2FS_FS_ENCRYPTION - if (fi->i_crypt_info) - f2fs_free_encryption_info(inode, fi->i_crypt_info); -#endif + fscrypt_put_encryption_info(inode, NULL); clear_inode(inode); } @@ -393,37 +422,32 @@ out_clear: void handle_failed_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int err = 0; + struct node_info ni; - clear_nlink(inode); - make_bad_inode(inode); + /* don't make bad inode, since it becomes a regular file. 
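The retry added to f2fs_evict_inode() above treats -ENOMEM from truncation as transient: instead of leaking the inode's blocks, it clears the error and loops until reclaim makes progress. A compilable sketch of just that shape; do_truncate() is an invented stand-in that fails twice before succeeding.

#include <errno.h>

static int enomem_left = 2;              /* simulate transient pressure */

static int do_truncate(void)             /* ~ f2fs_truncate() */
{
    return enomem_left-- > 0 ? -ENOMEM : 0;
}

static int evict_blocks(void)
{
    int err;
retry:
    err = do_truncate();
    if (err == -ENOMEM) {
        err = 0;                         /* give more chances, as above */
        goto retry;
    }
    return err;
}

int main(void) { return evict_blocks(); }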
*/ unlock_new_inode(inode); - i_size_write(inode, 0); - if (F2FS_HAS_BLOCKS(inode)) - err = f2fs_truncate(inode, false); - - if (!err) - err = remove_inode_page(inode); - /* - * if we skip truncate_node in remove_inode_page bacause we failed - * before, it's better to find another way to release resource of - * this inode (e.g. valid block count, node block or nid). Here we - * choose to add this inode to orphan list, so that we can call iput - * for releasing in orphan recovery flow. - * * Note: we should add inode to orphan list before f2fs_unlock_op() * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. */ - if (err && err != -ENOENT) { - err = acquire_orphan_inode(sbi); - if (!err) - add_orphan_inode(sbi, inode->i_ino); + get_node_info(sbi, inode->i_ino, &ni); + + if (ni.blk_addr != NULL_ADDR) { + int err = acquire_orphan_inode(sbi); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "Too many orphan inodes, run fsck to fix."); + } else { + add_orphan_inode(inode); + } + alloc_nid_done(sbi, inode->i_ino); + } else { + set_inode_flag(inode, FI_FREE_NID); } - set_inode_flag(F2FS_I(inode), FI_FREE_NID); f2fs_unlock_op(sbi); /* iput will drop the inode object */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2c32110f9fc0..0f071a70522d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -60,10 +60,14 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); - if (f2fs_may_inline_data(inode)) - set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + set_inode_flag(inode, FI_NEW_INODE); + + if (test_opt(sbi, INLINE_XATTR)) + set_inode_flag(inode, FI_INLINE_XATTR); + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) + set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) - set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); + set_inode_flag(inode, FI_INLINE_DENTRY); f2fs_init_extent_tree(inode, NULL); @@ -72,14 +76,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_dir(inode); trace_f2fs_new_inode(inode, 0); - mark_inode_dirty(inode); return inode; fail: trace_f2fs_new_inode(inode, err); make_bad_inode(inode); if (nid_free) - set_inode_flag(F2FS_I(inode), FI_FREE_NID); + set_inode_flag(inode, FI_FREE_NID); iput(inode); return ERR_PTR(err); } @@ -88,18 +91,23 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) { size_t slen = strlen(s); size_t sublen = strlen(sub); + int i; /* * filename format of multimedia file should be defined as: - * "filename + '.' + extension". + * "filename + '.' + extension + (optional: '.' + temp extension)". 
*/ if (slen < sublen + 2) return 0; - if (s[slen - sublen - 1] != '.') - return 0; + for (i = 1; i < slen - sublen; i++) { + if (s[i] != '.') + continue; + if (!strncasecmp(s + i + 1, sub, sublen)) + return 1; + } - return !strncasecmp(s + slen - sublen, sub, sublen); + return 0; } /* @@ -128,8 +136,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -142,6 +148,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -169,15 +177,15 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, int err; if (f2fs_encrypted_inode(dir) && - !f2fs_is_child_context_consistent_with_parent(dir, inode)) + !fscrypt_has_permitted_context(dir, inode)) return -EPERM; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); inode->i_ctime = CURRENT_TIME; ihold(inode); - set_inode_flag(F2FS_I(inode), FI_INC_LINK); + set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -190,7 +198,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_sync_fs(sbi->sb, 1); return 0; out: - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + clear_inode_flag(inode, FI_INC_LINK); iput(inode); f2fs_unlock_op(sbi); return err; @@ -199,10 +207,14 @@ out: struct dentry *f2fs_get_parent(struct dentry *child) { struct qstr dotdot = QSTR_INIT("..", 2); - unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot); - if (!ino) + struct page *page; + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page); + if (!ino) { + if (IS_ERR(page)) + return ERR_CAST(page); return ERR_PTR(-ENOENT); - return d_obtain_alias(f2fs_iget(d_inode(child)->i_sb, ino)); + } + return d_obtain_alias(f2fs_iget(child->d_sb, ino)); } static int __recover_dot_dentries(struct inode *dir, nid_t pino) @@ -214,12 +226,24 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) struct page *page; int err = 0; + if (f2fs_readonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "skip recovering inline_dots inode (ino:%lu, pino:%u) " + "in readonly mountpoint", dir->i_ino, pino); + return 0; + } + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); de = f2fs_find_entry(dir, &dot, &page); if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; } else { err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); if (err) @@ -230,14 +254,14 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); } else { err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR); } out: - if (!err) { - clear_inode_flag(F2FS_I(dir), FI_INLINE_DOTS); - mark_inode_dirty(dir); - } + if (!err) + clear_inode_flag(dir, FI_INLINE_DOTS); f2fs_unlock_op(sbi); return err; @@ -251,13 +275,32 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, struct page *page; nid_t ino; int err = 0; + unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir)); + + if (f2fs_encrypted_inode(dir)) { + int res = fscrypt_get_encryption_info(dir); + + /* + * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is + * created while the directory was encrypted and 
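The reworked is_multimedia_file() above no longer anchors the extension at the end of the name: it scans for any '.' whose following characters begin with the extension, so an in-flight temporary like "clip.mp4.tmp" matches "mp4" just as "clip.mp4" does. A self-contained user-space rendering of the same scan (strncasecmp comes from <strings.h>):

#include <string.h>
#include <strings.h>

static int is_multimedia_name(const char *s, const char *sub)
{
    size_t slen = strlen(s);
    size_t sublen = strlen(sub);
    size_t i;

    if (slen < sublen + 2)
        return 0;

    /* accept "name.ext" and "name.ext.tmp"; note the match only requires
     * a component to begin with the extension, exactly as in the hunk */
    for (i = 1; i + sublen < slen; i++) {
        if (s[i] != '.')
            continue;
        if (!strncasecmp(s + i + 1, sub, sublen))
            return 1;
    }
    return 0;
}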
we + * don't have access to the key. + */ + if (fscrypt_has_encryption_key(dir)) + fscrypt_set_encrypted_dentry(dentry); + fscrypt_set_d_op(dentry); + if (res && res != -ENOKEY) + return ERR_PTR(res); + } if (dentry->d_name.len > F2FS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (!de) + if (!de) { + if (IS_ERR(page)) + return (struct dentry *)page; return d_splice_alias(inode, dentry); + } ino = le32_to_cpu(de->ino); f2fs_dentry_kunmap(dir, page); @@ -267,15 +310,29 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) return ERR_CAST(inode); + if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) { + err = __recover_dot_dentries(dir, root_ino); + if (err) + goto err_out; + } + if (f2fs_has_inline_dots(inode)) { err = __recover_dot_dentries(inode, dir->i_ino); if (err) goto err_out; } + if (!IS_ERR(inode) && f2fs_encrypted_inode(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { + bool nokey = f2fs_encrypted_inode(inode) && + !fscrypt_has_encryption_key(inode); + err = nokey ? -ENOKEY : -EPERM; + goto err_out; + } return d_splice_alias(inode, dentry); err_out: - iget_failed(inode); + iput(inode); return ERR_PTR(err); } @@ -288,11 +345,15 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) int err = -ENOENT; trace_f2fs_unlink_enter(dir, dentry); - f2fs_balance_fs(sbi); de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (!de) + if (!de) { + if (IS_ERR(page)) + err = PTR_ERR(page); goto fail; + } + + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); @@ -305,9 +366,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) f2fs_delete_entry(de, page, dir, inode); f2fs_unlock_op(sbi); - /* In order to evict this inode, we set it dirty */ - mark_inode_dirty(inode); - if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); fail: @@ -332,16 +390,24 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; size_t len = strlen(symname); - size_t p_len; - char *p_str; - struct f2fs_str disk_link = FSTR_INIT(NULL, 0); - struct f2fs_encrypted_symlink_data *sd = NULL; + struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1); + struct fscrypt_symlink_data *sd = NULL; int err; - if (len > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; + if (f2fs_encrypted_inode(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err) + return err; - f2fs_balance_fs(sbi); + if (!fscrypt_has_encryption_key(dir)) + return -EPERM; + + disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + + sizeof(struct fscrypt_symlink_data)); + } + + if (disk_link.len > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) @@ -351,8 +417,11 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -360,42 +429,36 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, f2fs_unlock_op(sbi); alloc_nid_done(sbi, inode->i_ino); - if (f2fs_encrypted_inode(dir)) { + if (f2fs_encrypted_inode(inode)) { struct qstr istr = QSTR_INIT(symname, len); + struct fscrypt_str ostr; - err = 
f2fs_get_encryption_info(inode); - if (err) - goto err_out; - - err = f2fs_fname_crypto_alloc_buffer(inode, len, &disk_link); - if (err) - goto err_out; - - err = f2fs_fname_usr_to_disk(inode, &istr, &disk_link); - if (err < 0) - goto err_out; - - p_len = encrypted_symlink_data_len(disk_link.len) + 1; - - if (p_len > dir->i_sb->s_blocksize) { - err = -ENAMETOOLONG; - goto err_out; - } - - sd = kzalloc(p_len, GFP_NOFS); + sd = kzalloc(disk_link.len, GFP_NOFS); if (!sd) { err = -ENOMEM; goto err_out; } - memcpy(sd->encrypted_path, disk_link.name, disk_link.len); - sd->len = cpu_to_le16(disk_link.len); - p_str = (char *)sd; - } else { - p_len = len + 1; - p_str = (char *)symname; + + err = fscrypt_get_encryption_info(inode); + if (err) + goto err_out; + + if (!fscrypt_has_encryption_key(inode)) { + err = -EPERM; + goto err_out; + } + + ostr.name = sd->encrypted_path; + ostr.len = disk_link.len; + err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); + if (err < 0) + goto err_out; + + sd->len = cpu_to_le16(ostr.len); + disk_link.name = (char *)sd; } - err = page_symlink(inode, p_str, p_len); + err = page_symlink(inode, disk_link.name, disk_link.len); err_out: d_instantiate(dentry, inode); @@ -411,7 +474,8 @@ err_out: * performance regression. */ if (!err) { - filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1); + filemap_write_and_wait_range(inode->i_mapping, 0, + disk_link.len - 1); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -420,7 +484,6 @@ err_out: } kfree(sd); - f2fs_fname_crypto_free_buffer(&disk_link); return err; out: handle_failed_inode(inode); @@ -433,8 +496,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -444,7 +505,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); - set_inode_flag(F2FS_I(inode), FI_INC_LINK); + f2fs_balance_fs(sbi, true); + + set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -461,7 +524,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; out_fail: - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + clear_inode_flag(inode, FI_INC_LINK); handle_failed_inode(inode); return err; } @@ -481,8 +544,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -490,6 +551,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -516,9 +579,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; - if (!whiteout) - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -532,6 +592,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -545,17 +607,17 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, * add this non-linked tmpfile to orphan 
list, in this way we could * remove all unused data of tmpfile after abnormal power-off. */ - add_orphan_inode(sbi, inode->i_ino); - f2fs_unlock_op(sbi); - + add_orphan_inode(inode); alloc_nid_done(sbi, inode->i_ino); if (whiteout) { - inode_dec_link_count(inode); + f2fs_i_links_write(inode, false); *whiteout = inode; } else { d_tmpfile(dentry, inode); } + /* link_count was changed by d_tmpfile as well. */ + f2fs_unlock_op(sbi); unlock_new_inode(inode); return 0; @@ -569,7 +631,7 @@ out: static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { if (f2fs_encrypted_inode(dir)) { - int err = f2fs_get_encryption_info(dir); + int err = fscrypt_get_encryption_info(dir); if (err) return err; } @@ -595,26 +657,29 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; + bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && - !f2fs_is_child_context_consistent_with_parent(new_dir, - old_inode)) { + !fscrypt_has_permitted_context(new_dir, old_inode)) { err = -EPERM; goto out; } - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); - if (!old_entry) + if (!old_entry) { + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); goto out; + } if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); - if (!old_dir_entry) + if (!old_dir_entry) { + if (IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); goto out_old; + } } if (flags & RENAME_WHITEOUT) { @@ -632,8 +697,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = -ENOENT; new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); - if (!new_entry) + if (!new_entry) { + if (IS_ERR(new_page)) + err = PTR_ERR(new_page); goto out_whiteout; + } + + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); @@ -641,8 +711,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto put_out_dir; - if (update_dent_inode(old_inode, new_inode, - &new_dentry->d_name)) { + err = update_dent_inode(old_inode, new_inode, + &new_dentry->d_name); + if (err) { release_orphan_inode(sbi); goto put_out_dir; } @@ -652,20 +723,17 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode->i_ctime = CURRENT_TIME; down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) - drop_nlink(new_inode); - drop_nlink(new_inode); + f2fs_i_links_write(new_inode, false); + f2fs_i_links_write(new_inode, false); up_write(&F2FS_I(new_inode)->i_sem); - mark_inode_dirty(new_inode); - if (!new_inode->i_nlink) - add_orphan_inode(sbi, new_inode->i_ino); + add_orphan_inode(new_inode); else release_orphan_inode(sbi); - - update_inode_page(old_inode); - update_inode_page(new_inode); } else { + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(new_dentry, old_inode); @@ -674,9 +742,29 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_whiteout; } - if (old_dir_entry) { - inc_nlink(new_dir); - update_inode_page(new_dir); + if (old_dir_entry) + f2fs_i_links_write(new_dir, true); + + /* + * old entry and new entry can locate in the same inline + * dentry in inode, when attaching new entry in inline dentry, + * it could force inline dentry conversion, after that, + * old_entry and old_page will point to wrong address, in + * order to avoid this, 
let's do the check and update here. + */ + if (is_old_inline && !f2fs_has_inline_dentry(old_dir)) { + f2fs_put_page(old_page, 0); + old_page = NULL; + + old_entry = f2fs_find_entry(old_dir, + &old_dentry->d_name, &old_page); + if (!old_entry) { + err = -ENOENT; + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); + f2fs_unlock_op(sbi); + goto out_whiteout; + } } } @@ -687,13 +775,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(old_inode); + f2fs_mark_inode_dirty_sync(old_inode); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); if (whiteout) { whiteout->i_state |= I_LINKABLE; - set_inode_flag(F2FS_I(whiteout), FI_INC_LINK); + set_inode_flag(whiteout, FI_INC_LINK); err = f2fs_add_link(old_dentry, whiteout); if (err) goto put_out_dir; @@ -705,14 +793,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir && !whiteout) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); - update_inode_page(old_inode); } else { f2fs_dentry_kunmap(old_inode, old_dir_page); f2fs_put_page(old_dir_page, 0); } - drop_nlink(old_dir); - mark_inode_dirty(old_dir); - update_inode_page(old_dir); + f2fs_i_links_write(old_dir, false); } f2fs_unlock_op(sbi); @@ -756,39 +841,45 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int err = -ENOENT; if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && - (old_dir != new_dir) && - (!f2fs_is_child_context_consistent_with_parent(new_dir, - old_inode) || - !f2fs_is_child_context_consistent_with_parent(old_dir, - new_inode))) + (old_dir != new_dir) && + (!fscrypt_has_permitted_context(new_dir, old_inode) || + !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); - if (!old_entry) + if (!old_entry) { + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); goto out; + } new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); - if (!new_entry) + if (!new_entry) { + if (IS_ERR(new_page)) + err = PTR_ERR(new_page); goto out_old; + } /* prepare for updating ".." 
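The re-lookup above guards a classic stale-pointer hazard: old_entry points into the inline dentry block, and attaching the new entry may convert that block, moving the storage out from under the pointer. The same hazard in miniature, using realloc() as the "conversion"; all names here are invented:

#include <stdlib.h>
#include <string.h>

struct table { char *buf; size_t used, cap; };

static char *find_entry(struct table *t, char c)
{
    return memchr(t->buf, c, t->used);
}

static int grow_table(struct table *t, char c)  /* may move the storage */
{
    if (t->used == t->cap) {
        char *nbuf = realloc(t->buf, t->cap * 2);
        if (!nbuf)
            return -1;
        t->buf = nbuf;
        t->cap *= 2;
    }
    t->buf[t->used++] = c;
    return 0;
}

static int rename_like(struct table *t, char oldc, char newc)
{
    char *old_entry = find_entry(t, oldc);

    if (!old_entry || grow_table(t, newc))
        return -1;

    /* old_entry may now dangle; re-derive it, just as the hunk re-calls
     * f2fs_find_entry() after an inline dentry conversion */
    old_entry = find_entry(t, oldc);
    if (!old_entry)
        return -1;
    *old_entry = 0;                     /* ~ f2fs_delete_entry() */
    return 0;
}

int main(void)
{
    struct table t = { malloc(2), 0, 2 };

    if (!t.buf)
        return 1;
    t.buf[t.used++] = 'a';
    t.buf[t.used++] = 'b';
    return rename_like(&t, 'a', 'c') ? 1 : 0;
}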
directory entry info later */ if (old_dir != new_dir) { if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); - if (!old_dir_entry) + if (!old_dir_entry) { + if (IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); goto out_new; + } } if (S_ISDIR(new_inode->i_mode)) { - err = -EIO; new_dir_entry = f2fs_parent_dir(new_inode, &new_dir_page); - if (!new_dir_entry) + if (!new_dir_entry) { + if (IS_ERR(new_dir_page)) + err = PTR_ERR(new_dir_page); goto out_old_dir; + } } } @@ -807,6 +898,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_new_dir; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); @@ -836,19 +929,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - update_inode_page(old_inode); - old_dir->i_ctime = CURRENT_TIME; if (old_nlink) { down_write(&F2FS_I(old_dir)->i_sem); - if (old_nlink < 0) - drop_nlink(old_dir); - else - inc_nlink(old_dir); + f2fs_i_links_write(old_dir, old_nlink > 0); up_write(&F2FS_I(old_dir)->i_sem); } - mark_inode_dirty(old_dir); - update_inode_page(old_dir); + f2fs_mark_inode_dirty_sync(old_dir); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); @@ -857,19 +944,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(new_inode); up_write(&F2FS_I(new_inode)->i_sem); - update_inode_page(new_inode); - new_dir->i_ctime = CURRENT_TIME; if (new_nlink) { down_write(&F2FS_I(new_dir)->i_sem); - if (new_nlink < 0) - drop_nlink(new_dir); - else - inc_nlink(new_dir); + f2fs_i_links_write(new_dir, new_nlink > 0); up_write(&F2FS_I(new_dir)->i_sem); } - mark_inode_dirty(new_dir); - update_inode_page(new_dir); + f2fs_mark_inode_dirty_sync(new_dir); f2fs_unlock_op(sbi); @@ -922,89 +1003,85 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } -#ifdef CONFIG_F2FS_FS_ENCRYPTION static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie) { struct page *cpage = NULL; char *caddr, *paddr = NULL; - struct f2fs_str cstr; - struct f2fs_str pstr = FSTR_INIT(NULL, 0); + struct fscrypt_str cstr = FSTR_INIT(NULL, 0); + struct fscrypt_str pstr = FSTR_INIT(NULL, 0); + struct fscrypt_symlink_data *sd; struct inode *inode = d_inode(dentry); - struct f2fs_encrypted_symlink_data *sd; - loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); u32 max_size = inode->i_sb->s_blocksize; int res; - res = f2fs_get_encryption_info(inode); + if (!dentry) + return ERR_PTR(-ECHILD); + + res = fscrypt_get_encryption_info(inode); if (res) return ERR_PTR(res); cpage = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(cpage)) return ERR_CAST(cpage); - caddr = kmap(cpage); - caddr[size] = 0; + caddr = page_address(cpage); /* Symlink is encrypted */ - sd = (struct f2fs_encrypted_symlink_data *)caddr; + sd = (struct fscrypt_symlink_data *)caddr; + cstr.name = sd->encrypted_path; cstr.len = le16_to_cpu(sd->len); - cstr.name = kmalloc(cstr.len, GFP_NOFS); - if (!cstr.name) { - res = -ENOMEM; - goto errout; - } - memcpy(cstr.name, sd->encrypted_path, cstr.len); /* this is broken symlink case */ - if (cstr.name[0] == 0 && cstr.len == 0) { + if (unlikely(cstr.len == 0)) { res = -ENOENT; goto errout; } - if ((cstr.len + sizeof(struct 
f2fs_encrypted_symlink_data) - 1) > - max_size) { + if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) { /* Symlink data on the disk is corrupted */ res = -EIO; goto errout; } - res = f2fs_fname_crypto_alloc_buffer(inode, cstr.len, &pstr); + res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); if (res) goto errout; - res = f2fs_fname_disk_to_usr(inode, NULL, &cstr, &pstr); + res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); if (res < 0) goto errout; - kfree(cstr.name); + /* this is broken symlink case */ + if (unlikely(pstr.name[0] == 0)) { + res = -ENOENT; + goto errout; + } paddr = pstr.name; /* Null-terminate the name */ paddr[res] = '\0'; - kunmap(cpage); - page_cache_release(cpage); + put_page(cpage); return *cookie = paddr; errout: - kfree(cstr.name); - f2fs_fname_crypto_free_buffer(&pstr); - kunmap(cpage); - page_cache_release(cpage); + fscrypt_fname_free_buffer(&pstr); + put_page(cpage); return ERR_PTR(res); } const struct inode_operations f2fs_encrypted_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = f2fs_encrypted_follow_link, - .put_link = kfree_put_link, + .follow_link = f2fs_encrypted_follow_link, + .put_link = kfree_put_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, +#ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = f2fs_listxattr, .removexattr = generic_removexattr, -}; #endif +}; const struct inode_operations f2fs_dir_inode_operations = { .create = f2fs_create, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7bcbc6e9c40d..b1e615ed2bef 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -46,12 +46,14 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) */ if (type == FREE_NIDS) { mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> - PAGE_CACHE_SHIFT; + PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> - PAGE_CACHE_SHIFT; + PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + if (excess_cached_nats(sbi)) + res = false; } else if (type == DIRTY_DENTS) { if (sbi->sb->s_bdi->wb.dirty_exceeded) return false; @@ -62,16 +64,17 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) for (i = 0; i <= UPDATE_INO; i++) mem_size += (sbi->im[i].ino_num * - sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT; + sizeof(struct ino_entry)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { - mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) + + mem_size = (atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree) + atomic_read(&sbi->total_ext_node) * - sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT; + sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else { - if (sbi->sb->s_bdi->wb.dirty_exceeded) - return false; + if (!sbi->sb->s_bdi->wb.dirty_exceeded) + return true; } return res; } @@ -120,7 +123,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) src_addr = page_address(src_page); dst_addr = page_address(dst_page); - memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); + memcpy(dst_addr, src_addr, PAGE_SIZE); set_page_dirty(dst_page); f2fs_put_page(src_page, 1); @@ -256,18 +259,21 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) return new; } -static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, +static void 
cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nat_entry *ne) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (!e) { e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); + } else { + f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino || + nat_get_blkaddr(e) != ne->block_addr || + nat_get_version(e) != ne->version); } - up_write(&nm_i->nat_tree_lock); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, @@ -355,7 +361,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; nid_t start_nid = START_NID(nid); struct f2fs_nat_block *nat_blk; struct page *page = NULL; @@ -372,21 +378,20 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); - } - up_read(&nm_i->nat_tree_lock); - if (e) + up_read(&nm_i->nat_tree_lock); return; + } memset(&ne, 0, sizeof(struct f2fs_nat_entry)); /* Check current segment summary */ - mutex_lock(&curseg->curseg_mutex); - i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0); + down_read(&curseg->journal_rwsem); + i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); if (i >= 0) { - ne = nat_in_journal(sum, i); + ne = nat_in_journal(journal, i); node_info_from_raw_nat(ni, &ne); } - mutex_unlock(&curseg->curseg_mutex); + up_read(&curseg->journal_rwsem); if (i >= 0) goto cache; @@ -397,18 +402,75 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: + up_read(&nm_i->nat_tree_lock); /* cache nat entry */ - cache_nat_entry(NM_I(sbi), nid, &ne); + down_write(&nm_i->nat_tree_lock); + cache_nat_entry(sbi, nid, &ne); + up_write(&nm_i->nat_tree_lock); +} + +/* + * readahead MAX_RA_NODE number of node pages. + */ +static void ra_node_pages(struct page *parent, int start, int n) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + struct blk_plug plug; + int i, end; + nid_t nid; + + blk_start_plug(&plug); + + /* Then, try readahead for siblings of the desired node */ + end = start + n; + end = min(end, NIDS_PER_BLOCK); + for (i = start; i < end; i++) { + nid = get_nid(parent, i, false); + ra_node_page(sbi, nid); + } + + blk_finish_plug(&plug); +} + +pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) +{ + const long direct_index = ADDRS_PER_INODE(dn->inode); + const long direct_blks = ADDRS_PER_BLOCK; + const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; + unsigned int skipped_unit = ADDRS_PER_BLOCK; + int cur_level = dn->cur_level; + int max_level = dn->max_level; + pgoff_t base = 0; + + if (!dn->max_level) + return pgofs + 1; + + while (max_level-- > cur_level) + skipped_unit *= NIDS_PER_BLOCK; + + switch (dn->max_level) { + case 3: + base += 2 * indirect_blks; + case 2: + base += 2 * direct_blks; + case 1: + base += direct_index; + break; + default: + f2fs_bug_on(F2FS_I_SB(dn->inode), 1); + } + + return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base; } /* * The maximum depth is four. * Offset[0] will have raw inode offset. 
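The f2fs_encrypted_follow_link() rework a little earlier adds two sanity checks on the on-disk record before decrypting: a zero length means a broken symlink, and a length overrunning the block means corruption. A sketch of those checks follows; the struct mirrors the shape of fscrypt_symlink_data (a 16-bit length plus a one-byte path placeholder, which is why the kernel bound subtracts 1), but the types are simplified, e.g. no explicit little-endian handling.

#include <stdint.h>
#include <stddef.h>

struct sym_rec {
    uint16_t len;       /* __le16 on disk; host-endian in this sketch */
    char path[1];       /* placeholder byte, as in fscrypt_symlink_data */
};

static int validate_symlink(const struct sym_rec *rec, size_t block_size)
{
    size_t len = rec->len;              /* ~ le16_to_cpu(sd->len) */

    if (len == 0)
        return -1;                      /* broken symlink, ~ -ENOENT */
    if (len + sizeof(struct sym_rec) - 1 > block_size)
        return -2;                      /* corrupted record, ~ -EIO */
    return 0;
}

int main(void)
{
    struct sym_rec rec = { 5, { 'x' } };
    return validate_symlink(&rec, 4096) ? 1 : 0;
}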
*/ -static int get_node_path(struct f2fs_inode_info *fi, long block, +static int get_node_path(struct inode *inode, long block, int offset[4], unsigned int noffset[4]) { - const long direct_index = ADDRS_PER_INODE(fi); + const long direct_index = ADDRS_PER_INODE(inode); const long direct_blks = ADDRS_PER_BLOCK; const long dptrs_per_blk = NIDS_PER_BLOCK; const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; @@ -493,10 +555,10 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) int offset[4]; unsigned int noffset[4]; nid_t nids[4]; - int level, i; + int level, i = 0; int err = 0; - level = get_node_path(F2FS_I(dn->inode), index, offset, noffset); + level = get_node_path(dn->inode, index, offset, noffset); nids[0] = dn->inode->i_ino; npage[0] = dn->inode_page; @@ -583,6 +645,11 @@ release_pages: release_out: dn->inode_page = NULL; dn->node_page = NULL; + if (err == -ENOENT) { + dn->cur_level = i; + dn->max_level = level; + dn->ofs_in_node = offset[level]; + } return err; } @@ -606,8 +673,7 @@ static void truncate_node(struct dnode_of_data *dn) if (dn->nid == dn->inode->i_ino) { remove_orphan_inode(sbi, dn->nid); dec_valid_inode_count(sbi); - } else { - sync_inode_page(dn); + f2fs_inode_synced(dn->inode); } invalidate: clear_node_page_dirty(dn->node_page); @@ -666,6 +732,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, return PTR_ERR(page); } + ra_node_pages(page, ofs, NIDS_PER_BLOCK); + rn = F2FS_NODE(page); if (depth < 3) { for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { @@ -676,7 +744,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, ret = truncate_dnode(&rdn); if (ret < 0) goto out_err; - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; } } else { child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1; @@ -689,7 +758,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, rdn.nid = child_nid; ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1); if (ret == (NIDS_PER_BLOCK + 1)) { - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; child_nofs += ret; } else if (ret < 0 && ret != -ENOENT) { goto out_err; @@ -741,6 +811,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, nid[i + 1] = get_nid(pages[i], offset[i + 1], false); } + ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK); + /* free direct nodes linked to a partial indirect node */ for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { child_nid = get_nid(pages[idx], i, false); @@ -750,7 +822,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, err = truncate_dnode(dn); if (err < 0) goto fail; - set_nid(pages[idx], i, 0, false); + if (set_nid(pages[idx], i, 0, false)) + dn->node_changed = true; } if (offset[idx + 1] == 0) { @@ -787,8 +860,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); - level = get_node_path(F2FS_I(inode), from, offset, noffset); -restart: + level = get_node_path(inode, from, offset, noffset); + page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); @@ -852,11 +925,8 @@ skip_partial: if (offset[1] == 0 && ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); - goto restart; - } - f2fs_wait_on_page_writeback(page, NODE); + BUG_ON(page->mapping != NODE_MAPPING(sbi)); + 
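get_node_path() above (and get_next_page_offset() before it) lean on the same geometry: a file block index falls either in the inode's own pointers, in one of two direct nodes, in one of two indirect nodes, or in the double-indirect subtree. A sketch of the level computation; the three constants are placeholders for ADDRS_PER_INODE, ADDRS_PER_BLOCK and NIDS_PER_BLOCK, whose real values depend on the on-disk layout.

#include <stdio.h>

#define DIRECT_INDEX  923L   /* ~ ADDRS_PER_INODE: pointers in the inode */
#define DIRECT_BLKS  1018L   /* ~ ADDRS_PER_BLOCK: pointers per dnode    */
#define NIDS_PER_BLK 1018L   /* ~ NIDS_PER_BLOCK: nids per indirect node */

/* Return the tree depth (0..3) a file block index lands on, mirroring
 * the cascade of range tests in get_node_path(). */
static int node_level(long block)
{
    long direct = 2 * DIRECT_BLKS;                  /* two direct nodes   */
    long indirect = 2 * DIRECT_BLKS * NIDS_PER_BLK; /* two indirect nodes */

    if (block < DIRECT_INDEX)
        return 0;                                   /* inside the inode   */
    block -= DIRECT_INDEX;
    if (block < direct)
        return 1;
    block -= direct;
    if (block < indirect)
        return 2;
    return 3;                                       /* double indirect    */
}

int main(void)
{
    printf("%d %d %d\n", node_level(100), node_level(2000),
           node_level(3000000));
    return 0;
}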
f2fs_wait_on_page_writeback(page, NODE, true); ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); unlock_page(page); @@ -885,7 +955,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page) if (IS_ERR(npage)) return PTR_ERR(npage); - F2FS_I(inode)->i_xattr_nid = 0; + f2fs_i_xnid_write(inode, 0); /* need to do checkpoint during fsync */ F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); @@ -951,10 +1021,10 @@ struct page *new_node_page(struct dnode_of_data *dn, struct page *page; int err; - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false); if (!page) return ERR_PTR(-ENOMEM); @@ -971,23 +1041,19 @@ struct page *new_node_page(struct dnode_of_data *dn, new_ni.ino = dn->inode->i_ino; set_node_addr(sbi, &new_ni, NEW_ADDR, false); - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); - SetPageUptodate(page); - set_page_dirty(page); + if (!PageUptodate(page)) + SetPageUptodate(page); + if (set_page_dirty(page)) + dn->node_changed = true; if (f2fs_has_xattr_block(ofs)) - F2FS_I(dn->inode)->i_xattr_nid = dn->nid; + f2fs_i_xnid_write(dn->inode, dn->nid); - dn->node_page = page; - if (ipage) - update_inode(dn->inode, ipage); - else - sync_inode_page(dn); if (ofs == 0) inc_valid_inode_count(sbi); - return page; fail: @@ -1013,6 +1079,9 @@ static int read_node_page(struct page *page, int rw) .encrypted_page = NULL, }; + if (PageUptodate(page)) + return LOCKED_PAGE; + get_node_info(sbi, page->index, &ni); if (unlikely(ni.blk_addr == NULL_ADDR)) { @@ -1020,10 +1089,7 @@ static int read_node_page(struct page *page, int rw) return -ENOENT; } - if (PageUptodate(page)) - return LOCKED_PAGE; - - fio.blk_addr = ni.blk_addr; + fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr; return f2fs_submit_page_bio(&fio); } @@ -1035,14 +1101,17 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) struct page *apage; int err; - apage = find_get_page(NODE_MAPPING(sbi), nid); - if (apage && PageUptodate(apage)) { - f2fs_put_page(apage, 0); + if (!nid) return; - } - f2fs_put_page(apage, 0); + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); - apage = grab_cache_page(NODE_MAPPING(sbi), nid); + rcu_read_lock(); + apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid); + rcu_read_unlock(); + if (apage) + return; + + apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!apage) return; @@ -1050,53 +1119,17 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, err ? 
1 : 0); } -struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, + struct page *parent, int start) { struct page *page; int err; -repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); - if (!page) - return ERR_PTR(-ENOMEM); - err = read_node_page(page, READ_SYNC); - if (err < 0) { - f2fs_put_page(page, 1); - return ERR_PTR(err); - } else if (err != LOCKED_PAGE) { - lock_page(page); - } - - if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { - ClearPageUptodate(page); - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); - goto repeat; - } - return page; -} - -/* - * Return a locked page for the desired node page. - * And, readahead MAX_RA_NODE number of node pages. - */ -struct page *get_node_page_ra(struct page *parent, int start) -{ - struct f2fs_sb_info *sbi = F2FS_P_SB(parent); - struct blk_plug plug; - struct page *page; - int err, i, end; - nid_t nid; - - /* First, try getting the desired direct node. */ - nid = get_nid(parent, start, false); if (!nid) return ERR_PTR(-ENOENT); + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!page) return ERR_PTR(-ENOMEM); @@ -1108,61 +1141,116 @@ repeat: goto page_hit; } - blk_start_plug(&plug); - - /* Then, try readahead for siblings of the desired node */ - end = start + MAX_RA_NODE; - end = min(end, NIDS_PER_BLOCK); - for (i = start + 1; i < end; i++) { - nid = get_nid(parent, i, false); - if (!nid) - continue; - ra_node_page(sbi, nid); - } - - blk_finish_plug(&plug); + if (parent) + ra_node_pages(parent, start + 1, MAX_RA_NODE); lock_page(page); + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } + + if (unlikely(!PageUptodate(page))) + goto out_err; page_hit: - if (unlikely(!PageUptodate(page))) { + if(unlikely(nid != nid_of_node(page))) { + f2fs_bug_on(sbi, 1); + ClearPageUptodate(page); +out_err: f2fs_put_page(page, 1); return ERR_PTR(-EIO); } return page; } -void sync_inode_page(struct dnode_of_data *dn) +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) { - update_inode(dn->inode, dn->node_page); - } else if (dn->inode_page) { - if (!dn->inode_page_locked) - lock_page(dn->inode_page); - update_inode(dn->inode, dn->inode_page); - if (!dn->inode_page_locked) - unlock_page(dn->inode_page); - } else { - update_inode_page(dn->inode); - } + return __get_node_page(sbi, nid, NULL, 0); } -int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, - struct writeback_control *wbc) +struct page *get_node_page_ra(struct page *parent, int start) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + nid_t nid = get_nid(parent, start, false); + + return __get_node_page(sbi, nid, parent, start); +} + +static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct inode *inode; + struct page *page; + int ret; + + /* should flush inline_data before evict_inode */ + inode = ilookup(sbi->sb, ino); + if (!inode) + return; + + page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0); + if (!page) + goto iput_out; + + if (!PageUptodate(page)) + goto page_out; + + if (!PageDirty(page)) + goto page_out; + + if (!clear_page_dirty_for_io(page)) + goto page_out; + + ret = f2fs_write_inline_data(inode, page); + 
inode_dec_dirty_pages(inode); + if (ret) + set_page_dirty(page); +page_out: + f2fs_put_page(page, 1); +iput_out: + iput(inode); +} + +void move_node_page(struct page *node_page, int gc_type) +{ + if (gc_type == FG_GC) { + struct f2fs_sb_info *sbi = F2FS_P_SB(node_page); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + .for_reclaim = 0, + }; + + set_page_dirty(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, true); + + f2fs_bug_on(sbi, PageWriteback(node_page)); + if (!clear_page_dirty_for_io(node_page)) + goto out_page; + + if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc)) + unlock_page(node_page); + goto release_page; + } else { + /* set page dirty and write it */ + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } +out_page: + unlock_page(node_page); +release_page: + f2fs_put_page(node_page, 0); +} + +static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index, end; struct pagevec pvec; - int step = ino ? 2 : 0; - int nwritten = 0, wrote = 0; + struct page *last_page = NULL; pagevec_init(&pvec, 0); - -next_step: index = 0; - end = LONG_MAX; + end = ULONG_MAX; while (index <= end) { int i, nr_pages; @@ -1175,6 +1263,190 @@ next_step: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + return ERR_PTR(-EIO); + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (last_page) + f2fs_put_page(last_page, 0); + + get_page(page); + last_page = page; + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + return last_page; +} + +int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic) +{ + pgoff_t index, end; + struct pagevec pvec; + int ret = 0; + struct page *last_page = NULL; + bool marked = false; + nid_t ino = inode->i_ino; + int nwritten = 0; + + if (atomic) { + last_page = last_fsync_dnode(sbi, ino); + if (IS_ERR_OR_NULL(last_page)) + return PTR_ERR_OR_ZERO(last_page); + } +retry: + pagevec_init(&pvec, 0); + index = 0; + end = ULONG_MAX; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + return -EIO; + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page) && page != last_page) { + /* someone wrote it for us */ + goto continue_unlock; + } + + f2fs_wait_on_page_writeback(page, NODE, true); + BUG_ON(PageWriteback(page)); + + if (!atomic || page == last_page) { + set_fsync_mark(page, 1); + if (IS_INODE(page)) { + if (is_inode_flag_set(inode, + FI_DIRTY_INODE)) + update_inode(inode, page); + 
set_dentry_mark(page, + need_dentry_mark(sbi, ino)); + } + /* may be written by other thread */ + if (!PageDirty(page)) + set_page_dirty(page); + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); + if (ret) { + unlock_page(page); + f2fs_put_page(last_page, 0); + break; + } else { + nwritten++; + } + + if (page == last_page) { + f2fs_put_page(page, 0); + marked = true; + break; + } + } + pagevec_release(&pvec); + cond_resched(); + + if (ret || marked) + break; + } + if (!ret && atomic && !marked) { + f2fs_msg(sbi->sb, KERN_DEBUG, + "Retry to write fsync mark: ino=%u, idx=%lx", + ino, last_page->index); + lock_page(last_page); + set_page_dirty(last_page); + unlock_page(last_page); + goto retry; + } + + if (nwritten) + f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); + return ret ? -EIO: 0; +} + +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) +{ + pgoff_t index, end; + struct pagevec pvec; + int step = 0; + int nwritten = 0; + int ret = 0; + + pagevec_init(&pvec, 0); + +next_step: + index = 0; + end = ULONG_MAX; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + pagevec_release(&pvec); + ret = -EIO; + goto out; + } + /* * flushing sequence with step: * 0. indirect nodes @@ -1189,14 +1461,8 @@ next_step: if (step == 2 && (!IS_DNODE(page) || !is_cold_node(page))) continue; - - /* - * If an fsync mode, - * we should not skip writing node pages. - */ - if (ino && ino_of_node(page) == ino) - lock_page(page); - else if (!trylock_page(page)) +lock_node: + if (!trylock_page(page)) continue; if (unlikely(page->mapping != NODE_MAPPING(sbi))) { @@ -1204,33 +1470,33 @@ continue_unlock: unlock_page(page); continue; } - if (ino && ino_of_node(page) != ino) - goto continue_unlock; if (!PageDirty(page)) { /* someone wrote it for us */ goto continue_unlock; } + /* flush inline_data */ + if (is_inline_node(page)) { + clear_inline_node(page); + unlock_page(page); + flush_inline_data(sbi, ino_of_node(page)); + goto lock_node; + } + + f2fs_wait_on_page_writeback(page, NODE, true); + + BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; - /* called by fsync() */ - if (ino && IS_DNODE(page)) { - set_fsync_mark(page, 1); - if (IS_INODE(page)) - set_dentry_mark(page, - need_dentry_mark(sbi, ino)); - nwritten++; - } else { - set_fsync_mark(page, 0); - set_dentry_mark(page, 0); - } + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) unlock_page(page); else - wrote++; + nwritten++; if (--wbc->nr_to_write == 0) break; @@ -1248,15 +1514,15 @@ continue_unlock: step++; goto next_step; } - - if (wrote) +out: + if (nwritten) f2fs_submit_merged_bio(sbi, NODE, WRITE); - return nwritten; + return ret; } int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { - pgoff_t index = 0, end = LONG_MAX; + pgoff_t index = 0, end = ULONG_MAX; struct pagevec pvec; int ret2 = 0, ret = 0; @@ -1278,7 +1544,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) continue; if (ino && ino_of_node(page) == ino) { - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); if (TestClearPageError(page)) 
ret = -EIO; } @@ -1317,8 +1583,6 @@ static int f2fs_write_node_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - f2fs_wait_on_page_writeback(page, NODE); - /* get old block addr of this node page */ nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); @@ -1342,14 +1606,18 @@ static int f2fs_write_node_page(struct page *page, } set_page_writeback(page); - fio.blk_addr = ni.blk_addr; + fio.old_blkaddr = ni.blk_addr; write_node_page(nid, &fio); - set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page)); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); up_read(&sbi->node_write); - unlock_page(page); if (wbc->for_reclaim) + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, NODE, WRITE); return 0; @@ -1363,10 +1631,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct blk_plug plug; long diff; - trace_f2fs_writepages(mapping->host, wbc, NODE); - /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); @@ -1374,14 +1641,19 @@ static int f2fs_write_node_pages(struct address_space *mapping, if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) goto skip_write; + trace_f2fs_writepages(mapping->host, wbc, NODE); + diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; - sync_node_pages(sbi, 0, wbc); + blk_start_plug(&plug); + sync_node_pages(sbi, wbc); + blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; skip_write: wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); + trace_f2fs_writepages(mapping->host, wbc, NODE); return 0; } @@ -1389,9 +1661,10 @@ static int f2fs_set_node_page_dirty(struct page *page) { trace_f2fs_set_page_dirty(page, NODE); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); SetPagePrivate(page); f2fs_trace_pid(page); @@ -1409,6 +1682,9 @@ const struct address_space_operations f2fs_node_aops = { .set_page_dirty = f2fs_set_node_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, @@ -1429,7 +1705,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; - bool allocated = false; if (!available_free_memory(sbi, FREE_NIDS)) return -1; @@ -1440,14 +1715,9 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) if (build) { /* do not add allocated nids */ - down_read(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); - if (ne && - (!get_nat_flag(ne, IS_CHECKPOINTED) || + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) - allocated = true; - up_read(&nm_i->nat_tree_lock); - if (allocated) return 0; } @@ -1516,22 +1786,24 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -static void build_free_nids(struct f2fs_sb_info *sbi) +void build_free_nids(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct 
f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; int i = 0; nid_t nid = nm_i->next_scan_nid; /* Enough entries */ - if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK) + if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK) return; /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); + down_read(&nm_i->nat_tree_lock); + while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1550,16 +1822,19 @@ static void build_free_nids(struct f2fs_sb_info *sbi) nm_i->next_scan_nid = nid; /* find free nids from current sum_pages */ - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < nats_in_cursum(sum); i++) { - block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr); - nid = le32_to_cpu(nid_in_journal(sum, i)); + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); if (addr == NULL_ADDR) add_free_nid(sbi, nid, true); else remove_free_nid(nm_i, nid); } - mutex_unlock(&curseg->curseg_mutex); + up_read(&curseg->journal_rwsem); + up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); @@ -1575,6 +1850,10 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; retry: +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_ALLOC_NID)) + return false; +#endif if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) return false; @@ -1582,8 +1861,6 @@ retry: /* We should not use stale free nids created by build_free_nids */ if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - struct node_info ni; - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) @@ -1594,13 +1871,6 @@ retry: i->state = NID_ALLOC; nm_i->fcnt--; spin_unlock(&nm_i->free_nid_list_lock); - - /* check nid is allocated already */ - get_node_info(sbi, *nid, &ni); - if (ni.blk_addr != NULL_ADDR) { - alloc_nid_done(sbi, *nid); - goto retry; - } return true; } spin_unlock(&nm_i->free_nid_list_lock); @@ -1663,12 +1933,15 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; + if (nm_i->fcnt <= MAX_FREE_NIDS) + return 0; + if (!mutex_trylock(&nm_i->build_lock)) return 0; spin_lock(&nm_i->free_nid_list_lock); list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || nm_i->fcnt <= NAT_ENTRY_PER_BLOCK) + if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS) break; if (i->state == NID_ALLOC) continue; @@ -1695,7 +1968,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page) ri = F2FS_INODE(page); if (!(ri->i_inline & F2FS_INLINE_XATTR)) { - clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR); + clear_inode_flag(inode, FI_INLINE_XATTR); goto update_inode; } @@ -1703,7 +1976,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page) src_addr = inline_xattr_addr(page); inline_size = inline_xattr_size(inode); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); memcpy(dst_addr, src_addr, inline_size); update_inode: update_inode(inode, ipage); @@ -1737,13 +2010,11 @@ recover_xnid: get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); - F2FS_I(inode)->i_xattr_nid = new_xnid; + 
f2fs_i_xnid_write(inode, new_xnid); /* 3: update xattr blkaddr */ refresh_sit_entry(sbi, NEW_ADDR, blkaddr); set_node_addr(sbi, &ni, blkaddr, false); - - update_inode_page(inode); } int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) @@ -1757,15 +2028,18 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (unlikely(old_ni.blk_addr != NULL_ADDR)) return -EINVAL; - - ipage = grab_cache_page(NODE_MAPPING(sbi), ino); - if (!ipage) - return -ENOMEM; +retry: + ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); + if (!ipage) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } /* Should not use this inode from free nid list */ remove_free_nid(NM_I(sbi), ino); - SetPageUptodate(ipage); + if (!PageUptodate(ipage)) + SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); src = F2FS_INODE(page); @@ -1831,28 +2105,26 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; int i; - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < nats_in_cursum(sum); i++) { + down_write(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { struct nat_entry *ne; struct f2fs_nat_entry raw_ne; - nid_t nid = le32_to_cpu(nid_in_journal(sum, i)); + nid_t nid = le32_to_cpu(nid_in_journal(journal, i)); - raw_ne = nat_in_journal(sum, i); + raw_ne = nat_in_journal(journal, i); - down_write(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); if (!ne) { ne = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&ne->ni, &raw_ne); } __set_nat_cache_dirty(nm_i, ne); - up_write(&nm_i->nat_tree_lock); } - update_nats_in_cursum(sum, -i); - mutex_unlock(&curseg->curseg_mutex); + update_nats_in_cursum(journal, -i); + up_write(&curseg->journal_rwsem); } static void __adjust_nat_entry_set(struct nat_entry_set *nes, @@ -1877,24 +2149,23 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, struct nat_entry_set *set) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK; bool to_journal = true; struct f2fs_nat_block *nat_blk; struct nat_entry *ne, *cur; struct page *page = NULL; - struct f2fs_nm_info *nm_i = NM_I(sbi); /* * there are two steps to flush nat entries: * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. 
*/ - if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL)) + if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; if (to_journal) { - mutex_lock(&curseg->curseg_mutex); + down_write(&curseg->journal_rwsem); } else { page = get_next_nat_page(sbi, start_nid); nat_blk = page_address(page); @@ -1911,35 +2182,29 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, continue; if (to_journal) { - offset = lookup_journal_in_cursum(sum, + offset = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 1); f2fs_bug_on(sbi, offset < 0); - raw_ne = &nat_in_journal(sum, offset); - nid_in_journal(sum, offset) = cpu_to_le32(nid); + raw_ne = &nat_in_journal(journal, offset); + nid_in_journal(journal, offset) = cpu_to_le32(nid); } else { raw_ne = &nat_blk->entries[nid - start_nid]; } raw_nat_from_node_info(raw_ne, &ne->ni); - - down_write(&NM_I(sbi)->nat_tree_lock); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - up_write(&NM_I(sbi)->nat_tree_lock); - if (nat_get_blkaddr(ne) == NULL_ADDR) add_free_nid(sbi, nid, false); } if (to_journal) - mutex_unlock(&curseg->curseg_mutex); + up_write(&curseg->journal_rwsem); else f2fs_put_page(page, 1); f2fs_bug_on(sbi, set->entry_cnt); - down_write(&nm_i->nat_tree_lock); radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - up_write(&nm_i->nat_tree_lock); kmem_cache_free(nat_entry_set_slab, set); } @@ -1950,7 +2215,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; struct nat_entry_set *setvec[SETVEC_SIZE]; struct nat_entry_set *set, *tmp; unsigned int found; @@ -1959,29 +2224,32 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!nm_i->dirty_nat_cnt) return; + + down_write(&nm_i->nat_tree_lock); + /* * if there are no enough space in journal to store dirty nat * entries, remove all entries from journal and merge them * into nat entry set. 
*/ - if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); - down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_set(nm_i, set_idx, SETVEC_SIZE, setvec))) { unsigned idx; set_idx = setvec[found - 1]->set + 1; for (idx = 0; idx < found; idx++) __adjust_nat_entry_set(setvec[idx], &sets, - MAX_NAT_JENTRIES(sum)); + MAX_NAT_JENTRIES(journal)); } - up_write(&nm_i->nat_tree_lock); /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) __flush_nat_entry_set(sbi, set); + up_write(&nm_i->nat_tree_lock); + f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } @@ -2006,6 +2274,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; + nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e4fffd2d98c4..868bec65e51c 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -15,15 +15,21 @@ #define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ -#define FREE_NID_PAGES 4 +#define FREE_NID_PAGES 8 +#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) -#define DEF_RA_NID_PAGES 4 /* # of nid pages to be readaheaded */ +#define DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */ /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 /* control the memory footprint threshold (10MB per 1GB ram) */ -#define DEF_RAM_THRESHOLD 10 +#define DEF_RAM_THRESHOLD 1 + +/* control dirty nats ratio threshold (default: 10% over max nid count) */ +#define DEF_DIRTY_NAT_RATIO_THRESHOLD 10 +/* control total # of nats */ +#define DEF_NAT_CACHE_THRESHOLD 100000 /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 @@ -117,6 +123,17 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, raw_ne->version = ni->version; } +static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid * + NM_I(sbi)->dirty_nats_ratio / 100; +} + +static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; +} + enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ @@ -183,7 +200,7 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) block_addr = (pgoff_t)(nm_i->nat_blkaddr + (seg_off << sbi->log_blocks_per_seg << 1) + - (block_off & ((1 << sbi->log_blocks_per_seg) - 1))); + (block_off & (sbi->blocks_per_seg - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -212,6 +229,37 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) f2fs_change_bit(block_off, nm_i->nat_bitmap); } +static inline nid_t ino_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.ino); +} + +static inline nid_t nid_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.nid); +} + +static inline unsigned int ofs_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + unsigned flag = 
le32_to_cpu(rn->footer.flag); + return flag >> OFFSET_BIT_SHIFT; +} + +static inline __u64 cpver_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le64_to_cpu(rn->footer.cp_ver); +} + +static inline block_t next_blkaddr_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.next_blkaddr); +} + static inline void fill_node_footer(struct page *page, nid_t nid, nid_t ino, unsigned int ofs, bool reset) { @@ -242,40 +290,30 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); + __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver); - rn->footer.cp_ver = ckpt->checkpoint_ver; + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { + __u64 crc = le32_to_cpu(*((__le32 *) + ((unsigned char *)ckpt + crc_offset))); + cp_ver |= (crc << 32); + } + rn->footer.cp_ver = cpu_to_le64(cp_ver); rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } -static inline nid_t ino_of_node(struct page *node_page) +static inline bool is_recoverable_dnode(struct page *page) { - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.ino); -} + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); + __u64 cp_ver = cur_cp_version(ckpt); -static inline nid_t nid_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.nid); -} - -static inline unsigned int ofs_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - unsigned flag = le32_to_cpu(rn->footer.flag); - return flag >> OFFSET_BIT_SHIFT; -} - -static inline unsigned long long cpver_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le64_to_cpu(rn->footer.cp_ver); -} - -static inline block_t next_blkaddr_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.next_blkaddr); + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { + __u64 crc = le32_to_cpu(*((__le32 *) + ((unsigned char *)ckpt + crc_offset))); + cp_ver |= (crc << 32); + } + return cpu_to_le64(cp_ver) == cpver_of_node(page); } /* @@ -317,17 +355,17 @@ static inline bool IS_DNODE(struct page *node_page) return true; } -static inline void set_nid(struct page *p, int off, nid_t nid, bool i) +static inline int set_nid(struct page *p, int off, nid_t nid, bool i) { struct f2fs_node *rn = F2FS_NODE(p); - f2fs_wait_on_page_writeback(p, NODE); + f2fs_wait_on_page_writeback(p, NODE, true); if (i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); else rn->in.nid[off] = cpu_to_le32(nid); - set_page_dirty(p); + return set_page_dirty(p); } static inline nid_t get_nid(struct page *p, int off, bool i) @@ -370,6 +408,21 @@ static inline int is_node(struct page *page, int type) #define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) #define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) +static inline int is_inline_node(struct page *page) +{ + return PageChecked(page); +} + +static inline void set_inline_node(struct page *page) +{ + SetPageChecked(page); +} + +static inline void clear_inline_node(struct page *page) +{ + ClearPageChecked(page); +} + static inline void set_cold_node(struct inode *inode, struct page *page) { struct f2fs_node *rn = F2FS_NODE(page); diff --git 
a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index cbf74f47cce8..2fc84a991325 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -49,8 +49,9 @@ static struct kmem_cache *fsync_entry_slab; bool space_for_roll_forward(struct f2fs_sb_info *sbi) { - if (sbi->last_valid_block_count + sbi->alloc_valid_block_count - > sbi->user_block_count) + s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); + + if (sbi->last_valid_block_count + nalloc > sbi->user_block_count) return false; return true; } @@ -67,42 +68,71 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, return NULL; } -static int recover_dentry(struct inode *inode, struct page *ipage) +static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, + struct list_head *head, nid_t ino) +{ + struct inode *inode; + struct fsync_inode_entry *entry; + + inode = f2fs_iget_retry(sbi->sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); + entry->inode = inode; + list_add_tail(&entry->list, head); + + return entry; +} + +static void del_fsync_inode(struct fsync_inode_entry *entry) +{ + iput(entry->inode); + list_del(&entry->list); + kmem_cache_free(fsync_entry_slab, entry); +} + +static int recover_dentry(struct inode *inode, struct page *ipage, + struct list_head *dir_list) { struct f2fs_inode *raw_inode = F2FS_INODE(ipage); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; - struct qstr name; + struct fscrypt_name fname; struct page *page; struct inode *dir, *einode; + struct fsync_inode_entry *entry; int err = 0; + char *name; - dir = f2fs_iget(inode->i_sb, pino); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; + entry = get_fsync_inode(dir_list, pino); + if (!entry) { + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino); + if (IS_ERR(entry)) { + dir = ERR_CAST(entry); + err = PTR_ERR(entry); + goto out; + } } - if (file_enc_name(inode)) { - iput(dir); - return 0; - } + dir = entry->inode; - name.len = le32_to_cpu(raw_inode->i_namelen); - name.name = raw_inode->i_name; + memset(&fname, 0, sizeof(struct fscrypt_name)); + fname.disk_name.len = le32_to_cpu(raw_inode->i_namelen); + fname.disk_name.name = raw_inode->i_name; - if (unlikely(name.len > F2FS_NAME_LEN)) { + if (unlikely(fname.disk_name.len > F2FS_NAME_LEN)) { WARN_ON(1); err = -ENAMETOOLONG; - goto out_err; + goto out; } retry: - de = f2fs_find_entry(dir, &name, &page); + de = __f2fs_find_entry(dir, &fname, &page); if (de && inode->i_ino == le32_to_cpu(de->ino)) goto out_unmap_put; if (de) { - einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); + einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino)); if (IS_ERR(einode)) { WARN_ON(1); err = PTR_ERR(einode); @@ -118,29 +148,27 @@ retry: f2fs_delete_entry(de, page, dir, einode); iput(einode); goto retry; - } - err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode); - if (err) - goto out_err; - - if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) { - iput(dir); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); } else { - add_dirty_dir_inode(dir); - set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); + err = __f2fs_do_add_link(dir, &fname, inode, + inode->i_ino, inode->i_mode); } - + if (err == -ENOMEM) + goto retry; goto out; out_unmap_put: f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); -out_err: - iput(dir); out: + if (file_enc_name(inode)) + name = ""; + else + name = raw_inode->i_name; f2fs_msg(inode->i_sb, KERN_NOTICE, "%s: ino = %x, name = %s, dir 
= %lx, err = %d", - __func__, ino_of_node(ipage), raw_inode->i_name, + __func__, ino_of_node(ipage), name, IS_ERR(dir) ? 0 : dir->i_ino, err); return err; } @@ -151,7 +179,7 @@ static void recover_inode(struct inode *inode, struct page *page) char *name; inode->i_mode = le16_to_cpu(raw->i_mode); - i_size_write(inode, le64_to_cpu(raw->i_size)); + f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); @@ -168,9 +196,34 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } +static bool is_same_inode(struct inode *inode, struct page *ipage) +{ + struct f2fs_inode *ri = F2FS_INODE(ipage); + struct timespec disk; + + if (!IS_INODE(ipage)) + return true; + + disk.tv_sec = le64_to_cpu(ri->i_ctime); + disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); + if (timespec_compare(&inode->i_ctime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_atime); + disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); + if (timespec_compare(&inode->i_atime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_mtime); + disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); + if (timespec_compare(&inode->i_mtime, &disk) > 0) + return false; + + return true; +} + static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { - unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page = NULL; block_t blkaddr; @@ -180,8 +233,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - ra_meta_pages(sbi, blkaddr, 1, META_POR, true); - while (1) { struct fsync_inode_entry *entry; @@ -190,49 +241,41 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) page = get_tmp_page(sbi, blkaddr); - if (cp_ver != cpver_of_node(page)) + if (!is_recoverable_dnode(page)) break; if (!is_fsync_dnode(page)) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (!entry) { + if (entry) { + if (!is_same_inode(entry->inode, page)) + goto next; + } else { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; } - /* add this fsync inode to the list */ - entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); - if (!entry) { - err = -ENOMEM; - break; - } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. 
*/ - entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); - if (IS_ERR(entry->inode)) { - err = PTR_ERR(entry->inode); - kmem_cache_free(fsync_entry_slab, entry); + entry = add_fsync_inode(sbi, head, ino_of_node(page)); + if (IS_ERR(entry)) { + err = PTR_ERR(entry); if (err == -ENOENT) { err = 0; goto next; } break; } - list_add_tail(&entry->list, head); } entry->blkaddr = blkaddr; - if (IS_INODE(page)) { - entry->last_inode = blkaddr; - if (is_dent_dnode(page)) - entry->last_dentry = blkaddr; - } + if (IS_INODE(page) && is_dent_dnode(page)) + entry->last_dentry = blkaddr; next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); @@ -248,11 +291,8 @@ static void destroy_fsync_dnodes(struct list_head *head) { struct fsync_inode_entry *entry, *tmp; - list_for_each_entry_safe(entry, tmp, head, list) { - iput(entry->inode); - list_del(&entry->list); - kmem_cache_free(fsync_entry_slab, entry); - } + list_for_each_entry_safe(entry, tmp, head, list) + del_fsync_inode(entry); } static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, @@ -314,15 +354,14 @@ got_it: if (ino != dn->inode->i_ino) { /* Deallocate previous index in the node page */ - inode = f2fs_iget(sbi->sb, ino); + inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); } else { inode = dn->inode; } - bidx = start_bidx_of_node(offset, F2FS_I(inode)) + - le16_to_cpu(sum.ofs_in_node); + bidx = start_bidx_of_node(offset, inode) + le16_to_cpu(sum.ofs_in_node); /* * if inode page is locked, unlock temporarily, but its reference @@ -357,10 +396,9 @@ truncate_out: static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, block_t blkaddr) { - struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int start, end; struct dnode_of_data dn; struct node_info ni; + unsigned int start, end; int err = 0, recovered = 0; /* step 1: recover xattr */ @@ -380,16 +418,21 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, goto out; /* step 3: recover data indices */ - start = start_bidx_of_node(ofs_of_node(page), fi); - end = start + ADDRS_PER_PAGE(page, fi); + start = start_bidx_of_node(ofs_of_node(page), inode); + end = start + ADDRS_PER_PAGE(page, inode); set_new_dnode(&dn, inode, NULL, NULL, 0); - +retry_dn: err = get_dnode_of_data(&dn, start, ALLOC_NODE); - if (err) + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_dn; + } goto out; + } - f2fs_wait_on_page_writeback(dn.node_page, NODE); + f2fs_wait_on_page_writeback(dn.node_page, NODE, true); get_node_info(sbi, dn.nid, &ni); f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); @@ -411,14 +454,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, continue; } + if ((start + 1) << PAGE_SHIFT > i_size_read(inode)) + f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); + /* * dest is reserved block, invalidate src block * and then reserve one new block in dnode page. 
*/ if (dest == NEW_ADDR) { truncate_data_blocks_range(&dn, 1); - err = reserve_new_block(&dn); - f2fs_bug_on(sbi, err); + reserve_new_block(&dn); continue; } @@ -427,25 +472,33 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (src == NULL_ADDR) { err = reserve_new_block(&dn); +#ifdef CONFIG_F2FS_FAULT_INJECTION + while (err) + err = reserve_new_block(&dn); +#endif /* We should not get -ENOSPC */ f2fs_bug_on(sbi, err); + if (err) + goto err; } - +retry_prev: /* Check the previous node page having this index */ err = check_index_in_prev_nodes(sbi, dest, &dn); - if (err) + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_prev; + } goto err; + } /* write dummy data page */ f2fs_replace_block(sbi, &dn, src, dest, - ni.version, false); + ni.version, false, false); recovered++; } } - if (IS_INODE(dn.node_page)) - sync_inode_page(&dn); - copy_node_footer(dn.node_page, page); fill_node_footer(dn.node_page, dn.nid, ni.ino, ofs_of_node(page), false); @@ -459,17 +512,16 @@ out: return err; } -static int recover_data(struct f2fs_sb_info *sbi, - struct list_head *head, int type) +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, + struct list_head *dir_list) { - unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page = NULL; int err = 0; block_t blkaddr; /* get node pages in the current segment */ - curseg = CURSEG_I(sbi, type); + curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); while (1) { @@ -482,12 +534,12 @@ static int recover_data(struct f2fs_sb_info *sbi, page = get_tmp_page(sbi, blkaddr); - if (cp_ver != cpver_of_node(page)) { + if (!is_recoverable_dnode(page)) { f2fs_put_page(page, 1); break; } - entry = get_fsync_inode(head, ino_of_node(page)); + entry = get_fsync_inode(inode_list, ino_of_node(page)); if (!entry) goto next; /* @@ -495,10 +547,10 @@ static int recover_data(struct f2fs_sb_info *sbi, * In this case, we can lose the latest inode(x). * So, call recover_inode for the inode update. 
	 */
-		if (entry->last_inode == blkaddr)
+		if (IS_INODE(page))
 			recover_inode(entry->inode, page);
 		if (entry->last_dentry == blkaddr) {
-			err = recover_dentry(entry->inode, page);
+			err = recover_dentry(entry->inode, page, dir_list);
 			if (err) {
 				f2fs_put_page(page, 1);
 				break;
@@ -510,11 +562,8 @@ static int recover_data(struct f2fs_sb_info *sbi,
 			break;
 		}
 
-		if (entry->blkaddr == blkaddr) {
-			iput(entry->inode);
-			list_del(&entry->list);
-			kmem_cache_free(fsync_entry_slab, entry);
-		}
+		if (entry->blkaddr == blkaddr)
+			del_fsync_inode(entry);
 next:
 		/* check next segment */
 		blkaddr = next_blkaddr_of_node(page);
@@ -525,12 +574,14 @@ next:
 	return err;
 }
 
-int recover_fsync_data(struct f2fs_sb_info *sbi)
+int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
 {
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
 	struct list_head inode_list;
+	struct list_head dir_list;
 	block_t blkaddr;
 	int err;
+	int ret = 0;
 	bool need_writecp = false;
 
 	fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
@@ -539,6 +590,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&inode_list);
+	INIT_LIST_HEAD(&dir_list);
 
 	/* prevent checkpoint */
 	mutex_lock(&sbi->cp_mutex);
 
@@ -547,25 +599,26 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
 	/* step #1: find fsynced inode numbers */
 	err = find_fsync_dnodes(sbi, &inode_list);
-	if (err)
+	if (err || list_empty(&inode_list))
 		goto out;
 
-	if (list_empty(&inode_list))
+	if (check_only) {
+		ret = 1;
 		goto out;
+	}
 
 	need_writecp = true;
 
 	/* step #2: recover data */
-	err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
+	err = recover_data(sbi, &inode_list, &dir_list);
 	if (!err)
 		f2fs_bug_on(sbi, !list_empty(&inode_list));
out:
 	destroy_fsync_dnodes(&inode_list);
-	kmem_cache_destroy(fsync_entry_slab);
 
 	/* truncate meta pages to be used by the recovery */
 	truncate_inode_pages_range(META_MAPPING(sbi),
-			(loff_t)MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
+			(loff_t)MAIN_BLKADDR(sbi) << PAGE_SHIFT, -1);
 
 	if (err) {
 		truncate_inode_pages_final(NODE_MAPPING(sbi));
@@ -573,31 +626,20 @@ out:
 	}
 
 	clear_sbi_flag(sbi, SBI_POR_DOING);
-	if (err) {
-		bool invalidate = false;
+	if (err)
+		set_ckpt_flags(sbi, CP_ERROR_FLAG);
+	mutex_unlock(&sbi->cp_mutex);
 
-		if (discard_next_dnode(sbi, blkaddr))
-			invalidate = true;
+	/* let's drop all the directory inodes for clean checkpoint */
+	destroy_fsync_dnodes(&dir_list);
 
-		/* Flush all the NAT/SIT pages */
-		while (get_pages(sbi, F2FS_DIRTY_META))
-			sync_meta_pages(sbi, META, LONG_MAX);
-
-		/* invalidate temporary meta page */
-		if (invalidate)
-			invalidate_mapping_pages(META_MAPPING(sbi),
-							blkaddr, blkaddr);
-
-		set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
-		mutex_unlock(&sbi->cp_mutex);
-	} else if (need_writecp) {
+	if (!err && need_writecp) {
 		struct cp_control cpc = {
 			.reason = CP_RECOVERY,
 		};
-		mutex_unlock(&sbi->cp_mutex);
-		write_checkpoint(sbi, &cpc);
-	} else {
-		mutex_unlock(&sbi->cp_mutex);
+		err = write_checkpoint(sbi, &cpc);
 	}
-	return err;
+
+	kmem_cache_destroy(fsync_entry_slab);
+	return ret ? ret : err;
 }
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 7965957dd0e6..b3c61ae37f92 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -86,6 +86,7 @@ static inline unsigned long __reverse_ffs(unsigned long word)
 /*
  * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
  * f2fs_set_bit makes MSB and LSB reversed in a byte.
+ * @size must be an integral multiple of BITS_PER_LONG.
* Example: * MSB <--> LSB * f2fs_set_bit(0, bitmap) => 1000 0000 @@ -95,94 +96,73 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - tmp = __reverse_ulong((unsigned char *)p); - tmp &= ~0UL >> offset; + while (1) { + if (*p == 0) + goto pass; - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - p++; -aligned: - while (size & ~(BITS_PER_LONG-1)) { tmp = __reverse_ulong((unsigned char *)p); + + tmp &= ~0UL >> offset; + if (size < BITS_PER_LONG) + tmp &= (~0UL << (BITS_PER_LONG - size)); if (tmp) - goto found_middle; - result += BITS_PER_LONG; + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; p++; } - if (!size) - return result; - - tmp = __reverse_ulong((unsigned char *)p); -found_first: - tmp &= (~0UL << (BITS_PER_LONG - size)); - if (!tmp) /* Are any bits set? */ - return result + size; /* Nope. */ -found_middle: - return result + __reverse_ffs(tmp); + return result; +found: + return result - size + __reverse_ffs(tmp); } static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - tmp = __reverse_ulong((unsigned char *)p); - tmp |= ~((~0UL << offset) >> offset); + while (1) { + if (*p == ~0UL) + goto pass; - if (size < BITS_PER_LONG) - goto found_first; - if (tmp != ~0UL) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - p++; -aligned: - while (size & ~(BITS_PER_LONG - 1)) { tmp = __reverse_ulong((unsigned char *)p); + + if (offset) + tmp |= ~0UL << (BITS_PER_LONG - offset); + if (size < BITS_PER_LONG) + tmp |= ~0UL >> size; if (tmp != ~0UL) - goto found_middle; - result += BITS_PER_LONG; + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; p++; } - if (!size) - return result; - - tmp = __reverse_ulong((unsigned char *)p); -found_first: - tmp |= ~(~0UL << (BITS_PER_LONG - size)); - if (tmp == ~0UL) /* Are any bits zero? */ - return result + size; /* Nope. 
 */
-found_middle:
-	return result + __reverse_ffz(tmp);
+	return result;
+found:
+	return result - size + __reverse_ffz(tmp);
 }
 
 void register_inmem_page(struct inode *inode, struct page *page)
@@ -211,69 +191,149 @@ void register_inmem_page(struct inode *inode, struct page *page)
 	trace_f2fs_register_inmem_page(page, INMEM);
 }
 
-int commit_inmem_pages(struct inode *inode, bool abort)
+static int __revoke_inmem_pages(struct inode *inode,
+			struct list_head *head, bool drop, bool recover)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct inmem_pages *cur, *tmp;
+	int err = 0;
+
+	list_for_each_entry_safe(cur, tmp, head, list) {
+		struct page *page = cur->page;
+
+		if (drop)
+			trace_f2fs_commit_inmem_page(page, INMEM_DROP);
+
+		lock_page(page);
+
+		if (recover) {
+			struct dnode_of_data dn;
+			struct node_info ni;
+
+			trace_f2fs_commit_inmem_page(page, INMEM_REVOKE);
+
+			set_new_dnode(&dn, inode, NULL, NULL, 0);
+			if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) {
+				err = -EAGAIN;
+				goto next;
+			}
+			get_node_info(sbi, dn.nid, &ni);
+			f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
+					cur->old_addr, ni.version, true, true);
+			f2fs_put_dnode(&dn);
+		}
+next:
+		/* we don't need to invalidate this in the successful status */
+		if (drop || recover)
+			ClearPageUptodate(page);
+		set_page_private(page, 0);
+		ClearPagePrivate(page);
+		f2fs_put_page(page, 1);
+
+		list_del(&cur->list);
+		kmem_cache_free(inmem_entry_slab, cur);
+		dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
+	}
+	return err;
+}
+
+void drop_inmem_pages(struct inode *inode)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+
+	clear_inode_flag(inode, FI_ATOMIC_FILE);
+
+	mutex_lock(&fi->inmem_lock);
+	__revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
+	mutex_unlock(&fi->inmem_lock);
+}
+
+static int __commit_inmem_pages(struct inode *inode,
+					struct list_head *revoke_list)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	struct inmem_pages *cur, *tmp;
-	bool submit_bio = false;
 	struct f2fs_io_info fio = {
 		.sbi = sbi,
 		.type = DATA,
 		.rw = WRITE_SYNC | REQ_PRIO,
 		.encrypted_page = NULL,
 	};
+	bool submit_bio = false;
 	int err = 0;
 
-	/*
-	 * The abort is true only when f2fs_evict_inode is called.
-	 * Basically, the f2fs_evict_inode doesn't produce any data writes, so
-	 * that we don't need to call f2fs_balance_fs.
-	 * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this
-	 * inode becomes free by iget_locked in f2fs_iget.
-	 */
-	if (!abort) {
-		f2fs_balance_fs(sbi);
-		f2fs_lock_op(sbi);
+	list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
+		struct page *page = cur->page;
+
+		lock_page(page);
+		if (page->mapping == inode->i_mapping) {
+			trace_f2fs_commit_inmem_page(page, INMEM);
+
+			set_page_dirty(page);
+			f2fs_wait_on_page_writeback(page, DATA, true);
+			if (clear_page_dirty_for_io(page))
+				inode_dec_dirty_pages(inode);
+
+			fio.page = page;
+			err = do_write_data_page(&fio);
+			if (err) {
+				unlock_page(page);
+				break;
+			}
+
+			/* record old blkaddr for revoking */
+			cur->old_addr = fio.old_blkaddr;
+
+			clear_cold_data(page);
+			submit_bio = true;
+		}
+		unlock_page(page);
+		list_move_tail(&cur->list, revoke_list);
 	}
-	mutex_lock(&fi->inmem_lock);
-	list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
-		lock_page(cur->page);
-		if (!abort) {
-			if (cur->page->mapping == inode->i_mapping) {
-				set_page_dirty(cur->page);
-				f2fs_wait_on_page_writeback(cur->page, DATA);
-				if (clear_page_dirty_for_io(cur->page))
-					inode_dec_dirty_pages(inode);
-				trace_f2fs_commit_inmem_page(cur->page, INMEM);
-				fio.page = cur->page;
-				err = do_write_data_page(&fio);
-				if (err) {
-					unlock_page(cur->page);
-					break;
-				}
-				clear_cold_data(cur->page);
-				submit_bio = true;
-			}
-		} else {
-			trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);
-		}
-		set_page_private(cur->page, 0);
-		ClearPagePrivate(cur->page);
-		f2fs_put_page(cur->page, 1);
+	if (submit_bio)
+		f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE);
 
-		list_del(&cur->list);
-		kmem_cache_free(inmem_entry_slab, cur);
-		dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
+	if (!err)
+		__revoke_inmem_pages(inode, revoke_list, false, false);
+
+	return err;
+}
+
+int commit_inmem_pages(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	struct list_head revoke_list;
+	int err;
+
+	INIT_LIST_HEAD(&revoke_list);
+	f2fs_balance_fs(sbi, true);
+	f2fs_lock_op(sbi);
+
+	mutex_lock(&fi->inmem_lock);
+	err = __commit_inmem_pages(inode, &revoke_list);
+	if (err) {
+		int ret;
+		/*
+		 * try to revoke all committed pages, but still we could fail
+		 * due to no memory or other reasons; if that happens, EAGAIN
+		 * will be returned, which means the transaction is no longer
+		 * consistent and the caller should use the journal to recover,
+		 * or rewrite and commit the last transaction. For any other
+		 * error number, revoking was done by the filesystem itself.
+		 */
+		ret = __revoke_inmem_pages(inode, &revoke_list, false, true);
+		if (ret)
+			err = ret;
+
+		/* drop all uncommitted pages */
+		__revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
 	}
 	mutex_unlock(&fi->inmem_lock);
 
-	if (!abort) {
-		f2fs_unlock_op(sbi);
-		if (submit_bio)
-			f2fs_submit_merged_bio(sbi, DATA, WRITE);
-	}
+	f2fs_unlock_op(sbi);
 
 	return err;
 }
@@ -281,13 +341,25 @@ int commit_inmem_pages(struct inode *inode, bool abort)
  * This function balances dirty node and dentry pages.
  * In addition, it controls garbage collection.
  */
-void f2fs_balance_fs(struct f2fs_sb_info *sbi)
+void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
 {
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+	if (time_to_inject(sbi, FAULT_CHECKPOINT))
+		f2fs_stop_checkpoint(sbi, false);
+#endif
+
+	if (!need)
+		return;
+
+	/* balance_fs_bg is able to be pending */
+	if (excess_cached_nats(sbi))
+		f2fs_balance_fs_bg(sbi);
+
 	/*
 	 * We should do GC or end up with checkpoint, if there are so many dirty
 	 * dir/node pages without enough free segments.
*/ - if (has_not_enough_free_secs(sbi, 0)) { + if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); f2fs_gc(sbi, false); } @@ -304,14 +376,26 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); if (!available_free_memory(sbi, FREE_NIDS)) - try_to_free_nids(sbi, NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES); + try_to_free_nids(sbi, MAX_FREE_NIDS); + else + build_free_nids(sbi); /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || - excess_prefree_segs(sbi) || !available_free_memory(sbi, INO_ENTRIES) || - jiffies > sbi->cp_expires) + excess_prefree_segs(sbi) || + excess_dirty_nats(sbi) || + (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + if (test_opt(sbi, DATA_FLUSH)) { + struct blk_plug plug; + + blk_start_plug(&plug); + sync_dirty_inodes(sbi, FILE_INODE); + blk_finish_plug(&plug); + } f2fs_sync_fs(sbi->sb, true); + stat_inc_bg_cp_count(sbi->stat_info); + } } static int issue_flush_thread(void *data) @@ -361,24 +445,28 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE)) { + if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { struct bio *bio = f2fs_bio_alloc(0); int ret; + atomic_inc(&fcc->submit_flush); bio->bi_bdev = sbi->sb->s_bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); + atomic_dec(&fcc->submit_flush); bio_put(bio); return ret; } init_completion(&cmd.wait); + atomic_inc(&fcc->submit_flush); llist_add(&cmd.llnode, &fcc->issue_list); if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); wait_for_completion(&cmd.wait); + atomic_dec(&fcc->submit_flush); return cmd.ret; } @@ -392,6 +480,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; + atomic_set(&fcc->submit_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->cmd_control_info = fcc; @@ -513,28 +602,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } -bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) -{ - int err = -ENOTSUPP; - - if (test_opt(sbi, DISCARD)) { - struct seg_entry *se = get_seg_entry(sbi, - GET_SEGNO(sbi, blkaddr)); - unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); - - if (f2fs_test_bit(offset, se->discard_map)) - return false; - - err = f2fs_issue_discard(sbi, blkaddr, 1); - } - - if (err) { - update_meta_page(sbi, NULL, blkaddr); - return true; - } - return false; -} - static void __add_discard_entry(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct seg_entry *se, unsigned int start, unsigned int end) @@ -573,7 +640,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) bool force = (cpc->reason == CP_DISCARD); int i; - if (se->valid_blocks == max_blocks) + if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) return; if (!force) { @@ -593,6 +660,10 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) break; end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + if (force && start && end != max_blocks + && (end - start) < cpc->trim_minlen) + continue; + __add_discard_entry(sbi, cpc, se, start, end); } } @@ -630,6 +701,8 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = 
dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; + unsigned int secno, start_segno; + bool force = (cpc->reason == CP_DISCARD); mutex_lock(&dirty_i->seglist_lock); @@ -646,17 +719,31 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) dirty_i->nr_dirty[PRE] -= end - start; - if (!test_opt(sbi, DISCARD)) + if (force || !test_opt(sbi, DISCARD)) continue; - f2fs_issue_discard(sbi, START_BLOCK(sbi, start), + if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) { + f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); + continue; + } +next: + secno = GET_SECNO(sbi, start); + start_segno = secno * sbi->segs_per_sec; + if (!IS_CURSEC(sbi, secno) && + !get_valid_blocks(sbi, start, sbi->segs_per_sec)) + f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), + sbi->segs_per_sec << sbi->log_blocks_per_seg); + + start = start_segno + sbi->segs_per_sec; + if (start < end) + goto next; } mutex_unlock(&dirty_i->seglist_lock); /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { - if (cpc->reason == CP_DISCARD && entry->len < cpc->trim_minlen) + if (force && entry->len < cpc->trim_minlen) goto skip; f2fs_issue_discard(sbi, entry->blkaddr, entry->len); cpc->trimmed += entry->len; @@ -711,12 +798,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) if (del > 0) { if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); - if (!f2fs_test_and_set_bit(offset, se->discard_map)) + if (f2fs_discard_en(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } else { if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); - if (f2fs_test_and_clear_bit(offset, se->discard_map)) + if (f2fs_discard_en(sbi) && + f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) @@ -817,12 +906,12 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) } } - sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE - + sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE; if (valid_sum_count <= sum_in_page) return 1; else if ((valid_sum_count - sum_in_page) <= - (PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) + (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) return 2; return 3; } @@ -841,9 +930,9 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) void *dst = page_address(page); if (src) - memcpy(dst, src, PAGE_CACHE_SIZE); + memcpy(dst, src, PAGE_SIZE); else - memset(dst, 0, PAGE_CACHE_SIZE); + memset(dst, 0, PAGE_SIZE); set_page_dirty(page); f2fs_put_page(page, 1); } @@ -854,6 +943,31 @@ static void write_sum_page(struct f2fs_sb_info *sbi, update_meta_page(sbi, (void *)sum_blk, blk_addr); } +static void write_current_sum_page(struct f2fs_sb_info *sbi, + int type, block_t blk_addr) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + struct page *page = grab_meta_page(sbi, blk_addr); + struct f2fs_summary_block *src = curseg->sum_blk; + struct f2fs_summary_block *dst; + + dst = (struct f2fs_summary_block *)page_address(page); + + mutex_lock(&curseg->curseg_mutex); + + down_read(&curseg->journal_rwsem); + memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE); + up_read(&curseg->journal_rwsem); + + memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE); + memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE); + + mutex_unlock(&curseg->curseg_mutex); + + set_page_dirty(page); + f2fs_put_page(page, 1); 
+} + static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -886,9 +1000,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, - MAIN_SEGS(sbi), *newseg + 1); - if (segno - *newseg < sbi->segs_per_sec - - (*newseg % sbi->segs_per_sec)) + (hint + 1) * sbi->segs_per_sec, *newseg + 1); + if (segno < (hint + 1) * sbi->segs_per_sec) goto got_it; } find_other_zone: @@ -1071,7 +1184,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; - if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0)) + if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 0)) return v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR); @@ -1120,6 +1233,9 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) { int i; + if (test_opt(sbi, LFS)) + return; + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) __allocate_new_segments(sbi, i); } @@ -1134,6 +1250,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; unsigned int start_segno, end_segno; struct cp_control cpc; + int err = 0; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; @@ -1142,6 +1259,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (end <= MAIN_BLKADDR(sbi)) goto out; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Found FS corruption, run fsck to fix."); + goto out; + } + /* start/end segment number in main_area */ start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? 
MAIN_SEGS(sbi) - 1 : @@ -1164,12 +1287,16 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) sbi->segs_per_sec) - 1, end_segno); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); + if (err) + break; + + schedule(); } out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); - return 0; + return err; } static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) @@ -1256,7 +1383,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, /* direct_io'ed data is aligned to the segment for better performance */ if (direct_io && curseg->next_blkoff && - !has_not_enough_free_secs(sbi, 0)) + !has_not_enough_free_secs(sbi, 0, 0)) __allocate_new_segments(sbi, type); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); @@ -1292,11 +1419,17 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio->page, fio->type); - allocate_data_block(fio->sbi, fio->page, fio->blk_addr, - &fio->blk_addr, sum, type); + if (fio->type == NODE || fio->type == DATA) + mutex_lock(&fio->sbi->wio_mutex[fio->type]); + + allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, + &fio->new_blkaddr, sum, type); /* writeout dirty page into bdev */ f2fs_submit_page_mbio(fio); + + if (fio->type == NODE || fio->type == DATA) + mutex_unlock(&fio->sbi->wio_mutex[fio->type]); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) @@ -1305,7 +1438,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .sbi = sbi, .type = META, .rw = WRITE_SYNC | REQ_META | REQ_PRIO, - .blk_addr = page->index, + .old_blkaddr = page->index, + .new_blkaddr = page->index, .page = page, .encrypted_page = NULL, }; @@ -1335,19 +1469,19 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); - dn->data_blkaddr = fio->blk_addr; + f2fs_update_data_blkaddr(dn, fio->new_blkaddr); } void rewrite_data_page(struct f2fs_io_info *fio) { + fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); f2fs_submit_page_mbio(fio); } -static void __f2fs_replace_block(struct f2fs_sb_info *sbi, - struct f2fs_summary *sum, +void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, - bool recover_curseg) + bool recover_curseg, bool recover_newaddr) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; @@ -1390,7 +1524,7 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi, curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); - if (!recover_curseg) + if (!recover_curseg || recover_newaddr) update_sit_entry(sbi, new_blkaddr, 1); if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) update_sit_entry(sbi, old_blkaddr, -1); @@ -1414,66 +1548,30 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi, void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, - unsigned char version, bool recover_curseg) + unsigned char version, bool recover_curseg, + bool recover_newaddr) { struct f2fs_summary sum; set_summary(&sum, dn->nid, dn->ofs_in_node, version); - __f2fs_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg); + __f2fs_replace_block(sbi, &sum, old_addr, new_addr, + recover_curseg, recover_newaddr); - dn->data_blkaddr = new_addr; - set_data_blkaddr(dn); - 
f2fs_update_extent_cache(dn); -} - -static inline bool is_merged_page(struct f2fs_sb_info *sbi, - struct page *page, enum page_type type) -{ - enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; - struct bio_vec *bvec; - struct page *target; - int i; - - down_read(&io->io_rwsem); - if (!io->bio) { - up_read(&io->io_rwsem); - return false; - } - - bio_for_each_segment_all(bvec, io->bio, i) { - - if (bvec->bv_page->mapping) { - target = bvec->bv_page; - } else { - struct f2fs_crypto_ctx *ctx; - - /* encrypted page */ - ctx = (struct f2fs_crypto_ctx *)page_private( - bvec->bv_page); - target = ctx->w.control_page; - } - - if (page == target) { - up_read(&io->io_rwsem); - return true; - } - } - - up_read(&io->io_rwsem); - return false; + f2fs_update_data_blkaddr(dn, new_addr); } void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type) + enum page_type type, bool ordered) { if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - if (is_merged_page(sbi, page, type)) - f2fs_submit_merged_bio(sbi, type, WRITE); - wait_on_page_writeback(page); + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE); + if (ordered) + wait_on_page_writeback(page); + else + wait_for_stable_page(page); } } @@ -1482,14 +1580,12 @@ void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, { struct page *cpage; - if (blkaddr == NEW_ADDR) + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) return; - f2fs_bug_on(sbi, blkaddr == NULL_ADDR); - cpage = find_lock_page(META_MAPPING(sbi), blkaddr); if (cpage) { - f2fs_wait_on_page_writeback(cpage, DATA); + f2fs_wait_on_page_writeback(cpage, DATA, true); f2fs_put_page(cpage, 1); } } @@ -1510,12 +1606,11 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) /* Step 1: restore nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); - memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE); + memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE); /* Step 2: restore sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); - memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE, - SUM_JOURNAL_SIZE); + memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE); offset = 2 * SUM_JOURNAL_SIZE; /* Step 3: restore summary entries */ @@ -1539,7 +1634,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) s = (struct f2fs_summary *)(kaddr + offset); seg_i->sum_blk->entries[j] = *s; offset += SUMMARY_SIZE; - if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + if (offset + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; @@ -1611,7 +1706,14 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) /* set uncompleted segment to curseg */ curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); - memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE); + + /* update journal info */ + down_write(&curseg->journal_rwsem); + memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE); + up_write(&curseg->journal_rwsem); + + memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE); + memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE); curseg->next_segno = segno; reset_curseg(sbi, type, 0); curseg->alloc_type = ckpt->alloc_type[type]; @@ -1623,14 +1725,10 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { - struct f2fs_summary_block *s_sits = - CURSEG_I(sbi, CURSEG_COLD_DATA)->sum_blk; - struct f2fs_summary_block *s_nats = - CURSEG_I(sbi, CURSEG_HOT_DATA)->sum_blk; int 
type = CURSEG_HOT_DATA; int err; - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) { int npages = npages_for_summary_flush(sbi, true); if (npages >= 2) @@ -1653,11 +1751,6 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) return err; } - /* sanity check for summary blocks */ - if (nats_in_cursum(s_nats) > NAT_JOURNAL_ENTRIES || - sits_in_cursum(s_sits) > SIT_JOURNAL_ENTRIES) - return -EINVAL; - return 0; } @@ -1675,13 +1768,12 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) /* Step 1: write nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); - memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE); + memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 2: write sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); - memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits, - SUM_JOURNAL_SIZE); + memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 3: write summary entries */ @@ -1703,7 +1795,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) *summary = seg_i->sum_blk->entries[j]; written_size += SUMMARY_SIZE; - if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + if (written_size + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; @@ -1727,17 +1819,13 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi, else end = type + NR_CURSEG_NODE_TYPE; - for (i = type; i < end; i++) { - struct curseg_info *sum = CURSEG_I(sbi, i); - mutex_lock(&sum->curseg_mutex); - write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type)); - mutex_unlock(&sum->curseg_mutex); - } + for (i = type; i < end; i++) + write_current_sum_page(sbi, i, blkaddr + (i - type)); } void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) write_compacted_summaries(sbi, start_blk); else write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); @@ -1748,24 +1836,24 @@ void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); } -int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, +int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc) { int i; if (type == NAT_JOURNAL) { - for (i = 0; i < nats_in_cursum(sum); i++) { - if (le32_to_cpu(nid_in_journal(sum, i)) == val) + for (i = 0; i < nats_in_cursum(journal); i++) { + if (le32_to_cpu(nid_in_journal(journal, i)) == val) return i; } - if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) - return update_nats_in_cursum(sum, 1); + if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL)) + return update_nats_in_cursum(journal, 1); } else if (type == SIT_JOURNAL) { - for (i = 0; i < sits_in_cursum(sum); i++) - if (le32_to_cpu(segno_in_journal(sum, i)) == val) + for (i = 0; i < sits_in_cursum(journal); i++) + if (le32_to_cpu(segno_in_journal(journal, i)) == val) return i; - if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES) - return update_sits_in_cursum(sum, 1); + if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL)) + return update_sits_in_cursum(journal, 1); } return -1; } @@ -1794,7 +1882,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, src_addr = page_address(src_page); dst_addr = page_address(dst_page); - memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); + 
memcpy(dst_addr, src_addr, PAGE_SIZE); set_page_dirty(dst_page); f2fs_put_page(src_page, 1); @@ -1869,20 +1957,22 @@ static void add_sits_in_set(struct f2fs_sb_info *sbi) static void remove_sits_in_journal(struct f2fs_sb_info *sbi) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; int i; - for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { + down_write(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { unsigned int segno; bool dirtied; - segno = le32_to_cpu(segno_in_journal(sum, i)); + segno = le32_to_cpu(segno_in_journal(journal, i)); dirtied = __mark_sit_entry_dirty(sbi, segno); if (!dirtied) add_sit_entry(segno, &SM_I(sbi)->sit_entry_set); } - update_sits_in_cursum(sum, -sits_in_cursum(sum)); + update_sits_in_cursum(journal, -i); + up_write(&curseg->journal_rwsem); } /* @@ -1894,13 +1984,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct sit_info *sit_i = SIT_I(sbi); unsigned long *bitmap = sit_i->dirty_sentries_bitmap; struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; struct sit_entry_set *ses, *tmp; struct list_head *head = &SM_I(sbi)->sit_entry_set; bool to_journal = true; struct seg_entry *se; - mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); if (!sit_i->dirty_sentries) @@ -1917,7 +2006,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * entries, remove all entries from journal and add and account * them in sit entry set. */ - if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL)) + if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL)) remove_sits_in_journal(sbi); /* @@ -1934,10 +2023,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned int segno = start_segno; if (to_journal && - !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL)) + !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL)) to_journal = false; - if (!to_journal) { + if (to_journal) { + down_write(&curseg->journal_rwsem); + } else { page = get_next_sit_page(sbi, start_segno); raw_sit = page_address(page); } @@ -1955,13 +2046,13 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) } if (to_journal) { - offset = lookup_journal_in_cursum(sum, + offset = lookup_journal_in_cursum(journal, SIT_JOURNAL, segno, 1); f2fs_bug_on(sbi, offset < 0); - segno_in_journal(sum, offset) = + segno_in_journal(journal, offset) = cpu_to_le32(segno); seg_info_to_raw_sit(se, - &sit_in_journal(sum, offset)); + &sit_in_journal(journal, offset)); } else { sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); seg_info_to_raw_sit(se, @@ -1973,7 +2064,9 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) ses->entry_cnt--; } - if (!to_journal) + if (to_journal) + up_write(&curseg->journal_rwsem); + else f2fs_put_page(page, 1); f2fs_bug_on(sbi, ses->entry_cnt); @@ -1988,7 +2081,6 @@ out: add_discard_addrs(sbi, cpc); } mutex_unlock(&sit_i->sentry_lock); - mutex_unlock(&curseg->curseg_mutex); set_prefree_as_free_segments(sbi); } @@ -2024,12 +2116,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi) = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); sit_i->sentries[start].ckpt_valid_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); - sit_i->sentries[start].discard_map - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if 
(!sit_i->sentries[start].cur_valid_map || - !sit_i->sentries[start].ckpt_valid_map || - !sit_i->sentries[start].discard_map) + !sit_i->sentries[start].ckpt_valid_map) return -ENOMEM; + + if (f2fs_discard_en(sbi)) { + sit_i->sentries[start].discard_map + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].discard_map) + return -ENOMEM; + } } sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -2117,9 +2213,14 @@ static int build_curseg(struct f2fs_sb_info *sbi) for (i = 0; i < NR_CURSEG_TYPE; i++) { mutex_init(&array[i].curseg_mutex); - array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + array[i].sum_blk = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) return -ENOMEM; + init_rwsem(&array[i].journal_rwsem); + array[i].journal = kzalloc(sizeof(struct f2fs_journal), + GFP_KERNEL); + if (!array[i].journal) + return -ENOMEM; array[i].segno = NULL_SEGNO; array[i].next_blkoff = 0; } @@ -2130,11 +2231,13 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; + struct seg_entry *se; + struct f2fs_sit_entry sit; int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; - int nrpages = MAX_BIO_BLOCKS(sbi); + int nrpages = MAX_BIO_BLOCKS(sbi) * 8; do { readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); @@ -2143,41 +2246,58 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) end = (start_blk + readed) * sit_i->sents_per_block; for (; start < end && start < MAIN_SEGS(sbi); start++) { - struct seg_entry *se = &sit_i->sentries[start]; struct f2fs_sit_block *sit_blk; - struct f2fs_sit_entry sit; struct page *page; - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < sits_in_cursum(sum); i++) { - if (le32_to_cpu(segno_in_journal(sum, i)) - == start) { - sit = sit_in_journal(sum, i); - mutex_unlock(&curseg->curseg_mutex); - goto got_it; - } - } - mutex_unlock(&curseg->curseg_mutex); - + se = &sit_i->sentries[start]; page = get_current_sit_page(sbi, start); sit_blk = (struct f2fs_sit_block *)page_address(page); sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; f2fs_put_page(page, 1); -got_it: + check_block_count(sbi, start, &sit); seg_info_from_raw_sit(se, &sit); /* build discard map only one time */ - memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - se->valid_blocks; - - if (sbi->segs_per_sec > 1) { - struct sec_entry *e = get_sec_entry(sbi, start); - e->valid_blocks += se->valid_blocks; + if (f2fs_discard_en(sbi)) { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += sbi->blocks_per_seg - + se->valid_blocks; } + + if (sbi->segs_per_sec > 1) + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks; } start_blk += readed; } while (start_blk < sit_blk_cnt); + + down_read(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { + unsigned int old_valid_blocks; + + start = le32_to_cpu(segno_in_journal(journal, i)); + se = &sit_i->sentries[start]; + sit = sit_in_journal(journal, i); + + old_valid_blocks = se->valid_blocks; + + check_block_count(sbi, start, &sit); + seg_info_from_raw_sit(se, &sit); + + if (f2fs_discard_en(sbi)) { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } + + if 
(sbi->segs_per_sec > 1) + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks - old_valid_blocks; + } + up_read(&curseg->journal_rwsem); } static void init_free_segmap(struct f2fs_sb_info *sbi) @@ -2310,7 +2430,11 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); sm_info->rec_prefree_segments = sm_info->main_segments * DEF_RECLAIM_PREFREE_SEGMENTS / 100; - sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; + if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS) + sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; + + if (!test_opt(sbi, LFS)) + sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; @@ -2392,8 +2516,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi) if (!array) return; SM_I(sbi)->curseg_array = NULL; - for (i = 0; i < NR_CURSEG_TYPE; i++) + for (i = 0; i < NR_CURSEG_TYPE; i++) { kfree(array[i].sum_blk); + kfree(array[i].journal); + } kfree(array); } @@ -2459,7 +2585,7 @@ int __init create_segment_manager_caches(void) sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destory_discard_entry; + goto destroy_discard_entry; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2469,7 +2595,7 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); -destory_discard_entry: +destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: return -ENOMEM; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ee44d346ea44..fecb856ad874 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -16,6 +16,7 @@ #define NULL_SECNO ((unsigned int)(~0)) #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ +#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) @@ -158,16 +159,17 @@ struct victim_sel_policy { }; struct seg_entry { - unsigned short valid_blocks; /* # of valid blocks */ + unsigned int type:6; /* segment type like CURSEG_XXX_TYPE */ + unsigned int valid_blocks:10; /* # of valid blocks */ + unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ + unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ /* * # of valid blocks and the validity bitmap stored in the the last * checkpoint pack. This information is used by the SSR mode. */ - unsigned short ckpt_valid_blocks; - unsigned char *ckpt_valid_map; + unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */ unsigned char *discard_map; - unsigned char type; /* segment type like CURSEG_XXX_TYPE */ unsigned long long mtime; /* modification time of the segment */ }; @@ -183,7 +185,7 @@ struct segment_allocation { * this value is set in page as a private data which indicate that * the page is atomically written, and it is in inmem_pages list. 
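
The seg_entry rework in the segment.h hunk above folds type, valid_blocks and ckpt_valid_blocks into 6/10/10-bit fields of a single 32-bit word (plus 6 bits of padding), where the old layout spent two shorts and a char. Ten bits count up to 1023, enough for the default 512 blocks per segment. A standalone C11 sketch of the packing (names hypothetical; one-word packing is what common ABIs produce):

	/*
	 * Editorial sketch, not part of the patch: the reworked seg_entry
	 * packs four fields into one 32-bit word (6 + 10 + 10 + 6 bits).
	 */
	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	struct packed_se {
		uint32_t type:6;		/* segment type */
		uint32_t valid_blocks:10;	/* # of valid blocks */
		uint32_t ckpt_valid_blocks:10;	/* # valid at last checkpoint */
		uint32_t padding:6;		/* spare bits */
	};

	static_assert(sizeof(struct packed_se) == sizeof(uint32_t),
		      "bitfields expected to share one 32-bit unit");

	int main(void)
	{
		struct packed_se se = { .type = 5, .valid_blocks = 512 };

		printf("type=%u valid=%u bytes=%zu\n",
		       (unsigned)se.type, (unsigned)se.valid_blocks,
		       sizeof(se));
		return 0;
	}
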
*/ -#define ATOMIC_WRITTEN_PAGE 0x0000ffff +#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) #define IS_ATOMIC_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) @@ -191,6 +193,7 @@ struct segment_allocation { struct inmem_pages { struct list_head list; struct page *page; + block_t old_addr; /* for revoking when fail to commit */ }; struct sit_info { @@ -257,6 +260,8 @@ struct victim_selection { struct curseg_info { struct mutex curseg_mutex; /* lock for consistency */ struct f2fs_summary_block *sum_blk; /* cached summary block */ + struct rw_semaphore journal_rwsem; /* protect journal area */ + struct f2fs_journal *journal; /* cached journal info */ unsigned char alloc_type; /* current allocation type */ unsigned int segno; /* current segment number */ unsigned short next_blkoff; /* next block offset to write */ @@ -466,20 +471,27 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + + if (test_opt(sbi, LFS)) + return false; + return free_sections(sbi) <= (node_secs + 2 * dent_secs + reserved_sections(sbi) + 1); } -static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) +static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, + int freed, int needed) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi)); + return (free_sections(sbi) + freed) <= + (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) @@ -527,6 +539,9 @@ static inline bool need_inplace_update(struct inode *inode) if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) return false; + if (test_opt(sbi, LFS)) + return false; + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) @@ -540,7 +555,7 @@ static inline bool need_inplace_update(struct inode *inode) /* this is only set during fdatasync */ if (policy & (0x1 << F2FS_IPU_FSYNC) && - is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU)) + is_inode_flag_set(inode, FI_NEED_IPU)) return true; return false; @@ -573,8 +588,8 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) { - f2fs_bug_on(sbi, blk_addr < SEG0_BLKADDR(sbi) - || blk_addr >= MAX_BLKADDR(sbi)); + BUG_ON(blk_addr < SEG0_BLKADDR(sbi) + || blk_addr >= MAX_BLKADDR(sbi)); } /* @@ -702,9 +717,9 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) if (type == DATA) return sbi->blocks_per_seg; else if (type == NODE) - return 3 * sbi->blocks_per_seg; + return 8 * sbi->blocks_per_seg; else if (type == META) - return MAX_BIO_BLOCKS(sbi); + return 8 * MAX_BIO_BLOCKS(sbi); else return 0; } @@ -722,10 +737,8 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, nr_to_write = wbc->nr_to_write; - if (type == DATA) - desired = 4096; - else if (type == NODE) - desired = 3 * max_hw_blocks(sbi); + if (type == NODE) + desired = 2 * max_hw_blocks(sbi); else desired = MAX_BIO_BLOCKS(sbi); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index da0d8e0b55a5..46c915425923 100644 --- 
a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -13,6 +13,7 @@ #include #include "f2fs.h" +#include "node.h" static LIST_HEAD(f2fs_list); static DEFINE_SPINLOCK(f2fs_list_lock); @@ -25,14 +26,15 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->fcnt > NAT_ENTRY_PER_BLOCK) - return NM_I(sbi)->fcnt - NAT_ENTRY_PER_BLOCK; + if (NM_I(sbi)->fcnt > MAX_FREE_NIDS) + return NM_I(sbi)->fcnt - MAX_FREE_NIDS; return 0; } static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) { - return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node); + return atomic_read(&sbi->total_zombie_tree) + + atomic_read(&sbi->total_ext_node); } unsigned long f2fs_shrink_count(struct shrinker *shrink, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2ac3417d9412..fd249cc9b96e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -39,6 +39,35 @@ static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; static struct kset *f2fs_kset; +#ifdef CONFIG_F2FS_FAULT_INJECTION + +char *fault_name[FAULT_MAX] = { + [FAULT_KMALLOC] = "kmalloc", + [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_ALLOC_NID] = "alloc nid", + [FAULT_ORPHAN] = "orphan", + [FAULT_BLOCK] = "no more block", + [FAULT_DIR_DEPTH] = "too big dir depth", + [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_IO] = "IO error", + [FAULT_CHECKPOINT] = "checkpoint error", +}; + +static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, + unsigned int rate) +{ + struct f2fs_fault_info *ffi = &sbi->fault_info; + + if (rate) { + atomic_set(&ffi->inject_ops, 0); + ffi->inject_rate = rate; + ffi->inject_type = (1 << FAULT_MAX) - 1; + } else { + memset(ffi, 0, sizeof(struct f2fs_fault_info)); + } +} +#endif + /* f2fs-wide shrinker description */ static struct shrinker f2fs_shrinker_info = { .scan_objects = f2fs_shrink_scan, @@ -51,6 +80,7 @@ enum { Opt_disable_roll_forward, Opt_norecovery, Opt_discard, + Opt_nodiscard, Opt_noheap, Opt_user_xattr, Opt_nouser_xattr, @@ -61,12 +91,19 @@ enum { Opt_inline_xattr, Opt_inline_data, Opt_inline_dentry, + Opt_noinline_dentry, Opt_flush_merge, + Opt_noflush_merge, Opt_nobarrier, Opt_fastboot, Opt_extent_cache, Opt_noextent_cache, Opt_noinline_data, + Opt_data_flush, + Opt_mode, + Opt_fault_injection, + Opt_lazytime, + Opt_nolazytime, Opt_err, }; @@ -75,6 +112,7 @@ static match_table_t f2fs_tokens = { {Opt_disable_roll_forward, "disable_roll_forward"}, {Opt_norecovery, "norecovery"}, {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, @@ -85,12 +123,19 @@ static match_table_t f2fs_tokens = { {Opt_inline_xattr, "inline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, + {Opt_noinline_dentry, "noinline_dentry"}, {Opt_flush_merge, "flush_merge"}, + {Opt_noflush_merge, "noflush_merge"}, {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, + {Opt_data_flush, "data_flush"}, + {Opt_mode, "mode=%s"}, + {Opt_fault_injection, "fault_injection=%u"}, + {Opt_lazytime, "lazytime"}, + {Opt_nolazytime, "nolazytime"}, {Opt_err, NULL}, }; @@ -100,6 +145,10 @@ enum { SM_INFO, /* struct f2fs_sm_info */ NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + 
FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif }; struct f2fs_attr { @@ -121,9 +170,27 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)NM_I(sbi); else if (struct_type == F2FS_SBI) return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&sbi->fault_info; +#endif return NULL; } +static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->kbytes_written + + BD_PART_WRITTEN(sbi))); +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -157,6 +224,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret < 0) return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + return -EINVAL; +#endif *ui = t; return count; } @@ -202,6 +273,9 @@ static struct f2fs_attr f2fs_attr_##_name = { \ f2fs_sbi_show, f2fs_sbi_store, \ offsetof(struct struct_name, elname)) +#define F2FS_GENERAL_RO_ATTR(name) \ +static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) + F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); @@ -214,9 +288,16 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif +F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -234,7 +315,14 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), ATTR_LIST(ra_nid_pages), + ATTR_LIST(dirty_nats_ratio), ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif + ATTR_LIST(lifetime_write_kbytes), NULL, }; @@ -330,6 +418,9 @@ static int parse_options(struct super_block *sb, char *options) "the device does not support discard"); } break; + case Opt_nodiscard: + clear_opt(sbi, DISCARD); + break; case Opt_noheap: set_opt(sbi, NOHEAP); break; @@ -388,9 +478,15 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_dentry: set_opt(sbi, INLINE_DENTRY); break; + case Opt_noinline_dentry: + clear_opt(sbi, INLINE_DENTRY); + break; case Opt_flush_merge: set_opt(sbi, 
FLUSH_MERGE); break; + case Opt_noflush_merge: + clear_opt(sbi, FLUSH_MERGE); + break; case Opt_nobarrier: set_opt(sbi, NOBARRIER); break; @@ -406,6 +502,42 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); break; + case Opt_data_flush: + set_opt(sbi, DATA_FLUSH); + break; + case Opt_mode: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (strlen(name) == 8 && + !strncmp(name, "adaptive", 8)) { + set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + } else if (strlen(name) == 3 && + !strncmp(name, "lfs", 3)) { + set_opt_mode(sbi, F2FS_MOUNT_LFS); + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_fault_injection: + if (args->from && match_int(args, &arg)) + return -EINVAL; +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(sbi, arg); +#else + f2fs_msg(sb, KERN_INFO, + "FAULT_INJECTION was not selected"); +#endif + break; + case Opt_lazytime: + sb->s_flags |= MS_LAZYTIME; + break; + case Opt_nolazytime: + sb->s_flags &= ~MS_LAZYTIME; + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -426,26 +558,25 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); + if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) { + kmem_cache_free(f2fs_inode_cachep, fi); + return NULL; + } + /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; - atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); + INIT_LIST_HEAD(&fi->dirty_list); + INIT_LIST_HEAD(&fi->gdirty_list); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); - - set_inode_flag(fi, FI_NEW_INODE); - - if (test_opt(F2FS_SB(sb), INLINE_XATTR)) - set_inode_flag(fi, FI_INLINE_XATTR); + init_rwsem(&fi->dio_rwsem[READ]); + init_rwsem(&fi->dio_rwsem[WRITE]); /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - fi->i_crypt_info = NULL; -#endif return &fi->vfs_inode; } @@ -458,7 +589,7 @@ static int f2fs_drop_inode(struct inode *inode) * - f2fs_gc -> iput -> evict * - inode_wait_for_writeback(inode) */ - if (!inode_unhashed(inode) && inode->i_state & I_SYNC) { + if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { if (!inode->i_nlink && !is_bad_inode(inode)) { /* to avoid evict_inode call simultaneously */ atomic_inc(&inode->i_count); @@ -466,32 +597,66 @@ static int f2fs_drop_inode(struct inode *inode) /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - commit_inmem_pages(inode, true); + drop_inmem_pages(inode); /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); sb_start_intwrite(inode->i_sb); - i_size_write(inode, 0); + f2fs_i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode, true); + f2fs_truncate(inode); sb_end_intwrite(inode->i_sb); -#ifdef CONFIG_F2FS_FS_ENCRYPTION - if (F2FS_I(inode)->i_crypt_info) - f2fs_free_encryption_info(inode, - F2FS_I(inode)->i_crypt_info); -#endif + fscrypt_put_encryption_info(inode, NULL); spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } return 0; } + return generic_drop_inode(inode); } +int f2fs_inode_dirtied(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return 1; + } + + set_inode_flag(inode, FI_DIRTY_INODE); + 
list_add_tail(&F2FS_I(inode)->gdirty_list, + &sbi->inode_list[DIRTY_META]); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + stat_inc_dirty_inode(sbi, DIRTY_META); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + + return 0; +} + +void f2fs_inode_synced(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return; + } + list_del_init(&F2FS_I(inode)->gdirty_list); + clear_inode_flag(inode, FI_DIRTY_INODE); + clear_inode_flag(inode, FI_AUTO_RECOVER); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); + spin_unlock(&sbi->inode_lock[DIRTY_META]); +} + /* * f2fs_dirty_inode() is called from __mark_inode_dirty() * @@ -499,7 +664,19 @@ static int f2fs_drop_inode(struct inode *inode) */ static void f2fs_dirty_inode(struct inode *inode, int flags) { - set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + return; + + if (flags == I_DIRTY_TIME) + return; + + if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) + clear_inode_flag(inode, FI_AUTO_RECOVER); + + f2fs_inode_dirtied(inode); } static void f2fs_i_callback(struct rcu_head *head) @@ -510,15 +687,27 @@ static void f2fs_i_callback(struct rcu_head *head) static void f2fs_destroy_inode(struct inode *inode) { + percpu_counter_destroy(&F2FS_I(inode)->dirty_pages); call_rcu(&inode->i_rcu, f2fs_i_callback); } +static void destroy_percpu_info(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_COUNT_TYPE; i++) + percpu_counter_destroy(&sbi->nr_pages[i]); + percpu_counter_destroy(&sbi->alloc_valid_block_count); + percpu_counter_destroy(&sbi->total_valid_inode_count); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } kobject_del(&sbi->s_kobj); @@ -534,7 +723,7 @@ static void f2fs_put_super(struct super_block *sb) * clean checkpoint again. */ if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || - !is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) { + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { struct cp_control cpc = { .reason = CP_UMOUNT, }; @@ -548,12 +737,15 @@ static void f2fs_put_super(struct super_block *sb) * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. 
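
The f2fs_inode_dirtied()/f2fs_inode_synced() pair added above keeps a DIRTY_META inode list under a spinlock, with a per-inode flag making both operations idempotent; dirtied reports whether the inode was already tracked. A small userspace model of that flag-plus-intrusive-list pattern (all names hypothetical, a pthread mutex standing in for the spinlock):

	/*
	 * Editorial sketch, not part of the patch: idempotent dirty-set
	 * tracking with a flag and a lock-protected intrusive list.
	 */
	#include <pthread.h>
	#include <stdbool.h>

	struct node {
		struct node *prev, *next;
		bool dirty;
	};

	static struct node dirty_list = { &dirty_list, &dirty_list, false };
	static pthread_mutex_t dirty_lock = PTHREAD_MUTEX_INITIALIZER;

	/* returns 1 if the inode was already on the dirty list */
	static int mark_dirty(struct node *n)
	{
		pthread_mutex_lock(&dirty_lock);
		if (n->dirty) {
			pthread_mutex_unlock(&dirty_lock);
			return 1;
		}
		n->dirty = true;
		n->prev = dirty_list.prev;
		n->next = &dirty_list;
		dirty_list.prev->next = n;
		dirty_list.prev = n;
		pthread_mutex_unlock(&dirty_lock);
		return 0;
	}

	static void mark_synced(struct node *n)
	{
		pthread_mutex_lock(&dirty_lock);
		if (n->dirty) {
			n->prev->next = n->next;
			n->next->prev = n->prev;
			n->dirty = false;
		}
		pthread_mutex_unlock(&dirty_lock);
	}

	int main(void)
	{
		struct node ino = { 0 };

		mark_dirty(&ino);		/* newly tracked -> returns 0 */
		int again = mark_dirty(&ino);	/* already dirty -> returns 1 */
		mark_synced(&ino);		/* unlinks and clears the flag */
		mark_synced(&ino);		/* second call is a no-op */
		return again == 1 ? 0 : 1;
	}
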
*/ - release_dirty_inode(sbi); + release_ino_entry(sbi, true); release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); + /* our cp_error case, we can wait for any writeback page */ + f2fs_flush_merged_bios(sbi); + iput(sbi->node_inode); iput(sbi->meta_inode); @@ -566,13 +758,18 @@ static void f2fs_put_super(struct super_block *sb) wait_for_completion(&sbi->s_kobj_unregister); sb->s_fs_info = NULL; - brelse(sbi->raw_super_buf); + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); + kfree(sbi->raw_super); + + destroy_percpu_info(sbi); kfree(sbi); } int f2fs_sync_fs(struct super_block *sb, int sync) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err = 0; trace_f2fs_sync_fs(sb, sync); @@ -582,14 +779,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync) cpc.reason = __get_cp_reason(sbi); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); - } else { - f2fs_balance_fs(sbi); } f2fs_trace_ios(NULL, 1); - return 0; + return err; } static int f2fs_freeze(struct super_block *sb) @@ -623,7 +818,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; - buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; + buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi); buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; @@ -676,6 +871,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noinline_data"); if (test_opt(sbi, INLINE_DENTRY)) seq_puts(seq, ",inline_dentry"); + else + seq_puts(seq, ",noinline_dentry"); if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); if (test_opt(sbi, NOBARRIER)) @@ -686,6 +883,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, DATA_FLUSH)) + seq_puts(seq, ",data_flush"); + + seq_puts(seq, ",mode="); + if (test_opt(sbi, ADAPTIVE)) + seq_puts(seq, "adaptive"); + else if (test_opt(sbi, LFS)) + seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -718,19 +923,47 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) return 0; } -static int segment_info_open_fs(struct inode *inode, struct file *file) +static int segment_bits_seq_show(struct seq_file *seq, void *offset) { - return single_open(file, segment_info_seq_show, PDE_DATA(inode)); + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, + get_valid_blocks(sbi, i, 1)); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, " %.2x", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; } -static const struct file_operations f2fs_seq_segment_info_fops = { - .owner = THIS_MODULE, - .open = segment_info_open_fs, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +#define F2FS_PROC_FILE_DEF(_name) \ +static int _name##_open_fs(struct inode *inode, 
struct file *file) \ +{ \ + return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ +} \ + \ +static const struct file_operations f2fs_seq_##_name##_fops = { \ + .owner = THIS_MODULE, \ + .open = _name##_open_fs, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ }; +F2FS_PROC_FILE_DEF(segment_info); +F2FS_PROC_FILE_DEF(segment_bits); + static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ @@ -738,7 +971,16 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, BG_GC); set_opt(sbi, INLINE_DATA); + set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); + sbi->sb->s_flags |= MS_LAZYTIME; + set_opt(sbi, FLUSH_MERGE); + if (f2fs_sb_mounted_hmsmr(sbi->sb)) { + set_opt_mode(sbi, F2FS_MOUNT_LFS); + set_opt(sbi, DISCARD); + } else { + set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + } #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -746,6 +988,10 @@ static void default_options(struct f2fs_sb_info *sbi) #ifdef CONFIG_F2FS_FS_POSIX_ACL set_opt(sbi, POSIX_ACL); #endif + +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(sbi, 0); +#endif } static int f2fs_remount(struct super_block *sb, int *flags, char *data) @@ -756,8 +1002,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_gc = false; bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); - - sync_filesystem(sb); +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info ffi = sbi->fault_info; +#endif /* * Save the old mount options in case we @@ -766,6 +1013,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) org_mount_opt = sbi->mount_opt; active_logs = sbi->active_logs; + /* recover superblocks we couldn't write due to previous RO mount */ + if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + err = f2fs_commit_super(sbi, false); + f2fs_msg(sb, KERN_INFO, + "Try to recover all the superblocks, ret: %d", err); + if (!err) + clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); + } + sbi->mount_opt.opt = 0; default_options(sbi); @@ -797,7 +1053,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { if (sbi->gc_thread) { stop_gc_thread(sbi); - f2fs_sync_fs(sb, 1); need_restart_gc = true; } } else if (!sbi->gc_thread) { @@ -807,6 +1062,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } + if (*flags & MS_RDONLY) { + writeback_inodes_sb(sb, WB_REASON_SYNC); + sync_inodes_sb(sb); + + set_sbi_flag(sbi, SBI_IS_DIRTY); + set_sbi_flag(sbi, SBI_IS_CLOSE); + f2fs_sync_fs(sb, 1); + clear_sbi_flag(sbi, SBI_IS_CLOSE); + } + /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. @@ -820,8 +1085,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } skip: /* Update the POSIXACL Flag */ - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? 
MS_POSIXACL : 0); + return 0; restore_gc: if (need_restart_gc) { @@ -834,6 +1100,9 @@ restore_gc: restore_opts: sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; +#ifdef CONFIG_F2FS_FAULT_INJECTION + sbi->fault_info = ffi; +#endif return err; } @@ -853,6 +1122,48 @@ static struct super_operations f2fs_sops = { .remount_fs = f2fs_remount, }; +#ifdef CONFIG_F2FS_FS_ENCRYPTION +static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) +{ + return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, NULL); +} + +static int f2fs_key_prefix(struct inode *inode, u8 **key) +{ + *key = F2FS_I_SB(inode)->key_prefix; + return F2FS_I_SB(inode)->key_prefix_size; +} + +static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, + void *fs_data) +{ + return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, fs_data, XATTR_CREATE); +} + +static unsigned f2fs_max_namelen(struct inode *inode) +{ + return S_ISLNK(inode->i_mode) ? + inode->i_sb->s_blocksize : F2FS_NAME_LEN; +} + +static struct fscrypt_operations f2fs_cryptops = { + .get_context = f2fs_get_context, + .key_prefix = f2fs_key_prefix, + .set_context = f2fs_set_context, + .is_encrypted = f2fs_encrypted_inode, + .empty_dir = f2fs_empty_dir, + .max_namelen = f2fs_max_namelen, +}; +#else +static struct fscrypt_operations f2fs_cryptops = { + .is_encrypted = f2fs_encrypted_inode, +}; +#endif + static struct inode *f2fs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { @@ -898,7 +1209,7 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static loff_t max_file_size(unsigned bits) +static loff_t max_file_blocks(void) { loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); loff_t leaf_count = ADDRS_PER_BLOCK; @@ -914,13 +1225,29 @@ static loff_t max_file_size(unsigned bits) leaf_count *= NIDS_PER_BLOCK; result += leaf_count; - result <<= bits; return result; } -static inline bool sanity_check_area_boundary(struct super_block *sb, - struct f2fs_super_block *raw_super) +static int __f2fs_commit_super(struct buffer_head *bh, + struct f2fs_super_block *super) { + lock_buffer(bh); + if (super) + memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + unlock_buffer(bh); + + /* it's rare case, we can do fua all the time */ + return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); +} + +static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, + struct buffer_head *bh) +{ + struct f2fs_super_block *raw_super = (struct f2fs_super_block *) + (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr); u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr); @@ -934,6 +1261,10 @@ static inline bool sanity_check_area_boundary(struct super_block *sb, u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main); u32 segment_count = le32_to_cpu(raw_super->segment_count); u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + u64 main_end_blkaddr = main_blkaddr + + (segment_count_main << log_blocks_per_seg); + u64 seg_end_blkaddr = segment0_blkaddr + + (segment_count << log_blocks_per_seg); if (segment0_blkaddr != cp_blkaddr) { f2fs_msg(sb, KERN_INFO, @@ -978,22 +1309,47 @@ static inline bool sanity_check_area_boundary(struct super_block *sb, return 
true; } - if (main_blkaddr + (segment_count_main << log_blocks_per_seg) != - segment0_blkaddr + (segment_count << log_blocks_per_seg)) { + if (main_end_blkaddr > seg_end_blkaddr) { f2fs_msg(sb, KERN_INFO, - "Wrong MAIN_AREA boundary, start(%u) end(%u) blocks(%u)", + "Wrong MAIN_AREA boundary, start(%u) end(%u) block(%u)", main_blkaddr, - segment0_blkaddr + (segment_count << log_blocks_per_seg), + segment0_blkaddr + + (segment_count << log_blocks_per_seg), segment_count_main << log_blocks_per_seg); return true; - } + } else if (main_end_blkaddr < seg_end_blkaddr) { + int err = 0; + char *res; + /* fix in-memory information all the time */ + raw_super->segment_count = cpu_to_le32((main_end_blkaddr - + segment0_blkaddr) >> log_blocks_per_seg); + + if (f2fs_readonly(sb) || bdev_read_only(sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + res = "internally"; + } else { + err = __f2fs_commit_super(bh, NULL); + res = err ? "failed" : "done"; + } + f2fs_msg(sb, KERN_INFO, + "Fix alignment : %s, start(%u) end(%u) block(%u)", + res, main_blkaddr, + segment0_blkaddr + + (segment_count << log_blocks_per_seg), + segment_count_main << log_blocks_per_seg); + if (err) + return true; + } return false; } -static int sanity_check_raw_super(struct super_block *sb, - struct f2fs_super_block *raw_super) +static int sanity_check_raw_super(struct f2fs_sb_info *sbi, + struct buffer_head *bh) { + struct f2fs_super_block *raw_super = (struct f2fs_super_block *) + (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; unsigned int blocksize; if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { @@ -1004,10 +1360,10 @@ static int sanity_check_raw_super(struct super_block *sb, } /* Currently, support only 4KB page cache size */ - if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) { + if (F2FS_BLKSIZE != PAGE_SIZE) { f2fs_msg(sb, KERN_INFO, "Invalid page_cache_size (%lu), supports only 4KB\n", - PAGE_CACHE_SIZE); + PAGE_SIZE); return 1; } @@ -1059,27 +1415,18 @@ static int sanity_check_raw_super(struct super_block *sb, return 1; } - if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { - f2fs_msg(sb, KERN_INFO, - "Invalid segment count (%u)", - le32_to_cpu(raw_super->segment_count)); - return 1; - } - /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ - if (sanity_check_area_boundary(sb, raw_super)) + if (sanity_check_area_boundary(sbi, bh)) return 1; return 0; } -static int sanity_check_ckpt(struct f2fs_sb_info *sbi) +int sanity_check_ckpt(struct f2fs_sb_info *sbi) { unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - unsigned int main_segs, blocks_per_seg; - int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1091,22 +1438,6 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; - main_segs = le32_to_cpu(sbi->raw_super->segment_count_main); - blocks_per_seg = sbi->blocks_per_seg; - - for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { - if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || - le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) { - return 1; - } - } - for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { - if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || - le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) { - return 1; - } - } - if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; @@ -1117,7 +1448,6 @@ static int sanity_check_ckpt(struct 
f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; - int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1137,111 +1467,131 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->cur_victim_sec = NULL_SECNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; - for (i = 0; i < NR_COUNT_TYPE; i++) - atomic_set(&sbi->nr_pages[i], 0); - sbi->dir_level = DEF_DIR_LEVEL; - sbi->cp_interval = DEF_CP_INTERVAL; + sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; + sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); + mutex_init(&sbi->wio_mutex[NODE]); + mutex_init(&sbi->wio_mutex[DATA]); + spin_lock_init(&sbi->cp_lock); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION + memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, + F2FS_KEY_DESC_PREFIX_SIZE); + sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE; +#endif +} + +static int init_percpu_info(struct f2fs_sb_info *sbi) +{ + int i, err; + + for (i = 0; i < NR_COUNT_TYPE; i++) { + err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); + if (err) + return err; + } + + err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); + if (err) + return err; + + return percpu_counter_init(&sbi->total_valid_inode_count, 0, + GFP_KERNEL); } /* * Read f2fs raw super block. - * Because we have two copies of super block, so read the first one at first, - * if the first one is invalid, move to read the second one. + * Because we have two copies of super block, so read both of them + * to get the first valid one. If any one of them is broken, we pass + * them recovery flag back to the caller. */ -static int read_raw_super_block(struct super_block *sb, +static int read_raw_super_block(struct f2fs_sb_info *sbi, struct f2fs_super_block **raw_super, - struct buffer_head **raw_super_buf, - int *recovery) + int *valid_super_block, int *recovery) { - int block = 0; - struct buffer_head *buffer; + struct super_block *sb = sbi->sb; + int block; + struct buffer_head *bh; struct f2fs_super_block *super; int err = 0; -retry: - buffer = sb_bread(sb, block); - if (!buffer) { - *recovery = 1; - f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", + super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL); + if (!super) + return -ENOMEM; + + for (block = 0; block < 2; block++) { + bh = sb_bread(sb, block); + if (!bh) { + f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", block + 1); - if (block == 0) { - block++; - goto retry; - } else { err = -EIO; - goto out; + continue; } - } - super = (struct f2fs_super_block *) - ((char *)(buffer)->b_data + F2FS_SUPER_OFFSET); - - /* sanity checking of raw super */ - if (sanity_check_raw_super(sb, super)) { - brelse(buffer); - *recovery = 1; - f2fs_msg(sb, KERN_ERR, - "Can't find valid F2FS filesystem in %dth superblock", - block + 1); - if (block == 0) { - block++; - goto retry; - } else { + /* sanity checking of raw super */ + if (sanity_check_raw_super(sbi, bh)) { + f2fs_msg(sb, KERN_ERR, + "Can't find valid F2FS filesystem in %dth superblock", + block + 1); err = -EINVAL; - goto out; + brelse(bh); + continue; } + + if (!*raw_super) { + memcpy(super, bh->b_data + F2FS_SUPER_OFFSET, + sizeof(*super)); + *valid_super_block = block; + *raw_super = super; + } + brelse(bh); } - if (!*raw_super) { - *raw_super_buf = buffer; - *raw_super = super; - } else { - /* already have a valid superblock */ - brelse(buffer); - } + /* Fail to read 
any one of the superblocks*/ + if (err < 0) + *recovery = 1; - /* check the validity of the second superblock */ - if (block == 0) { - block++; - goto retry; - } - -out: /* No valid superblock */ if (!*raw_super) - return err; + kfree(super); + else + err = 0; - return 0; + return err; } int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) { - struct buffer_head *sbh = sbi->raw_super_buf; - sector_t block = sbh->b_blocknr; + struct buffer_head *bh; int err; - /* write back-up superblock first */ - sbh->b_blocknr = block ? 0 : 1; - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); + if ((recover && f2fs_readonly(sbi->sb)) || + bdev_read_only(sbi->sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + return -EROFS; + } - sbh->b_blocknr = block; + /* write back-up superblock first */ + bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 0: 1); + if (!bh) + return -EIO; + err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); + brelse(bh); /* if we are in recovery path, skip writing valid superblock */ if (recover || err) - goto out; + return err; /* write current valid superblock */ - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); -out: - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); + bh = sb_getblk(sbi->sb, sbi->valid_super_block); + if (!bh) + return -EIO; + err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); + brelse(bh); return err; } @@ -1249,17 +1599,17 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; struct f2fs_super_block *raw_super; - struct buffer_head *raw_super_buf; struct inode *root; - long err; + int err; bool retry = true, need_fsck = false; char *options = NULL; - int recovery, i; + int recovery, i, valid_super_block; + struct curseg_info *seg_i; try_onemore: err = -EINVAL; raw_super = NULL; - raw_super_buf = NULL; + valid_super_block = -1; recovery = 0; /* allocate memory for f2fs-specific super block info */ @@ -1267,17 +1617,31 @@ try_onemore: if (!sbi) return -ENOMEM; + sbi->sb = sb; + + /* Load the checksum driver */ + sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); + if (IS_ERR(sbi->s_chksum_driver)) { + f2fs_msg(sb, KERN_ERR, "Cannot load crc32 driver."); + err = PTR_ERR(sbi->s_chksum_driver); + sbi->s_chksum_driver = NULL; + goto free_sbi; + } + /* set a block size */ if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); goto free_sbi; } - err = read_raw_super_block(sb, &raw_super, &raw_super_buf, &recovery); + err = read_raw_super_block(sbi, &raw_super, &valid_super_block, + &recovery); if (err) goto free_sbi; sb->s_fs_info = sbi; + sbi->raw_super = raw_super; + default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -1290,11 +1654,14 @@ try_onemore: if (err) goto free_options; - sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); + sbi->max_file_blocks = max_file_blocks(); + sb->s_maxbytes = sbi->max_file_blocks << + le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); sb->s_op = &f2fs_sops; + sb->s_cop = &f2fs_cryptops; sb->s_xattr = f2fs_xattr_handlers; sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; @@ -1304,11 +1671,8 @@ try_onemore: memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); /* init f2fs-specific super block info */ - sbi->sb = sb; - sbi->raw_super = raw_super; - sbi->raw_super_buf = raw_super_buf; + sbi->valid_super_block = 
valid_super_block; mutex_init(&sbi->gc_mutex); - mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); @@ -1329,6 +1693,10 @@ try_onemore: init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); + err = init_percpu_info(sbi); + if (err) + goto free_options; + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { @@ -1343,24 +1711,19 @@ try_onemore: goto free_meta_inode; } - /* sanity checking of checkpoint */ - err = -EINVAL; - if (sanity_check_ckpt(sbi)) { - f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint"); - goto free_cp; - } - sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); - sbi->total_valid_inode_count = - le32_to_cpu(sbi->ckpt->valid_inode_count); + percpu_counter_set(&sbi->total_valid_inode_count, + le32_to_cpu(sbi->ckpt->valid_inode_count)); sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count); sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; - INIT_LIST_HEAD(&sbi->dir_inode_list); - spin_lock_init(&sbi->dir_inode_lock); + + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } init_extent_cache_info(sbi); @@ -1380,6 +1743,17 @@ try_onemore: goto free_nm; } + /* For write statistics */ + if (sb->s_bdev->bd_part) + sbi->sectors_written_start = + (u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]); + + /* Read accumulated write IO statistics if exists */ + seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); + if (__exist_node_summaries(sbi)) + sbi->kbytes_written = + le64_to_cpu(seg_i->journal->info.kbytes_written); + build_gc_manager(sbi); /* get an inode for node space */ @@ -1423,9 +1797,12 @@ try_onemore: if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - if (sbi->s_proc) + if (sbi->s_proc) { proc_create_data("segment_info", S_IRUGO, sbi->s_proc, &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_bits_fops, sb); + } sbi->s_kobj.kset = f2fs_kset; init_completion(&sbi->s_kobj_unregister); @@ -1441,7 +1818,7 @@ try_onemore: * previous checkpoint was not done by clean system shutdown. 
*/ if (bdev_read_only(sb->s_bdev) && - !is_set_ckpt_flags(sbi->ckpt, CP_UMOUNT_FLAG)) { + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; goto free_kobj; } @@ -1449,14 +1826,27 @@ try_onemore: if (need_fsck) set_sbi_flag(sbi, SBI_NEED_FSCK); - err = recover_fsync_data(sbi); - if (err) { + if (!retry) + goto skip_recovery; + + err = recover_fsync_data(sbi, false); + if (err < 0) { need_fsck = true; f2fs_msg(sb, KERN_ERR, - "Cannot recover all fsync data errno=%ld", err); + "Cannot recover all fsync data errno=%d", err); + goto free_kobj; + } + } else { + err = recover_fsync_data(sbi, true); + + if (!f2fs_readonly(sb) && err > 0) { + err = -EINVAL; + f2fs_msg(sb, KERN_ERR, + "Need to recover fsync data"); goto free_kobj; } } +skip_recovery: /* recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -1473,20 +1863,26 @@ try_onemore: kfree(options); /* recover broken superblock */ - if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) { - f2fs_msg(sb, KERN_INFO, "Recover invalid superblock"); - f2fs_commit_super(sbi, true); + if (recovery) { + err = f2fs_commit_super(sbi, true); + f2fs_msg(sb, KERN_INFO, + "Try to recover %dth superblock, ret: %d", + sbi->valid_super_block ? 1 : 2, err); } - sbi->cp_expires = round_jiffies_up(jiffies); - + f2fs_update_time(sbi, CP_TIME); + f2fs_update_time(sbi, REQ_TIME); return 0; free_kobj: + f2fs_sync_inode_meta(sbi); kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); free_proc: if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } f2fs_destroy_stats(sbi); @@ -1494,7 +1890,9 @@ free_root_inode: dput(sb->s_root); sb->s_root = NULL; free_node_inode: + truncate_inode_pages_final(NODE_MAPPING(sbi)); mutex_lock(&sbi->umount_mutex); + release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); @@ -1502,16 +1900,18 @@ free_nm: destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); -free_cp: kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); free_options: + destroy_percpu_info(sbi); kfree(options); free_sb_buf: - brelse(raw_super_buf); + kfree(raw_super); free_sbi: + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); kfree(sbi); /* give only one another chance */ @@ -1547,8 +1947,9 @@ MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { - f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info)); + f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", + sizeof(struct f2fs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT, NULL); if (!f2fs_inode_cachep) return -ENOMEM; return 0; @@ -1590,25 +1991,23 @@ static int __init init_f2fs_fs(void) err = -ENOMEM; goto free_extent_cache; } - err = f2fs_init_crypto(); - if (err) - goto free_kset; - err = register_shrinker(&f2fs_shrinker_info); if (err) - goto free_crypto; + goto free_kset; err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; - f2fs_create_root_stats(); + err = f2fs_create_root_stats(); + if (err) + goto free_filesystem; f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; +free_filesystem: + unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); -free_crypto: - f2fs_exit_crypto(); free_kset: kset_unregister(f2fs_kset); free_extent_cache: @@ -1629,15 +2028,14 @@ static void 
__exit exit_f2fs_fs(void) { remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); - unregister_shrinker(&f2fs_shrinker_info); unregister_filesystem(&f2fs_fs_type); - f2fs_exit_crypto(); + unregister_shrinker(&f2fs_shrinker_info); + kset_unregister(f2fs_kset); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); destroy_node_manager_caches(); destroy_inodecache(); - kset_unregister(f2fs_kset); f2fs_destroy_trace_ios(); } diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 145fb659ad44..562ce0821559 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -29,7 +29,8 @@ static inline void __print_last_io(void) last_io.major, last_io.minor, last_io.pid, "----------------", last_io.type, - last_io.fio.rw, last_io.fio.blk_addr, + last_io.fio.rw, + last_io.fio.new_blkaddr, last_io.len); memset(&last_io, 0, sizeof(last_io)); } @@ -101,7 +102,8 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) last_io.pid == pid && last_io.type == __file_type(inode, pid) && last_io.fio.rw == fio->rw && - last_io.fio.blk_addr + last_io.len == fio->blk_addr) { + last_io.fio.new_blkaddr + last_io.len == + fio->new_blkaddr) { last_io.len++; return; } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 862368a32e53..69c6bb9cf207 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -151,7 +151,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -264,18 +264,20 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } -static void *read_all_xattrs(struct inode *inode, struct page *ipage) +static int read_all_xattrs(struct inode *inode, struct page *ipage, + void **base_addr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; size_t size = PAGE_SIZE, inline_size = 0; void *txattr_addr; + int err; inline_size = inline_xattr_size(inode); txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); if (!txattr_addr) - return NULL; + return -ENOMEM; /* read from inline xattr */ if (inline_size) { @@ -286,8 +288,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) inline_addr = inline_xattr_addr(ipage); } else { page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) + if (IS_ERR(page)) { + err = PTR_ERR(page); goto fail; + } inline_addr = inline_xattr_addr(page); } memcpy(txattr_addr, inline_addr, inline_size); @@ -301,8 +305,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) /* The inode already has an extended attribute block. 
*/ xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); - if (IS_ERR(xpage)) + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); goto fail; + } xattr_addr = page_address(xpage); memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); @@ -316,10 +322,11 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); header->h_refcount = cpu_to_le32(1); } - return txattr_addr; + *base_addr = txattr_addr; + return 0; fail: kzfree(txattr_addr); - return NULL; + return err; } static inline int write_all_xattrs(struct inode *inode, __u32 hsize, @@ -345,7 +352,8 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (ipage) { inline_addr = inline_xattr_addr(ipage); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); + set_page_dirty(ipage); } else { page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -353,7 +361,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, return PTR_ERR(page); } inline_addr = inline_xattr_addr(page); - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); } memcpy(inline_addr, txattr_addr, inline_size); f2fs_put_page(page, 1); @@ -374,7 +382,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, return PTR_ERR(xpage); } f2fs_bug_on(sbi, new_nid); - f2fs_wait_on_page_writeback(xpage, NODE); + f2fs_wait_on_page_writeback(xpage, NODE, true); } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); @@ -412,9 +420,9 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - base_addr = read_all_xattrs(inode, ipage); - if (!base_addr) - return -ENOMEM; + error = read_all_xattrs(inode, ipage, &base_addr); + if (error) + return error; entry = __find_xattr(base_addr, index, len, name); if (IS_XATTR_LAST_ENTRY(entry)) { @@ -448,9 +456,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error = 0; size_t rest = buffer_size; - base_addr = read_all_xattrs(inode, NULL); - if (!base_addr) - return -ENOMEM; + error = read_all_xattrs(inode, NULL, &base_addr); + if (error) + return error; list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = @@ -481,13 +489,12 @@ static int __f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *ipage, int flags) { - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_xattr_entry *here, *last; void *base_addr; int found, newsize; size_t len; __u32 new_hsize; - int error = -ENOMEM; + int error = 0; if (name == NULL) return -EINVAL; @@ -503,9 +510,9 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (size > MAX_VALUE_LEN(inode)) return -E2BIG; - base_addr = read_all_xattrs(inode, ipage); - if (!base_addr) - goto exit; + error = read_all_xattrs(inode, ipage, &base_addr); + if (error) + return error; /* find entry with wanted name. */ here = __find_xattr(base_addr, index, len, name); @@ -538,7 +545,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, free = free + ENTRY_SIZE(here); if (unlikely(free < newsize)) { - error = -ENOSPC; + error = -E2BIG; goto exit; } } @@ -566,7 +573,6 @@ static int __f2fs_setxattr(struct inode *inode, int index, * Before we come here, old entry is removed. * We just write new entry. 
*/ - memset(last, 0, newsize); last->e_name_index = index; last->e_name_len = len; memcpy(last->e_name, name, len); @@ -580,19 +586,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (error) goto exit; - if (is_inode_flag_set(fi, FI_ACL_MODE)) { - inode->i_mode = fi->i_acl_mode; + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; inode->i_ctime = CURRENT_TIME; - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); } if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - - if (ipage) - update_inode(inode, ipage); - else - update_inode_page(inode); + f2fs_mark_inode_dirty_sync(inode); + if (!error && S_ISDIR(inode->i_mode)) + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: kzfree(base_addr); return error; @@ -609,7 +613,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, if (ipage) return __f2fs_setxattr(inode, index, name, value, size, ipage, flags); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); /* protect xattr_ver */ @@ -618,5 +622,6 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); + f2fs_update_time(sbi, REQ_TIME); return err; } diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 71a7100d5492..d2fd0387a3c7 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -126,7 +126,8 @@ extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); #define f2fs_xattr_handlers NULL static inline int f2fs_setxattr(struct inode *inode, int index, - const char *name, const void *value, size_t size, int flags) + const char *name, const void *value, size_t size, + struct page *page, int flags) { return -EOPNOTSUPP; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 01448e01b40d..c066f6b56e58 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -229,6 +229,7 @@ struct dentry_operations { #define DCACHE_MAY_FREE 0x00800000 #define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ #define DCACHE_OP_SELECT_INODE 0x02000000 /* Unioned entry: dcache op selects inode */ +#define DCACHE_ENCRYPTED_WITH_KEY 0x04000000 /* dir is encrypted with a valid key */ #define DCACHE_OP_REAL 0x08000000 extern seqlock_t rename_lock; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 3d6e6ce44c5c..e46e7d10312b 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -21,7 +21,7 @@ #define F2FS_BLKSIZE 4096 /* support only 4KB block */ #define F2FS_BLKSIZE_BITS 12 /* bits for F2FS_BLKSIZE */ #define F2FS_MAX_EXTENSION 64 /* # of extension entries */ -#define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) / F2FS_BLKSIZE) +#define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) >> F2FS_BLKSIZE_BITS) #define NULL_ADDR ((block_t)0) /* used as block_t addresses */ #define NEW_ADDR ((block_t)-1) /* used as block_t addresses */ @@ -51,6 +51,7 @@ #define MAX_ACTIVE_DATA_LOGS 8 #define VERSION_LEN 256 +#define MAX_VOLUME_NAME 512 /* * For superblock @@ -84,7 +85,7 @@ struct f2fs_super_block { __le32 node_ino; /* node inode number */ __le32 meta_ino; /* meta inode number */ __u8 uuid[16]; /* 128-bit uuid for volume */ - __le16 volume_name[512]; /* volume name */ + __le16 volume_name[MAX_VOLUME_NAME]; /* volume name */ __le32 extension_count; /* # of extensions below */ __u8 extension_list[F2FS_MAX_EXTENSION][8]; /* extension array */ __le32 cp_payload; @@ -99,6 +100,7 @@ struct f2fs_super_block 
{ /* * For checkpoint */ +#define CP_CRC_RECOVERY_FLAG 0x00000040 #define CP_FASTBOOT_FLAG 0x00000020 #define CP_FSCK_FLAG 0x00000010 #define CP_ERROR_FLAG 0x00000008 @@ -169,12 +171,12 @@ struct f2fs_extent { #define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */ #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ #define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */ -#define ADDRS_PER_INODE(fi) addrs_per_inode(fi) +#define ADDRS_PER_INODE(inode) addrs_per_inode(inode) #define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */ #define NIDS_PER_BLOCK 1018 /* Node IDs in an Indirect Block */ -#define ADDRS_PER_PAGE(page, fi) \ - (IS_INODE(page) ? ADDRS_PER_INODE(fi) : ADDRS_PER_BLOCK) +#define ADDRS_PER_PAGE(page, inode) \ + (IS_INODE(page) ? ADDRS_PER_INODE(inode) : ADDRS_PER_BLOCK) #define NODE_DIR1_BLOCK (DEF_ADDRS_PER_INODE + 1) #define NODE_DIR2_BLOCK (DEF_ADDRS_PER_INODE + 2) @@ -261,7 +263,7 @@ struct f2fs_node { /* * For NAT entries */ -#define NAT_ENTRY_PER_BLOCK (PAGE_CACHE_SIZE / sizeof(struct f2fs_nat_entry)) +#define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry)) struct f2fs_nat_entry { __u8 version; /* latest version of cached nat entry */ @@ -281,7 +283,7 @@ struct f2fs_nat_block { * Not allow to change this. */ #define SIT_VBLOCK_MAP_SIZE 64 -#define SIT_ENTRY_PER_BLOCK (PAGE_CACHE_SIZE / sizeof(struct f2fs_sit_entry)) +#define SIT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_sit_entry)) /* * F2FS uses 4 bytes to represent block address. As a result, supported size of @@ -350,7 +352,7 @@ struct f2fs_summary { struct summary_footer { unsigned char entry_type; /* SUM_TYPE_XXX */ - __u32 check_sum; /* summary checksum */ + __le32 check_sum; /* summary checksum */ } __packed; #define SUM_JOURNAL_SIZE (F2FS_BLKSIZE - SUM_FOOTER_SIZE -\ @@ -363,6 +365,12 @@ struct summary_footer { sizeof(struct sit_journal_entry)) #define SIT_JOURNAL_RESERVED ((SUM_JOURNAL_SIZE - 2) %\ sizeof(struct sit_journal_entry)) + +/* Reserved area should make size of f2fs_extra_info equals to + * that of nat_journal and sit_journal. 
+ */ +#define EXTRA_INFO_RESERVED (SUM_JOURNAL_SIZE - 2 - 8) + /* * frequently updated NAT/SIT entries can be stored in the spare area in * summary blocks @@ -392,18 +400,28 @@ struct sit_journal { __u8 reserved[SIT_JOURNAL_RESERVED]; } __packed; -/* 4KB-sized summary block structure */ -struct f2fs_summary_block { - struct f2fs_summary entries[ENTRIES_IN_SUM]; +struct f2fs_extra_info { + __le64 kbytes_written; + __u8 reserved[EXTRA_INFO_RESERVED]; +} __packed; + +struct f2fs_journal { union { __le16 n_nats; __le16 n_sits; }; - /* spare area is used by NAT or SIT journals */ + /* spare area is used by NAT or SIT journals or extra info */ union { struct nat_journal nat_j; struct sit_journal sit_j; + struct f2fs_extra_info info; }; +} __packed; + +/* 4KB-sized summary block structure */ +struct f2fs_summary_block { + struct f2fs_summary entries[ENTRIES_IN_SUM]; + struct f2fs_journal journal; struct summary_footer footer; } __packed; @@ -497,4 +515,6 @@ enum { F2FS_FT_MAX }; +#define S_SHIFT 12 + #endif /* _LINUX_F2FS_FS_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index bff4ce57b77e..5b79adb9782e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -52,6 +52,8 @@ struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; +struct fscrypt_info; +struct fscrypt_operations; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -677,6 +679,9 @@ struct inode { struct hlist_head i_fsnotify_marks; #endif +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + struct fscrypt_info *i_crypt_info; +#endif void *i_private; /* fs or device private pointer */ }; @@ -1337,6 +1342,8 @@ struct super_block { #endif const struct xattr_handler **s_xattr; + const struct fscrypt_operations *s_cop; + struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h new file mode 100644 index 000000000000..76cff18bb032 --- /dev/null +++ b/include/linux/fscrypto.h @@ -0,0 +1,435 @@ +/* + * General per-file encryption definition + * + * Copyright (C) 2015, Google, Inc. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. 
+ */ + +#ifndef _LINUX_FSCRYPTO_H +#define _LINUX_FSCRYPTO_H + +#include +#include +#include +#include +#include +#include +#include + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 +#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 + +#define FS_POLICY_FLAGS_PAD_4 0x00 +#define FS_POLICY_FLAGS_PAD_8 0x01 +#define FS_POLICY_FLAGS_PAD_16 0x02 +#define FS_POLICY_FLAGS_PAD_32 0x03 +#define FS_POLICY_FLAGS_PAD_MASK 0x03 +#define FS_POLICY_FLAGS_VALID 0x03 + +/* Encryption algorithms */ +#define FS_ENCRYPTION_MODE_INVALID 0 +#define FS_ENCRYPTION_MODE_AES_256_XTS 1 +#define FS_ENCRYPTION_MODE_AES_256_GCM 2 +#define FS_ENCRYPTION_MODE_AES_256_CBC 3 +#define FS_ENCRYPTION_MODE_AES_256_CTS 4 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +} __packed; + +/* Encryption parameters */ +#define FS_XTS_TWEAK_SIZE 16 +#define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_256_GCM_KEY_SIZE 32 +#define FS_AES_256_CBC_KEY_SIZE 32 +#define FS_AES_256_CTS_KEY_SIZE 32 +#define FS_AES_256_XTS_KEY_SIZE 64 +#define FS_MAX_KEY_SIZE 64 + +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +/* This is passed in from userspace into the kernel keyring */ +struct fscrypt_key { + u32 mode; + u8 raw[FS_MAX_KEY_SIZE]; + u32 size; +} __packed; + +struct fscrypt_info { + u8 ci_data_mode; + u8 ci_filename_mode; + u8 ci_flags; + struct crypto_skcipher *ci_ctfm; + struct key *ci_keyring_key; + u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; +}; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define FS_WRITE_PATH_FL 0x00000002 + +struct fscrypt_ctx { + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + u8 flags; /* Flags */ + u8 mode; /* Encryption mode for tfm */ +}; + +struct fscrypt_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_FS_COMPLETION_RESULT(ecr) \ + struct fscrypt_completion_result ecr = { \ + COMPLETION_INITIALIZER((ecr).completion), 0 } + +static inline int fscrypt_key_size(int mode) +{ + switch (mode) { + case FS_ENCRYPTION_MODE_AES_256_XTS: + return FS_AES_256_XTS_KEY_SIZE; + case FS_ENCRYPTION_MODE_AES_256_GCM: + return FS_AES_256_GCM_KEY_SIZE; + case FS_ENCRYPTION_MODE_AES_256_CBC: + return FS_AES_256_CBC_KEY_SIZE; + case FS_ENCRYPTION_MODE_AES_256_CTS: + return FS_AES_256_CTS_KEY_SIZE; + default: + BUG(); + } + return 0; +} + +#define FS_FNAME_NUM_SCATTER_ENTRIES 4 +#define FS_CRYPTO_BLOCK_SIZE 16 +#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 + +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct fscrypt_symlink_data { + __le16 len; + char encrypted_path[1]; +} __packed; + +/** + * This function is used to calculate the disk space required to + * store a filename of length l in encrypted symlink format. 
+ */ +static inline u32 fscrypt_symlink_data_len(u32 l) +{ + if (l < FS_CRYPTO_BLOCK_SIZE) + l = FS_CRYPTO_BLOCK_SIZE; + return (l + sizeof(struct fscrypt_symlink_data) - 1); +} + +struct fscrypt_str { + unsigned char *name; + u32 len; +}; + +struct fscrypt_name { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + u32 hash; + u32 minor_hash; + struct fscrypt_str crypto_buf; +}; + +#define FSTR_INIT(n, l) { .name = n, .len = l } +#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * crypto opertions for filesystems + */ +struct fscrypt_operations { + int (*get_context)(struct inode *, void *, size_t); + int (*key_prefix)(struct inode *, u8 **); + int (*prepare_context)(struct inode *); + int (*set_context)(struct inode *, const void *, size_t, void *); + int (*dummy_context)(struct inode *); + bool (*is_encrypted)(struct inode *); + bool (*empty_dir)(struct inode *); + unsigned (*max_namelen)(struct inode *); +}; + +static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +{ + if (inode->i_sb->s_cop->dummy_context && + inode->i_sb->s_cop->dummy_context(inode)) + return true; + return false; +} + +static inline bool fscrypt_valid_contents_enc_mode(u32 mode) +{ + return (mode == FS_ENCRYPTION_MODE_AES_256_XTS); +} + +static inline bool fscrypt_valid_filenames_enc_mode(u32 mode) +{ + return (mode == FS_ENCRYPTION_MODE_AES_256_CTS); +} + +static inline u32 fscrypt_validate_encryption_key_size(u32 mode, u32 size) +{ + if (size == fscrypt_key_size(mode)) + return size; + return 0; +} + +static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] == '.') + return true; + + if (str->len == 2 && str->name[0] == '.' 
&& str->name[1] == '.') + return true; + + return false; +} + +static inline struct page *fscrypt_control_page(struct page *page) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return ((struct fscrypt_ctx *)page_private(page))->w.control_page; +#else + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +#endif +} + +static inline int fscrypt_has_encryption_key(struct inode *inode) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return (inode->i_crypt_info != NULL); +#else + return 0; +#endif +} + +static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); +#endif +} + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +extern const struct dentry_operations fscrypt_d_ops; +#endif + +static inline void fscrypt_set_d_op(struct dentry *dentry) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + d_set_d_op(dentry, &fscrypt_d_ops); +#endif +} + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +/* crypto.c */ +extern struct kmem_cache *fscrypt_info_cachep; +int fscrypt_initialize(void); + +extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *, gfp_t); +extern void fscrypt_release_ctx(struct fscrypt_ctx *); +extern struct page *fscrypt_encrypt_page(struct inode *, struct page *, gfp_t); +extern int fscrypt_decrypt_page(struct page *); +extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); +extern void fscrypt_pullback_bio_page(struct page **, bool); +extern void fscrypt_restore_control_page(struct page *); +extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t, + unsigned int); +/* policy.c */ +extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *); +extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *); +extern int fscrypt_has_permitted_context(struct inode *, struct inode *); +extern int fscrypt_inherit_context(struct inode *, struct inode *, + void *, bool); +/* keyinfo.c */ +extern int get_crypt_info(struct inode *); +extern int fscrypt_get_encryption_info(struct inode *); +extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *); + +/* fname.c */ +extern int fscrypt_setup_filename(struct inode *, const struct qstr *, + int lookup, struct fscrypt_name *); +extern void fscrypt_free_filename(struct fscrypt_name *); +extern u32 fscrypt_fname_encrypted_size(struct inode *, u32); +extern int fscrypt_fname_alloc_buffer(struct inode *, u32, + struct fscrypt_str *); +extern void fscrypt_fname_free_buffer(struct fscrypt_str *); +extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, + const struct fscrypt_str *, struct fscrypt_str *); +extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, + struct fscrypt_str *); +#endif + +/* crypto.c */ +static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(struct inode *i, + gfp_t f) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void fscrypt_notsupp_release_ctx(struct fscrypt_ctx *c) +{ + return; +} + +static inline struct page *fscrypt_notsupp_encrypt_page(struct inode *i, + struct page *p, gfp_t f) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline int fscrypt_notsupp_decrypt_page(struct page *p) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_notsupp_decrypt_bio_pages(struct fscrypt_ctx *c, + struct bio *b) +{ + return; +} + +static inline void fscrypt_notsupp_pullback_bio_page(struct page **p, bool b) +{ + return; +} + +static inline void fscrypt_notsupp_restore_control_page(struct page *p) +{ + 
return; +} + +static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p, + sector_t s, unsigned int f) +{ + return -EOPNOTSUPP; +} + +/* policy.c */ +static inline int fscrypt_notsupp_process_policy(struct file *f, + const struct fscrypt_policy *p) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_notsupp_get_policy(struct inode *i, + struct fscrypt_policy *p) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_notsupp_has_permitted_context(struct inode *p, + struct inode *i) +{ + return 0; +} + +static inline int fscrypt_notsupp_inherit_context(struct inode *p, + struct inode *i, void *v, bool b) +{ + return -EOPNOTSUPP; +} + +/* keyinfo.c */ +static inline int fscrypt_notsupp_get_encryption_info(struct inode *i) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_notsupp_put_encryption_info(struct inode *i, + struct fscrypt_info *f) +{ + return; +} + + /* fname.c */ +static inline int fscrypt_notsupp_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct fscrypt_name *fname) +{ + if (dir->i_sb->s_cop->is_encrypted(dir)) + return -EOPNOTSUPP; + + memset(fname, 0, sizeof(struct fscrypt_name)); + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; +} + +static inline void fscrypt_notsupp_free_filename(struct fscrypt_name *fname) +{ + return; +} + +static inline u32 fscrypt_notsupp_fname_encrypted_size(struct inode *i, u32 s) +{ + /* never happens */ + WARN_ON(1); + return 0; +} + +static inline int fscrypt_notsupp_fname_alloc_buffer(struct inode *inode, + u32 ilen, struct fscrypt_str *crypto_str) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_notsupp_fname_free_buffer(struct fscrypt_str *c) +{ + return; +} + +static inline int fscrypt_notsupp_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_notsupp_fname_usr_to_disk(struct inode *inode, + const struct qstr *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} +#endif /* _LINUX_FSCRYPTO_H */ diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 00b4a6308249..3a09bb4dc3b2 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -52,6 +52,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { META_FLUSH, "META_FLUSH" }, \ { INMEM, "INMEM" }, \ { INMEM_DROP, "INMEM_DROP" }, \ + { INMEM_REVOKE, "INMEM_REVOKE" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) @@ -693,28 +694,32 @@ TRACE_EVENT(f2fs_direct_IO_exit, __entry->ret) ); -TRACE_EVENT(f2fs_reserve_new_block, +TRACE_EVENT(f2fs_reserve_new_blocks, - TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node), + TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node, + blkcnt_t count), - TP_ARGS(inode, nid, ofs_in_node), + TP_ARGS(inode, nid, ofs_in_node, count), TP_STRUCT__entry( __field(dev_t, dev) __field(nid_t, nid) __field(unsigned int, ofs_in_node) + __field(blkcnt_t, count) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = nid; __entry->ofs_in_node = ofs_in_node; + __entry->count = count; ), - TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u", + TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u, count = %llu", show_dev(__entry), (unsigned int)__entry->nid, - __entry->ofs_in_node) + __entry->ofs_in_node, + (unsigned long long)__entry->count) ); DECLARE_EVENT_CLASS(f2fs__submit_page_bio, @@ -727,7 +732,8 @@ 
DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __field(dev_t, dev) __field(ino_t, ino) __field(pgoff_t, index) - __field(block_t, blkaddr) + __field(block_t, old_blkaddr) + __field(block_t, new_blkaddr) __field(int, rw) __field(int, type) ), @@ -736,16 +742,18 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->dev = page->mapping->host->i_sb->s_dev; __entry->ino = page->mapping->host->i_ino; __entry->index = page->index; - __entry->blkaddr = fio->blk_addr; + __entry->old_blkaddr = fio->old_blkaddr; + __entry->new_blkaddr = fio->new_blkaddr; __entry->rw = fio->rw; __entry->type = fio->type; ), TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " - "blkaddr = 0x%llx, rw = %s%s, type = %s", + "oldaddr = 0x%llx, newaddr = 0x%llx rw = %s%s, type = %s", show_dev_ino(__entry), (unsigned long)__entry->index, - (unsigned long long)__entry->blkaddr, + (unsigned long long)__entry->old_blkaddr, + (unsigned long long)__entry->new_blkaddr, show_bio_type(__entry->rw), show_block_type(__entry->type)) ); @@ -1265,6 +1273,44 @@ TRACE_EVENT(f2fs_destroy_extent_tree, __entry->node_cnt) ); +DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, + + TP_PROTO(struct super_block *sb, int type, s64 count), + + TP_ARGS(sb, type, count), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(s64, count) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->type = type; + __entry->count = count; + ), + + TP_printk("dev = (%d,%d), %s, dirty count = %lld", + show_dev(__entry), + show_file_type(__entry->type), + __entry->count) +); + +DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_enter, + + TP_PROTO(struct super_block *sb, int type, s64 count), + + TP_ARGS(sb, type, count) +); + +DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_exit, + + TP_PROTO(struct super_block *sb, int type, s64 count), + + TP_ARGS(sb, type, count) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index cd0c1a1a9ccf..15048097910f 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -172,6 +172,24 @@ struct inodes_stat_t { #define FS_IOC32_GETVERSION _IOR('v', 1, int) #define FS_IOC32_SETVERSION _IOW('v', 2, int) +/* + * File system encryption support + */ +/* Policy provided via an ioctl on the topmost directory */ +#define FS_KEY_DESCRIPTOR_SIZE 8 + +struct fscrypt_policy { + __u8 version; + __u8 contents_encryption_mode; + __u8 filenames_encryption_mode; + __u8 flags; + __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; +} __packed; + +#define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy) +#define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) +#define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy) + /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) */ From ae81ccb3bdcd879f889d36dcec76fccdd2f329c4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Oct 2016 13:38:41 -0700 Subject: [PATCH 1053/3220] f2fs: fix wrong sum_page pointer in f2fs_gc commit de0dcc40f6e24d6bac6b60e36eac4659bbbd3f00 upstream. This patch fixes using a wrong pointer for sum_page in f2fs_gc. 
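In outline, the fix looks up the summary page before evaluating any skip condition, so sum_page can never carry a stale value from a skipped iteration into page_address(). A simplified sketch of the resulting loop shape (condensed from the diff below; the real code uses a goto to a per-segment label):

	for (segno = start_segno; segno < end_segno; segno++) {
		/* fetch first: sum_page now always matches this segno */
		sum_page = find_get_page(META_MAPPING(sbi),
					GET_SUM_BLOCK(sbi, segno));
		f2fs_put_page(sum_page, 0);

		/* ...then decide whether to skip this segment */
		if (get_valid_blocks(sbi, segno, 1) == 0 ||
				!PageUptodate(sum_page) ||
				unlikely(f2fs_cp_error(sbi)))
			continue;

		sum = page_address(sum_page);
		/* ... process the victim's summary ... */
	}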
Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 0a0a1ad1fe1f..4336807cc690 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -848,16 +848,16 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, for (segno = start_segno; segno < end_segno; segno++) { - if (get_valid_blocks(sbi, segno, 1) == 0 || - unlikely(f2fs_cp_error(sbi))) - goto next; - /* find segment summary of victim */ sum_page = find_get_page(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); - f2fs_bug_on(sbi, !PageUptodate(sum_page)); f2fs_put_page(sum_page, 0); + if (get_valid_blocks(sbi, segno, 1) == 0 || + !PageUptodate(sum_page) || + unlikely(f2fs_cp_error(sbi))) + goto next; + sum = page_address(sum_page); f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer))); From 91d38ba8414bdbe7161d2d02eddf5e671db0024d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 22 Nov 2016 14:06:03 -0800 Subject: [PATCH 1054/3220] posix_acl: Clear SGID bit when setting file permissions commit 073931017b49d9458aa351605b43a7e34598caef upstream. Cherry-pick to f2fs only for generic/375 from: (073931017: posix_acl: Clear SGID bit when setting file permissions) Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index fb0744b94c2f..4a34040932e9 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -215,12 +215,10 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; set_acl_inode(inode, inode->i_mode); - if (error == 0) - acl = NULL; } break; From ab6f3626a82db73b2a490804fe6591750c5b2233 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Nov 2016 10:51:17 -0800 Subject: [PATCH 1055/3220] f2fs: fix overflow due to condition check order commit e87f7329bbd6760c2acc4f1eb423362b08851a71 upstream. In the last ilen case, i was already increased, resulting in accessing an out-of-bounds entry of do_replace and blkaddr. Fix by checking ilen first to exit the loop. Fixes: 2aa8fbb9693020 ("f2fs: refactor __exchange_data_block for speed up") Cc: stable@vger.kernel.org # 4.8+ Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c6e33258fabf..5c4ea4cf2fb1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -971,7 +971,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, new_size = (dst + i) << PAGE_SHIFT; if (dst_inode->i_size < new_size) f2fs_i_size_write(dst_inode, new_size); - } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen); + } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR)); f2fs_put_dnode(&dn); } else { From 6b266c3a9951abd948effac21d19ced0dee07dda Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 24 Nov 2016 12:45:15 -0800 Subject: [PATCH 1056/3220] f2fs: fix to determine start_cp_addr by sbi->cur_cp_pack commit 8508e44ae98622f841f5ef29d0bf3d5db4e0c1cc upstream. We don't guarantee that cp_addr is fixed by cp_version. This is to sync with f2fs-tools.
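To illustrate the intent (an editorial sketch, assuming the standard two-pack checkpoint layout where pack 1 starts at cp_blkaddr and pack 2 one segment later): the valid pack is now recorded in sbi->cur_cp_pack at mount time and flipped after each successful checkpoint, instead of being derived from checkpoint-version parity, which f2fs-tools does not preserve.

	/*
	 * Illustrative timeline, not literal kernel code:
	 *
	 *   mount:         cp pack 1 is the valid one -> cur_cp_pack = 1
	 *   checkpoint #1: __start_cp_next_addr() targets pack 2
	 *                  (cp_blkaddr + blocks_per_seg); on success,
	 *                  __set_cp_next_pack() flips cur_cp_pack to 2
	 *   checkpoint #2: writes pack 1 again, flips back to 1
	 *
	 * The writer always targets the pack not currently in use, so a
	 * crash mid-checkpoint leaves the previous pack intact.
	 */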
Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 +++++++- fs/f2fs/f2fs.h | 28 +++++++++++++++++----------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index cb23d6cf676b..1608ae8eea97 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -770,6 +770,11 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) if (sanity_check_ckpt(sbi)) goto fail_no_cp; + if (cur_page == cp1) + sbi->cur_cp_pack = 1; + else + sbi->cur_cp_pack = 2; + if (cp_blks <= 1) goto done; @@ -1121,7 +1126,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); - start_blk = __start_cp_addr(sbi); + start_blk = __start_cp_next_addr(sbi); /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); @@ -1185,6 +1190,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); + __set_cp_next_pack(sbi); /* * redirty superblock if metadata like node page or inode cache is diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index af293e84e5cd..45d1e4522760 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -789,6 +789,7 @@ struct f2fs_sb_info { /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + int cur_cp_pack; /* remain current cp pack */ spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ @@ -1354,22 +1355,27 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) { - block_t start_addr; - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - unsigned long long ckpt_version = cur_cp_version(ckpt); + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); - start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); - - /* - * odd numbered checkpoint should at cp segment 0 - * and even segment must be at cp segment 1 - */ - if (!(ckpt_version & 1)) + if (sbi->cur_cp_pack == 2) start_addr += sbi->blocks_per_seg; - return start_addr; } +static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi) +{ + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + + if (sbi->cur_cp_pack == 1) + start_addr += sbi->blocks_per_seg; + return start_addr; +} + +static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi) +{ + sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 2 : 1; +} + static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) { return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); From 311aa690ef7bd32732a65d957c2490143c7b9830 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:34 +0800 Subject: [PATCH 1057/3220] f2fs: exclude free nids building and allocation commit 2411cf5befa5804e4ced4c45a3212d7653869286 upstream. During nid allocation, we need to exclude the building and allocating flows of free nids from each other. This is because building the free nid cache takes two steps: a) load free nids from unused nat entries in NAT pages, b) update the free nid cache by checking the nat journal. The two steps should be atomic, otherwise a used nid can be allocated as a free one after a) and before b). This patch adds the missing lock which covers build_free_nids in unlock_operation and f2fs_balance_fs_bg to avoid that.
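The window being closed can be sketched as follows (an editorial illustration of the a)/b) race, not code from the patch):

	/*
	 * builder (build_free_nids)        allocator (alloc_nid)
	 * -------------------------        ---------------------
	 * a) NAT page says nid N is
	 *    unused -> add N to the
	 *    free nid cache
	 *                                  grab N from the cache and
	 *                                  hand it out as free
	 * b) nat journal says N is in
	 *    use -> drop N, but N has
	 *    already been given out
	 *
	 * Holding nm_i->build_lock across both a) and b), and taking the
	 * same lock from the allocation path, makes the two steps atomic
	 * with respect to allocation.
	 */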
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b1e615ed2bef..a8c2bd3e5029 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1786,7 +1786,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -void build_free_nids(struct f2fs_sb_info *sbi) +void __build_free_nids(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1840,6 +1840,13 @@ void build_free_nids(struct f2fs_sb_info *sbi) nm_i->ra_nid_pages, META_NAT, false); } +void build_free_nids(struct f2fs_sb_info *sbi) +{ + mutex_lock(&NM_I(sbi)->build_lock); + __build_free_nids(sbi); + mutex_unlock(&NM_I(sbi)->build_lock); +} + /* * If this function returns success, caller can obtain a new nid * from second parameter of this function. @@ -1876,9 +1883,7 @@ retry: spin_unlock(&nm_i->free_nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - mutex_lock(&nm_i->build_lock); build_free_nids(sbi); - mutex_unlock(&nm_i->build_lock); goto retry; } From ab38818bdd9a67f9ce6aade05f61e373d45ae132 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:00 +0800 Subject: [PATCH 1058/3220] f2fs: fix to release discard entries during checkpoint commit 2dd15654ac0abe587a245a09a7823bbbd588bfb7 upstream. In f2fs_fill_super, if any IO error occurs during recovery, cached discard entries will be leaked. In order to avoid this, make write_checkpoint() handle the memory release by itself; besides, move clear_prefree_segments into write_checkpoint for readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 ++++- fs/f2fs/super.c | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1608ae8eea97..63ca342a3cc8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1187,7 +1187,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (unlikely(f2fs_cp_error(sbi))) return -EIO; - clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); __set_cp_next_pack(sbi); @@ -1264,6 +1263,10 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); + if (err) + release_discard_addrs(sbi); + else + clear_prefree_segments(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fd249cc9b96e..006138a6c5ab 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -738,7 +738,6 @@ static void f2fs_put_super(struct super_block *sb) * In addition, EIO will skip do checkpoint, we need this as well. */ release_ino_entry(sbi, true); - release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); From 75bb19d8b7479bfa62aef3a5e13bf737144ef76b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:01 +0800 Subject: [PATCH 1059/3220] f2fs: give a chance to detach from dirty list commit 933439c8f3474e329709b715b43b0b8168bbecf8 upstream. If there are no dirty pages in an inode, we should give it a chance to detach from the global dirty list; otherwise writeback needs to make another unnecessary .writepages call just for detaching.
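The patch installs one recurring pattern at every site that cleans a dirty data page. A sketch of the pattern (remove_dirty_inode drops the inode from the global dirty list once its dirty-page count reaches zero):

	if (clear_page_dirty_for_io(page)) {
		inode_dec_dirty_pages(inode);
		/* last dirty page gone: detach now rather than let
		 * writeback issue a useless ->writepages call later */
		remove_dirty_inode(inode);
	}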
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 +++++--- fs/f2fs/dir.c | 1 + fs/f2fs/gc.c | 4 +++- fs/f2fs/inline.c | 4 +++- fs/f2fs/node.c | 1 + fs/f2fs/segment.c | 4 +++- 6 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fc9bfd8f566d..4000ccb85a78 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1817,12 +1817,14 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, return; if (PageDirty(page)) { - if (inode->i_ino == F2FS_META_INO(sbi)) + if (inode->i_ino == F2FS_META_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_META); - else if (inode->i_ino == F2FS_NODE_INO(sbi)) + } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_NODES); - else + } else { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } } /* This is atomic written page, keep Private */ diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index e634a637c443..c0dba11519cf 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -742,6 +742,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, ClearPagePrivate(page); ClearPageUptodate(page); inode_dec_dirty_pages(dir); + remove_dirty_inode(dir); } f2fs_put_page(page, 1); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 4336807cc690..72a0ca08f901 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -670,8 +670,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) retry: set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } set_cold_data(page); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index b150d03beec1..a1acbc67d827 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -154,8 +154,10 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) fio.old_blkaddr = dn->data_blkaddr; write_data_page(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); - if (dirty) + if (dirty) { inode_dec_dirty_pages(dn->inode); + remove_dirty_inode(dn->inode); + } /* this converted inline_data should be recovered. */ set_inode_flag(dn->inode, FI_APPEND_WRITE); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a8c2bd3e5029..97eb2c0811b5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1203,6 +1203,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) ret = f2fs_write_inline_data(inode, page); inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); if (ret) set_page_dirty(page); page_out: diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b3c61ae37f92..75477ec6c535 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -272,8 +272,10 @@ static int __commit_inmem_pages(struct inode *inode, set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } fio.page = page; err = do_write_data_page(&fio); From 4c7eae1fef0c32f292507632ed7cbc9b0416b00e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:02 +0800 Subject: [PATCH 1060/3220] f2fs: add missing f2fs_balance_fs in f2fs_zero_range commit 9434fcde1fa0f48e1a29fbdd9d436fa279aeb909 upstream. f2fs_balance_fs should be called between node page updates; otherwise the node page count will grow far beyond the watermark that triggers foreground garbage collection, resulting in a high risk of hitting LFS allocation failure.
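In shape, the fix follows f2fs's usual convention: do the node-page-modifying work under f2fs_lock_op(), then re-balance with the dn.node_changed hint once the op lock is dropped. Simplified from the hunk below:

	f2fs_lock_op(sbi);
	ret = f2fs_do_zero_range(&dn, index, end);	/* may dirty node pages */
	f2fs_put_dnode(&dn);
	f2fs_unlock_op(sbi);

	/* give GC a chance before the next batch, so dirty node pages
	 * cannot pile up far past the trigger watermark */
	f2fs_balance_fs(sbi, dn.node_changed);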
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5c4ea4cf2fb1..c0774c98dce4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1222,6 +1222,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + + f2fs_balance_fs(sbi, dn.node_changed); + if (ret) goto out; From 332f40b43f2f4a86eec4a2aaf1d3e3295ed673f2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:03 +0800 Subject: [PATCH 1061/3220] f2fs: don't miss any f2fs_balance_fs cases commit 6f2d8ed654bfa391854df4de854953f772a16a9d upstream. In f2fs_map_blocks, let f2fs_balance_fs detect node page modification via dn.node_changed, to avoid missing some corner cases. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4000ccb85a78..c390542917e4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -671,7 +671,6 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; struct extent_info ei; - bool allocated = false; block_t blkaddr; if (!maxblocks) @@ -730,10 +729,8 @@ next_block: } } else { err = __allocate_data_block(&dn); - if (!err) { + if (!err) set_inode_flag(inode, FI_APPEND_WRITE); - allocated = true; - } } if (err) goto sync_out; @@ -788,7 +785,6 @@ skip: err = reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; - allocated = dn.node_changed; map->m_len += dn.ofs_in_node - ofs_in_node; if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { @@ -807,9 +803,8 @@ skip: if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + f2fs_balance_fs(sbi, dn.node_changed); } - allocated = false; goto next_dnode; sync_out: @@ -817,7 +812,7 @@ sync_out: unlock_out: if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + f2fs_balance_fs(sbi, dn.node_changed); } out: trace_f2fs_map_blocks(inode, map, err); From 9e3d0bf6d3714bc3481e32828bd56eecc0700d29 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:04 +0800 Subject: [PATCH 1062/3220] f2fs: be aware of extent beyond EOF in fiemap commit 58736fa60f6ae659ac72da8b1580c308b47e8edd upstream. f2fs can support fallocating blocks beyond the file size without changing the size, but f2fs's ->fiemap was restricted and could not detect extents fallocated past EOF; now relieve the restriction.
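The key change, sketched: the hole-scanning branch of f2fs_fiemap used to stop at i_size, which made keep-size fallocated extents past EOF invisible; bounding the walk by the filesystem-wide maximum file size lets the scan reach them. Condensed from the hunk below:

	/* HOLE: keep scanning past EOF, up to the fs-wide limit, so
	 * extents preallocated with FALLOC_FL_KEEP_SIZE are reported */
	if (!buffer_mapped(&map_bh)) {
		start_blk = next_pgofs;

		if (blk_to_logical(inode, start_blk) <
		    blk_to_logical(inode, F2FS_I_SB(inode)->max_file_blocks))
			goto prep_next;

		flags |= FIEMAP_EXTENT_LAST;
	}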
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c390542917e4..0cdabfce6c17 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -881,7 +881,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct buffer_head map_bh; sector_t start_blk, last_blk; pgoff_t next_pgofs; - loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; @@ -898,13 +897,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, inode_lock(inode); - isize = i_size_read(inode); - if (start >= isize) - goto out; - - if (start + len > isize) - len = isize - start; - if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); @@ -923,13 +915,11 @@ next: /* HOLE */ if (!buffer_mapped(&map_bh)) { start_blk = next_pgofs; - /* Go through holes util pass the EOF */ - if (blk_to_logical(inode, start_blk) < isize) + + if (blk_to_logical(inode, start_blk) < blk_to_logical(inode, + F2FS_I_SB(inode)->max_file_blocks)) goto prep_next; - /* Found a hole beyond isize means no more extents. - * Note that the premise is that filesystems don't - * punch holes beyond isize and keep size unchanged. - */ + flags |= FIEMAP_EXTENT_LAST; } From 1158df42b2c66b4c9f150615ae37a1cc7edb3c74 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:05 +0800 Subject: [PATCH 1063/3220] f2fs: fix to update largest extent under lock commit b691d98fdd4cc2514c60fd6975e6016da203e64f upstream. In order to avoid a race, make sure the largest extent in the cache is updated under the lock. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d7369895a78a..1fbebcb33a9d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -252,6 +252,7 @@ retry: int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; + struct extent_tree *et = F2FS_I(inode)->extent_tree; f2fs_inode_synced(inode); @@ -267,11 +268,13 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(inode->i_blocks); - if (F2FS_I(inode)->extent_tree) - set_raw_extent(&F2FS_I(inode)->extent_tree->largest, - &ri->i_ext); - else + if (et) { + read_lock(&et->lock); + set_raw_extent(&et->largest, &ri->i_ext); + read_unlock(&et->lock); + } else { memset(&ri->i_ext, 0, sizeof(ri->i_ext)); + } set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); From c675400f4a1149234ba1efeeb8a7d6df8a4bb880 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:06 +0800 Subject: [PATCH 1064/3220] f2fs: fix error handling in fsync_node_pages commit 9de69279750e9740bc7221c7051a40c0516a58fb upstream. In fsync_node_pages, if f2fs is tagged with CP_ERROR_FLAG, make sure the bio cache is flushed before returning.
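Structurally, the fix replaces an early return with a jump to a common exit that still submits whatever node pages were already merged into the bio cache. In outline (condensed from the hunk below):

	if (unlikely(f2fs_cp_error(sbi))) {
		f2fs_put_page(last_page, 0);
		pagevec_release(&pvec);
		ret = -EIO;
		goto out;	/* still flush merged bios below */
	}
	/* ... */
out:
	if (nwritten)
		f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE);
	return ret ? -EIO : 0;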
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 97eb2c0811b5..bc38e5a92b4b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1338,7 +1338,8 @@ retry: if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); pagevec_release(&pvec); - return -EIO; + ret = -EIO; + goto out; } if (!IS_DNODE(page) || !is_cold_node(page)) @@ -1411,7 +1412,7 @@ continue_unlock: unlock_page(last_page); goto retry; } - +out: if (nwritten) f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); return ret ? -EIO: 0; From fbeee49e0605a0c723bd0d870d59175a764377c6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 11 Oct 2016 10:36:12 -0700 Subject: [PATCH 1065/3220] f2fs: fix sparse warnings commit 0c0b471e43e7acf0747c6eb410863bf78c14750d upstream. f2fs contained a number of endianness conversion bugs. Also, one function should have been 'static'. Found with sparse by running 'make C=2 CF=-D__CHECK_ENDIAN__ fs/f2fs/' Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- fs/f2fs/inline.c | 2 +- fs/f2fs/node.c | 5 +++-- fs/f2fs/node.h | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c0dba11519cf..7136dc1ade11 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -136,7 +136,7 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, /* show encrypted name */ if (fname->hash) { - if (de->hash_code == fname->hash) + if (de->hash_code == cpu_to_le32(fname->hash)) goto found; } else if (de_name.len == name->len && de->hash_code == namehash && diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a1acbc67d827..56363c18489a 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -438,7 +438,7 @@ static int f2fs_add_inline_entries(struct inode *dir, } new_name.name = d.filename[bit_pos]; - new_name.len = de->name_len; + new_name.len = le16_to_cpu(de->name_len); ino = le32_to_cpu(de->ino); fake_mode = get_de_type(de) << S_SHIFT; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bc38e5a92b4b..d2ba37a84f8e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -270,8 +270,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); } else { - f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino || - nat_get_blkaddr(e) != ne->block_addr || + f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || + nat_get_blkaddr(e) != + le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); } } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 868bec65e51c..cfdcf98516a1 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -313,7 +313,7 @@ static inline bool is_recoverable_dnode(struct page *page) ((unsigned char *)ckpt + crc_offset))); cp_ver |= (crc << 32); } - return cpu_to_le64(cp_ver) == cpver_of_node(page); + return cp_ver == cpver_of_node(page); } /* From 8db338877d72a24743a3808d4bc774f74a058f8f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:56:59 +0800 Subject: [PATCH 1066/3220] f2fs: clear nlink if fail to add_link commit a11b9f65eae766b17ec3451a6a1766f0a9d1dbff upstream. We don't need to keep an incompletely created inode in the cache, so if we fail to add a link into the directory during new inode creation, it's better to set the inode's nlink to zero; then we can evict the inode immediately.
Otherwise, release of the nid belonging to the inode will be delayed until the inode cache is shrunk, which may cause a seemingly endless loop while allocating free nids when running the generic/269 case of the fstests suite. Signed-off-by: Chao Yu [Jaegeuk Kim: add update_inode_page to fix kernel panic] Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1fbebcb33a9d..d32fd0343eae 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -387,6 +387,8 @@ retry: f2fs_lock_op(sbi); err = remove_inode_page(inode); f2fs_unlock_op(sbi); + if (err == -ENOENT) + err = 0; } /* give more chances, if ENOMEM case */ @@ -427,6 +429,18 @@ void handle_failed_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct node_info ni; + /* + * clear nlink of inode in order to release resource of inode + * immediately. + */ + clear_nlink(inode); + + /* + * we must call this to avoid inode being remained as dirty, resulting + * in a panic when flushing dirty inodes in gdirty_list. + */ + update_inode_page(inode); + /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); From e18c2624506f44b87f505adc36b78782eacd8628 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Oct 2016 19:28:29 +0800 Subject: [PATCH 1067/3220] f2fs: split free nid list commit b8559dc242d1d47dcf99660a4d6afded727e0cc0 upstream. During free nid allocation, in order to do preallocation, we tag a free nid entry as allocated and still leave it in the free nid list, so other allocators that want to grab free nids have to traverse the free nid list for lookup. This becomes an overhead when free nids are allocated intensively by multiple threads. This patch splits the free nid list into two lists, {free,alloc}_nid_list, to keep free nids and preallocated free nids separately; after that, the traversal latency is gone. It also splits nid_cnt for separate statistics. Additionally, introduce __insert_nid_to_list and __remove_nid_from_list for cleanup.
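A rough userspace model of the resulting two-list scheme (a pthread mutex and a hand-rolled singly linked list stand in for the kernel's spinlock and list_head; illustrative only, not the kernel code). The point is that allocation now pops the head of the free list in O(1) instead of scanning past preallocated NID_ALLOC entries:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

enum nid_list { FREE_NID_LIST, ALLOC_NID_LIST, MAX_NID_LIST };

struct free_nid { int nid; struct free_nid *next; };

static struct free_nid *nid_list[MAX_NID_LIST];
static unsigned int nid_cnt[MAX_NID_LIST];
static pthread_mutex_t nid_list_lock = PTHREAD_MUTEX_INITIALIZER;

static void push(enum nid_list l, struct free_nid *i)
{
	i->next = nid_list[l];
	nid_list[l] = i;
	nid_cnt[l]++;
}

/* grab one nid: pop the free-list head, park it on the alloc list */
static int alloc_nid(int *nid)
{
	struct free_nid *i;

	pthread_mutex_lock(&nid_list_lock);
	i = nid_list[FREE_NID_LIST];
	if (!i) {
		pthread_mutex_unlock(&nid_list_lock);
		return 0;	/* caller would go build more free nids */
	}
	nid_list[FREE_NID_LIST] = i->next;
	nid_cnt[FREE_NID_LIST]--;
	push(ALLOC_NID_LIST, i);	/* preallocated: out of the scan path */
	*nid = i->nid;
	pthread_mutex_unlock(&nid_list_lock);
	return 1;
}

int main(void)
{
	int n, nid;

	for (n = 0; n < 4; n++) {	/* single-threaded setup, no lock needed */
		struct free_nid *i = malloc(sizeof(*i));

		i->nid = n + 3;		/* low nids are reserved in f2fs */
		push(FREE_NID_LIST, i);
	}
	while (alloc_nid(&nid))
		printf("allocated nid %d (free %u, alloc %u)\n",
		       nid, nid_cnt[FREE_NID_LIST], nid_cnt[ALLOC_NID_LIST]);
	return 0;
}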
Signed-off-by: Chao Yu [Jaegeuk Kim: modify f2fs_bug_on to avoid needless branches] Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 11 ++-- fs/f2fs/f2fs.h | 14 +++-- fs/f2fs/node.c | 136 +++++++++++++++++++++++++++------------------ fs/f2fs/node.h | 11 ++-- fs/f2fs/shrinker.c | 4 +- 5 files changed, 108 insertions(+), 68 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 1c35e80732e0..5cab80dfa4dc 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -74,7 +74,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; - si->fnids = NM_I(sbi)->fcnt; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) @@ -194,7 +195,9 @@ get_cache: si->cache_mem += sizeof(struct flush_cmd_control); /* free nids */ - si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid); + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + + NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) * + sizeof(struct free_nid); si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); @@ -324,8 +327,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d\n", - si->fnids); + seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", + si->free_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 45d1e4522760..cec025852c22 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -529,6 +529,12 @@ static inline void __try_update_largest_extent(struct inode *inode, } } +enum nid_list { + FREE_NID_LIST, + ALLOC_NID_LIST, + MAX_NID_LIST, +}; + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ @@ -548,9 +554,9 @@ struct f2fs_nm_info { /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ - struct list_head free_nid_list; /* a list for free nids */ - spinlock_t free_nid_list_lock; /* protect free nid list */ - unsigned int fcnt; /* the number of free node id */ + struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */ + unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ + spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ /* for checkpoint */ @@ -2214,7 +2220,7 @@ struct f2fs_stat_info { s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; s64 inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, fnids; + int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; int bg_gc, wb_bios; int inline_xattr, inline_inode, inline_dir, orphans; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d2ba37a84f8e..5bb2fa324e68 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -45,8 +45,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) * give 25%, 25%, 50%, 50%, 50% memory for each components respectively */ if (type == FREE_NIDS) { - 
mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> - PAGE_SHIFT; + mem_size = (nm_i->nid_cnt[FREE_NID_LIST] * + sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> @@ -1699,10 +1699,31 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, struct free_nid *i) { - list_del(&i->list); radix_tree_delete(&nm_i->free_nid_root, i->nid); } +static void __insert_nid_to_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]++; + list_add_tail(&i->list, &nm_i->nid_list[list]); +} + +static void __remove_nid_from_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]--; + list_del(&i->list); +} + static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1733,33 +1754,33 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) return 0; } - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); kmem_cache_free(free_nid_slab, i); return 0; } - list_add_tail(&i->list, &nm_i->free_nid_list); - nm_i->fcnt++; - spin_unlock(&nm_i->free_nid_list_lock); + __insert_nid_to_list(sbi, i, FREE_NID_LIST); + spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); return 1; } -static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; bool need_free = false; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; need_free = true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1798,7 +1819,7 @@ void __build_free_nids(struct f2fs_sb_info *sbi) nid_t nid = nm_i->next_scan_nid; /* Enough entries */ - if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK) + if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; /* readahead nat pages to be scanned */ @@ -1834,7 +1855,7 @@ void __build_free_nids(struct f2fs_sb_info *sbi) if (addr == NULL_ADDR) add_free_nid(sbi, nid, true); else - remove_free_nid(nm_i, nid); + remove_free_nid(sbi, nid); } up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); @@ -1867,23 +1888,22 @@ retry: if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) return false; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); /* We should not use stale free nids created by build_free_nids */ - if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); - list_for_each_entry(i, &nm_i->free_nid_list, list) - if (i->state == NID_NEW) - break; - - f2fs_bug_on(sbi, i->state != NID_NEW); + if 
(nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); + i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], + struct free_nid, list); *nid = i->nid; + + __remove_nid_from_list(sbi, i, FREE_NID_LIST); i->state = NID_ALLOC; - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + __insert_nid_to_list(sbi, i, ALLOC_NID_LIST); + spin_unlock(&nm_i->nid_list_lock); return true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ build_free_nids(sbi); @@ -1898,11 +1918,12 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); __del_from_free_nid_list(nm_i, i); - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); } @@ -1919,17 +1940,20 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) if (!nid) return; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i); + + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); + if (!available_free_memory(sbi, FREE_NIDS)) { __del_from_free_nid_list(nm_i, i); need_free = true; } else { i->state = NID_NEW; - nm_i->fcnt++; + __insert_nid_to_list(sbi, i, FREE_NID_LIST); } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1941,24 +1965,26 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; - if (nm_i->fcnt <= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) return 0; if (!mutex_trylock(&nm_i->build_lock)) return 0; - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS) + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST], + list) { + if (nr_shrink <= 0 || + nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) break; - if (i->state == NID_ALLOC) - continue; + + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); + kmem_cache_free(free_nid_slab, i); - nm_i->fcnt--; nr_shrink--; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); mutex_unlock(&nm_i->build_lock); return nr - nr_shrink; @@ -2014,7 +2040,7 @@ recover_xnid: if (unlikely(!inc_valid_node_count(sbi, inode))) f2fs_bug_on(sbi, 1); - remove_free_nid(NM_I(sbi), new_xnid); + remove_free_nid(sbi, new_xnid); get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); @@ -2044,7 +2070,7 @@ retry: } /* Should not use this inode from free nid list */ - remove_free_nid(NM_I(sbi), ino); + remove_free_nid(sbi, ino); if (!PageUptodate(ipage)) SetPageUptodate(ipage); @@ -2278,20 +2304,22 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; - nm_i->fcnt = 0; + nm_i->nid_cnt[FREE_NID_LIST] = 0; + nm_i->nid_cnt[ALLOC_NID_LIST] = 0; 
nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); - INIT_LIST_HEAD(&nm_i->free_nid_list); + INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]); + INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]); INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); mutex_init(&nm_i->build_lock); - spin_lock_init(&nm_i->free_nid_list_lock); + spin_lock_init(&nm_i->nid_list_lock); init_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); @@ -2336,17 +2364,19 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) return; /* destroy free nid list */ - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - f2fs_bug_on(sbi, i->state == NID_ALLOC); + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], + list) { + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); } - f2fs_bug_on(sbi, nm_i->fcnt); - spin_unlock(&nm_i->free_nid_list_lock); + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]); + f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]); + f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST])); + spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ down_write(&nm_i->nat_tree_lock); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index cfdcf98516a1..e7997e240366 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -169,14 +169,15 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *fnid; - spin_lock(&nm_i->free_nid_list_lock); - if (nm_i->fcnt <= 0) { - spin_unlock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); + if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) { + spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); + fnid = list_entry(nm_i->nid_list[FREE_NID_LIST].next, + struct free_nid, list); *nid = fnid->nid; - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); } /* diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 46c915425923..ec539f407cc4 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -26,8 +26,8 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->fcnt > MAX_FREE_NIDS) - return NM_I(sbi)->fcnt - MAX_FREE_NIDS; + if (NM_I(sbi)->nid_cnt[FREE_NID_LIST] > MAX_FREE_NIDS) + return NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; return 0; } From 6a248819a280ff0c72ef4d9943ead88d7d6de0cb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Oct 2016 10:09:59 -0700 Subject: [PATCH 1068/3220] f2fs: clean up free nid list operations commit eb0aa4b80784b8551bd5be577024e067bc83ef94 upstream. This patch cleans up to use consistent free nid list ops. 
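The cleanup folds the radix-tree maintenance into the list helpers themselves, keyed by new/reuse flags, so callers can no longer update one structure and forget the other. A simplified model of that idea (not the kernel API: a boolean array stands in for the free_nid_root radix tree and the list bookkeeping is elided):

#include <stdbool.h>
#include <stdio.h>

#define NR_NIDS 16

static bool in_tree[NR_NIDS];	/* stands in for the free_nid_root radix tree */

/* add to a list; insert into the lookup structure only for brand-new nids */
static int insert_nid(int nid, bool new)
{
	if (new) {
		if (in_tree[nid])
			return -17;	/* -EEXIST, as radix_tree_insert() reports */
		in_tree[nid] = true;
	}
	/* ...list_add_tail() and nid_cnt[list]++ would go here... */
	return 0;
}

/* remove from a list; keep the lookup entry if the nid moves between lists */
static void remove_nid(int nid, bool reuse)
{
	/* ...list_del() and nid_cnt[list]-- would go here... */
	if (!reuse)
		in_tree[nid] = false;	/* as radix_tree_delete() would */
}

int main(void)
{
	insert_nid(5, true);	/* add_free_nid(): brand-new entry */
	remove_nid(5, true);	/* alloc_nid(): moving free -> alloc */
	insert_nid(5, false);	/* still findable while allocated */
	printf("nid 5 lookup: %d\n", in_tree[5]);	/* prints 1 */
	remove_nid(5, false);	/* alloc_nid_done(): gone for good */
	printf("nid 5 lookup: %d\n", in_tree[5]);	/* prints 0 */
	return 0;
}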
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 56 +++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5bb2fa324e68..ef5357c7af24 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1696,25 +1696,26 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, return radix_tree_lookup(&nm_i->free_nid_root, n); } -static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, - struct free_nid *i) -{ - radix_tree_delete(&nm_i->free_nid_root, i->nid); -} - -static void __insert_nid_to_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list) +static int __insert_nid_to_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list, bool new) { struct f2fs_nm_info *nm_i = NM_I(sbi); + if (new) { + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) + return err; + } + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : i->state != NID_ALLOC); nm_i->nid_cnt[list]++; list_add_tail(&i->list, &nm_i->nid_list[list]); + return 0; } static void __remove_nid_from_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list) + struct free_nid *i, enum nid_list list, bool reuse) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1722,6 +1723,8 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, i->state != NID_ALLOC); nm_i->nid_cnt[list]--; list_del(&i->list); + if (!reuse) + radix_tree_delete(&nm_i->free_nid_root, i->nid); } static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) @@ -1729,6 +1732,7 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; + int err; if (!available_free_memory(sbi, FREE_NIDS)) return -1; @@ -1755,15 +1759,13 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) } spin_lock(&nm_i->nid_list_lock); - if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { - spin_unlock(&nm_i->nid_list_lock); - radix_tree_preload_end(); + err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); + spin_unlock(&nm_i->nid_list_lock); + radix_tree_preload_end(); + if (err) { kmem_cache_free(free_nid_slab, i); return 0; } - __insert_nid_to_list(sbi, i, FREE_NID_LIST); - spin_unlock(&nm_i->nid_list_lock); - radix_tree_preload_end(); return 1; } @@ -1776,8 +1778,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); need_free = true; } spin_unlock(&nm_i->nid_list_lock); @@ -1897,9 +1898,9 @@ retry: struct free_nid, list); *nid = i->nid; - __remove_nid_from_list(sbi, i, FREE_NID_LIST); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); i->state = NID_ALLOC; - __insert_nid_to_list(sbi, i, ALLOC_NID_LIST); + __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); return true; } @@ -1921,8 +1922,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); @@ 
-1944,14 +1944,13 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); - if (!available_free_memory(sbi, FREE_NIDS)) { - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); need_free = true; } else { + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, true); i->state = NID_NEW; - __insert_nid_to_list(sbi, i, FREE_NID_LIST); + __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); } spin_unlock(&nm_i->nid_list_lock); @@ -1978,9 +1977,7 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) break; - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); - + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); kmem_cache_free(free_nid_slab, i); nr_shrink--; } @@ -2367,8 +2364,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) spin_lock(&nm_i->nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], list) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); spin_lock(&nm_i->nid_list_lock); From e07f457ef777a1af0642b81af98b1e824748b265 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:35 +0800 Subject: [PATCH 1069/3220] f2fs: don't interrupt free nids building during nid allocation commit 3a2ad5672bb36ee9c07bab97dadc8b0f70d391f4 upstream. Let build_free_nids support sync/async methods. In the allocation flow of nids, we use the synchronous method, so that we can avoid looping in alloc_nid when free memory is low; in unblock_operations and f2fs_balance_fs_bg we use the asynchronous method, where a low-memory condition may interrupt us.
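A toy model of the sync/async split (standalone C with made-up thresholds; not the kernel code): the background path may bail out under memory pressure, while the allocation path always scans, so the alloc loop is guaranteed to make progress:

#include <stdbool.h>
#include <stdio.h>

static int free_nids;
static bool memory_tight = true;	/* assume pressure for the demo */

static void build_free_nids(bool sync)
{
	if (free_nids >= 8)		/* the "enough entries" check */
		return;
	if (!sync && memory_tight)	/* async callers may be interrupted */
		return;
	free_nids += 8;			/* scan NAT pages, add entries */
}

static bool alloc_nid(int *nid)
{
	while (!free_nids)
		build_free_nids(true);	/* sync: guaranteed progress, no spin */
	*nid = --free_nids;
	return true;
}

int main(void)
{
	int nid;

	build_free_nids(false);		/* background call: gives up here */
	alloc_nid(&nid);		/* allocation path: still succeeds */
	printf("got nid %d\n", nid);
	return 0;
}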
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 22 ++++++++++------------ fs/f2fs/segment.c | 2 +- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 63ca342a3cc8..1d273f51bc1c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -990,7 +990,7 @@ static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); - build_free_nids(sbi); + build_free_nids(sbi, false); f2fs_unlock_all(sbi); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cec025852c22..9e8d3c9af54a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2071,7 +2071,7 @@ void move_node_page(struct page *, int); int fsync_node_pages(struct f2fs_sb_info *, struct inode *, struct writeback_control *, bool); int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); -void build_free_nids(struct f2fs_sb_info *); +void build_free_nids(struct f2fs_sb_info *, bool); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ef5357c7af24..5800a1082fe8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1734,9 +1734,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct nat_entry *ne; int err; - if (!available_free_memory(sbi, FREE_NIDS)) - return -1; - /* 0 nid should not be used */ if (unlikely(nid == 0)) return 0; @@ -1804,14 +1801,12 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); - if (blk_addr == NULL_ADDR) { - if (add_free_nid(sbi, start_nid, true) < 0) - break; - } + if (blk_addr == NULL_ADDR) + add_free_nid(sbi, start_nid, true); } } -void __build_free_nids(struct f2fs_sb_info *sbi) +void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1823,6 +1818,9 @@ void __build_free_nids(struct f2fs_sb_info *sbi) if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; + if (!sync && !available_free_memory(sbi, FREE_NIDS)) + return; + /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); @@ -1865,10 +1863,10 @@ void __build_free_nids(struct f2fs_sb_info *sbi) nm_i->ra_nid_pages, META_NAT, false); } -void build_free_nids(struct f2fs_sb_info *sbi) +void build_free_nids(struct f2fs_sb_info *sbi, bool sync) { mutex_lock(&NM_I(sbi)->build_lock); - __build_free_nids(sbi); + __build_free_nids(sbi, sync); mutex_unlock(&NM_I(sbi)->build_lock); } @@ -1907,7 +1905,7 @@ retry: spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - build_free_nids(sbi); + build_free_nids(sbi, true); goto retry; } @@ -2344,7 +2342,7 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi); + build_free_nids(sbi, true); return 0; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 75477ec6c535..48903702de27 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -380,7 +380,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, FREE_NIDS)) try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi); + build_free_nids(sbi, false); /* checkpoint is the only way to shrink partial cached entries */ if 
(!available_free_memory(sbi, NAT_ENTRIES) || From 9f99694bb7e0272568e804e96fb4ae5772acc198 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:36 +0800 Subject: [PATCH 1070/3220] f2fs: avoid casted negative value as shrink count commit 02110a4fd53164db7cce3bb2780dce4d6c4e058f upstream. This patch makes sure the shrink count is non-negative, instead of a negative value that would be cast to a huge unsigned number. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/shrinker.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index ec539f407cc4..5c60fc28ec75 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -21,14 +21,16 @@ static unsigned int shrinker_run_no; static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + + return count > 0 ? count : 0; } static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->nid_cnt[FREE_NID_LIST] > MAX_FREE_NIDS) - return NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; - return 0; + long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; + + return count > 0 ? count : 0; } static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) From beb74f7757f803290f80b7d1dd5b85e64c28d223 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 13:28:05 -0700 Subject: [PATCH 1071/3220] f2fs: count dirty inodes to flush node pages during checkpoint commit b9610bdfcbdbb6017802ec6d1e073f445c98157d upstream. If there are a lot of dirty inodes, we need to flush all of them when doing a checkpoint. So, we need to count them when checking for enough free space. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index fecb856ad874..762743988426 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -471,11 +471,12 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (test_opt(sbi, LFS)) return false; - return free_sections(sbi) <= (node_secs + 2 * dent_secs + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + reserved_sections(sbi) + 1); } @@ -484,14 +485,14 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - - node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; return (free_sections(sbi) + freed) <= - (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed); + (node_secs + 2 * dent_secs + imeta_secs + + reserved_sections(sbi) + needed); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) From 92c9dec3429503a3cd84ef5fa121c8782bf58af0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 13:30:31 -0700 Subject: [PATCH 1072/3220] f2fs: call f2fs_balance_fs for setattr commit 15d04354555fdfe8005e1365009e349148fb5f90 upstream. If an inode becomes dirty, we need to check the number of dirty inodes to decide whether a further checkpoint is required.
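A back-of-the-envelope model of the budgeting the two patches above tighten (standalone C with made-up numbers; the real check also involves segment geometry and flags): dirty inodes become dirty node pages at checkpoint time, so counting them as imeta sections can tip the filesystem into "needs cleaning" earlier:

#include <stdbool.h>
#include <stdio.h>

static bool has_not_enough_free_secs(int free, int node_secs, int dent_secs,
				     int imeta_secs, int reserved, int needed)
{
	return free <= node_secs + 2 * dent_secs + imeta_secs +
		       reserved + needed;
}

int main(void)
{
	/* ignoring imeta, 10 free sections look sufficient... */
	printf("ignore imeta: %d\n",
	       has_not_enough_free_secs(10, 3, 2, 0, 2, 0));	/* prints 0 */
	/* ...but 4 sections' worth of dirty inodes flip the verdict */
	printf("count imeta:  %d\n",
	       has_not_enough_free_secs(10, 3, 2, 4, 2, 0));	/* prints 1 */
	return 0;
}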
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c0774c98dce4..53ba384cb675 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -695,7 +695,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) err = f2fs_truncate(inode); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is @@ -724,6 +723,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } f2fs_mark_inode_dirty_sync(inode); + + /* inode change will produce dirty node pages flushed by checkpoint */ + f2fs_balance_fs(F2FS_I_SB(inode), true); + return err; } From c01ce254c71fd8e4a3539a5f46da77dc43859fe7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Oct 2016 15:36:31 -0700 Subject: [PATCH 1073/3220] f2fs: declare static function for __build_free_nids commit 3e7b5bbbef7f5eb8a19aa61b611c704bf8230937 upstream. This patch avoids a build warning. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5800a1082fe8..e1ce0b8438fc 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1806,7 +1806,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); From 04030d21a7b118fc9ba865223fb30017a0331cb6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Oct 2016 11:07:45 -0700 Subject: [PATCH 1074/3220] f2fs: use BIO_MAX_PAGES for bio allocation commit 664ba972df9b96942191db3068274cc1db899774 upstream. We don't need to allocate the bio partially in order to maximize sequential writes.
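A simplified model of the resulting writeback-quota logic (standalone C; a plain long stands in for wbc->nr_to_write, and 256 reflects the mainline BIO_MAX_PAGES value at the time): every batch is now sized by the fixed bio bound rather than derived from the device's max_sectors, with node writeback allowed a double batch:

#include <stdio.h>

#define BIO_MAX_PAGES 256	/* fixed block-layer bound */

enum page_type { DATA, NODE, META };

/* bump a writeback request up to a bio-sized batch; nodes get twice that */
static long nr_pages_to_write(enum page_type type, long *nr_to_write)
{
	long old = *nr_to_write;
	long desired = BIO_MAX_PAGES;

	if (type == NODE)
		desired <<= 1;
	*nr_to_write = desired;
	return desired - old;	/* pages added on top of the caller's quota */
}

int main(void)
{
	long quota = 64;

	printf("node batch pads by %ld\n", nr_pages_to_write(NODE, &quota));
	printf("new quota: %ld\n", quota);	/* 512 */
	return 0;
}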
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 4 +--- fs/f2fs/node.c | 3 +-- fs/f2fs/segment.c | 4 ++-- fs/f2fs/segment.h | 17 +++-------------- 5 files changed, 8 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1d273f51bc1c..1dffe86651be 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -226,7 +226,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true); + ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } static int f2fs_write_meta_page(struct page *page, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0cdabfce6c17..e92cf046d9ff 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -273,10 +273,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { - int bio_blocks = MAX_BIO_BLOCKS(sbi); - io->bio = __bio_alloc(sbi, fio->new_blkaddr, - bio_blocks, is_read); + BIO_MAX_PAGES, is_read); io->fio = *fio; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e1ce0b8438fc..389be7f6e07c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2099,7 +2099,6 @@ int restore_node_summary(struct f2fs_sb_info *sbi, struct f2fs_node *rn; struct f2fs_summary *sum_entry; block_t addr; - int bio_blocks = MAX_BIO_BLOCKS(sbi); int i, idx, last_offset, nrpages; /* scan the node segment */ @@ -2108,7 +2107,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, sum_entry = &sum->entries[0]; for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { - nrpages = min(last_offset - i, bio_blocks); + nrpages = min(last_offset - i, BIO_MAX_PAGES); /* readahead node pages */ ra_meta_pages(sbi, addr, nrpages, META_POR, true); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 48903702de27..ec4d74c26067 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2239,10 +2239,10 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; - int nrpages = MAX_BIO_BLOCKS(sbi) * 8; do { - readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); + readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 762743988426..89ab4301ef02 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -102,8 +102,6 @@ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) -#define MAX_BIO_BLOCKS(sbi) \ - ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES)) /* * indicate a block allocation direction: RIGHT and LEFT. @@ -696,13 +694,6 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) return false; } -static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - return SECTOR_TO_BLOCK(queue_max_sectors(q)); -} - /* * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. 
@@ -720,7 +711,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else if (type == NODE) return 8 * sbi->blocks_per_seg; else if (type == META) - return 8 * MAX_BIO_BLOCKS(sbi); + return 8 * BIO_MAX_PAGES; else return 0; } @@ -737,11 +728,9 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, return 0; nr_to_write = wbc->nr_to_write; - + desired = BIO_MAX_PAGES; if (type == NODE) - desired = 2 * max_hw_blocks(sbi); - else - desired = MAX_BIO_BLOCKS(sbi); + desired <<= 1; wbc->nr_to_write = desired; return desired - nr_to_write; From dafac77e8d20988a0a5f4ef81fd73fb79a2426e5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 10 Nov 2016 18:04:05 -0800 Subject: [PATCH 1075/3220] f2fs: Replace CURRENT_TIME_SEC with current_time() for inode timestamps commit 02027d42c3f747945f19111d3da2092ed2148ac8 upstream. This is for backport only. fs: Replace CURRENT_TIME_SEC with current_time() for inode timestamps Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 8 ++++---- fs/f2fs/f2fs.h | 22 ++++++++++++++++++++++ fs/f2fs/file.c | 8 ++++---- fs/f2fs/inline.c | 2 +- fs/f2fs/namei.c | 8 ++++---- fs/f2fs/xattr.c | 2 +- 6 files changed, 36 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7136dc1ade11..3b8ebec0450b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -312,7 +312,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, f2fs_dentry_kunmap(dir, page); set_page_dirty(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); f2fs_put_page(page, 1); } @@ -465,7 +465,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, f2fs_i_links_write(dir, true); clear_inode_flag(inode, FI_NEW_INODE); } - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (F2FS_I(dir)->i_current_depth != current_depth) @@ -683,7 +683,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, false); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { @@ -730,7 +730,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, kunmap(page); /* kunmap - pair of f2fs_find_entry */ set_page_dirty(page); - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (inode) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9e8d3c9af54a..1938fe457041 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -138,6 +138,28 @@ static inline void inode_nohighmem(struct inode *inode) mapping_set_gfp_mask(inode->i_mapping, GFP_USER); } +/** + * current_time - Return FS time + * @inode: inode. + * + * Return the current time truncated to the time granularity supported by + * the fs. + * + * Note that inode and inode->sb cannot be NULL. + * Otherwise, the function warns and returns time without truncation. 
+ */ +static inline struct timespec current_time(struct inode *inode) +{ + struct timespec now = current_kernel_time(); + + if (unlikely(!inode->i_sb)) { + WARN(1, "current_time() called with uninitialized super_block in the inode"); + return now; + } + + return timespec_trunc(now, inode->i_sb->s_time_gran); +} + /* * For checkpoint manager */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 53ba384cb675..04a3205d2934 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -632,7 +632,7 @@ int f2fs_truncate(struct inode *inode) if (err) return err; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -708,7 +708,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; } - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); } } @@ -1402,7 +1402,7 @@ static long f2fs_fallocate(struct file *file, int mode, } if (!ret) { - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1494,7 +1494,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) fi->i_flags = flags; inode_unlock(inode); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); out: mnt_drop_write_file(filp); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 56363c18489a..3b2f88788e02 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -591,7 +591,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); f2fs_put_page(page, 1); - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (inode) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 0f071a70522d..ae29726afff0 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -46,7 +46,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_generation = sbi->s_next_generation++; err = insert_inode_locked(inode); @@ -182,7 +182,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_balance_fs(sbi, true); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); ihold(inode); set_inode_flag(inode, FI_INC_LINK); @@ -720,7 +720,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_set_link(new_dir, new_entry, new_page, old_inode); - new_inode->i_ctime = CURRENT_TIME; + new_inode->i_ctime = current_time(new_inode); down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) f2fs_i_links_write(new_inode, false); @@ -774,7 +774,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, file_set_enc_name(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - old_inode->i_ctime = CURRENT_TIME; + old_inode->i_ctime = current_time(old_inode); f2fs_mark_inode_dirty_sync(old_inode); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 69c6bb9cf207..3a42405b6515 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -588,7 +588,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (is_inode_flag_set(inode, FI_ACL_MODE)) { inode->i_mode = 
F2FS_I(inode)->i_acl_mode; - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); clear_inode_flag(inode, FI_ACL_MODE); } if (index == F2FS_XATTR_INDEX_ENCRYPTION && From 0ef31c7bfa50fba5a0fb63dff3f0bef28d961427 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 11:51:23 -0700 Subject: [PATCH 1076/3220] f2fs: keep dirty inodes selectively for checkpoint commit 7c45729a4d6d1c90879e6c5c2df325c2f6db7191 upstream. This is to avoid no free segment bug during checkpoint caused by a number of dirty inodes. The case was reported by Chao like this. 1. mount with lazytime option 2. fill 4k file until disk is full 3. sync filesystem 4. read all files in the image 5. umount In this case, we actually don't need to flush dirty inode to inode page during checkpoint. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 2 +- fs/f2fs/dir.c | 6 +++--- fs/f2fs/extent_cache.c | 2 +- fs/f2fs/f2fs.h | 26 +++++++++++++------------- fs/f2fs/file.c | 9 +++++---- fs/f2fs/inline.c | 2 +- fs/f2fs/inode.c | 7 ++++--- fs/f2fs/namei.c | 6 +++--- fs/f2fs/super.c | 29 ++++++++++++++++------------- fs/f2fs/xattr.c | 4 ++-- 10 files changed, 49 insertions(+), 44 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 4a34040932e9..a45d1f4b7b0f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -387,7 +387,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) return error; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (default_acl) { error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 3b8ebec0450b..5594667c2f41 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -313,7 +313,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, set_page_dirty(page); dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); f2fs_put_page(page, 1); } @@ -466,7 +466,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(inode, FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (F2FS_I(dir)->i_current_depth != current_depth) f2fs_i_depth_write(dir, current_depth); @@ -731,7 +731,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2b06d4fcd954..4db44da7ef69 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -172,7 +172,7 @@ static void __drop_largest_extent(struct inode *inode, if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { largest->len = 0; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1938fe457041..0d2502fdf892 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -541,13 +541,13 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -extern void f2fs_mark_inode_dirty_sync(struct inode *); +extern void f2fs_mark_inode_dirty_sync(struct inode *, bool); static inline void __try_update_largest_extent(struct inode *inode, struct extent_tree *et, struct extent_node *en) { if (en->ei.len > 
et->largest.len) { et->largest = en->ei; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } @@ -1680,7 +1680,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, return; case FI_DATA_EXIST: case FI_INLINE_DOTS: - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } @@ -1707,7 +1707,7 @@ static inline void set_acl_inode(struct inode *inode, umode_t mode) { F2FS_I(inode)->i_acl_mode = mode; set_inode_flag(inode, FI_ACL_MODE); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static inline void f2fs_i_links_write(struct inode *inode, bool inc) @@ -1716,7 +1716,7 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) inc_nlink(inode); else drop_nlink(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_blocks_write(struct inode *inode, @@ -1727,7 +1727,7 @@ static inline void f2fs_i_blocks_write(struct inode *inode, inode->i_blocks = add ? inode->i_blocks + diff : inode->i_blocks - diff; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } @@ -1741,7 +1741,7 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) return; i_size_write(inode, i_size); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } @@ -1756,19 +1756,19 @@ static inline bool f2fs_skip_inode_update(struct inode *inode) static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) { F2FS_I(inode)->i_xattr_nid = xnid; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) { F2FS_I(inode)->i_pino = pino; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) @@ -1896,13 +1896,13 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise |= type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise &= ~type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline int f2fs_readonly(struct super_block *sb) @@ -2054,7 +2054,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ -int f2fs_inode_dirtied(struct inode *); +int f2fs_inode_dirtied(struct inode *, bool); void f2fs_inode_synced(struct inode *); int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 04a3205d2934..ce38a350fb38 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -265,7 +265,7 @@ sync_nodes: } if (need_inode_block_update(sbi, ino)) { - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); f2fs_write_inode(inode, NULL); goto sync_nodes; } @@ -633,7 +633,7 @@ int f2fs_truncate(struct inode *inode) return err; inode->i_mtime = inode->i_ctime = current_time(inode); - 
f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); return 0; } @@ -722,7 +722,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } } - f2fs_mark_inode_dirty_sync(inode); + /* update attributes only */ + f2fs_mark_inode_dirty_sync(inode, false); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); @@ -1403,7 +1404,7 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3b2f88788e02..b859d5e377ae 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -592,7 +592,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_put_page(page, 1); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d32fd0343eae..bfa512dde4ab 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -19,10 +19,11 @@ #include -void f2fs_mark_inode_dirty_sync(struct inode *inode) +void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) { - if (f2fs_inode_dirtied(inode)) + if (f2fs_inode_dirtied(inode, sync)) return; + mark_inode_dirty_sync(inode); } @@ -43,7 +44,7 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index ae29726afff0..7f2fdb154180 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -775,7 +775,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); - f2fs_mark_inode_dirty_sync(old_inode); + f2fs_mark_inode_dirty_sync(old_inode, false); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); @@ -935,7 +935,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(old_dir, old_nlink > 0); up_write(&F2FS_I(old_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(old_dir); + f2fs_mark_inode_dirty_sync(old_dir, false); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); @@ -950,7 +950,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(new_dir, new_nlink > 0); up_write(&F2FS_I(new_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(new_dir); + f2fs_mark_inode_dirty_sync(new_dir, false); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 006138a6c5ab..23190a94840b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -620,24 +620,25 @@ static int f2fs_drop_inode(struct inode *inode) return generic_drop_inode(inode); } -int f2fs_inode_dirtied(struct inode *inode) +int f2fs_inode_dirtied(struct inode *inode, bool sync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret = 0; spin_lock(&sbi->inode_lock[DIRTY_META]); if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { - spin_unlock(&sbi->inode_lock[DIRTY_META]); - return 1; + ret = 1; + } else { + set_inode_flag(inode, FI_DIRTY_INODE); + 
stat_inc_dirty_inode(sbi, DIRTY_META); } - - set_inode_flag(inode, FI_DIRTY_INODE); - list_add_tail(&F2FS_I(inode)->gdirty_list, + if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) { + list_add_tail(&F2FS_I(inode)->gdirty_list, &sbi->inode_list[DIRTY_META]); - inc_page_count(sbi, F2FS_DIRTY_IMETA); - stat_inc_dirty_inode(sbi, DIRTY_META); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + } spin_unlock(&sbi->inode_lock[DIRTY_META]); - - return 0; + return ret; } void f2fs_inode_synced(struct inode *inode) @@ -649,10 +650,12 @@ void f2fs_inode_synced(struct inode *inode) spin_unlock(&sbi->inode_lock[DIRTY_META]); return; } - list_del_init(&F2FS_I(inode)->gdirty_list); + if (!list_empty(&F2FS_I(inode)->gdirty_list)) { + list_del_init(&F2FS_I(inode)->gdirty_list); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + } clear_inode_flag(inode, FI_DIRTY_INODE); clear_inode_flag(inode, FI_AUTO_RECOVER); - dec_page_count(sbi, F2FS_DIRTY_IMETA); stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); spin_unlock(&sbi->inode_lock[DIRTY_META]); } @@ -676,7 +679,7 @@ static void f2fs_dirty_inode(struct inode *inode, int flags) if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) clear_inode_flag(inode, FI_AUTO_RECOVER); - f2fs_inode_dirtied(inode); + f2fs_inode_dirtied(inode, false); } static void f2fs_i_callback(struct rcu_head *head) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3a42405b6515..1c4d5e39586c 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -151,7 +151,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); return 0; } @@ -594,7 +594,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: From a15f017e8a270f7c8d0f4d4599229a62454c0c7a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Oct 2016 18:27:56 -0700 Subject: [PATCH 1077/3220] f2fs: make clean inodes when flushing inode page commit 18340edc8da20b0d399eb25ba4bb631b27652f46 upstream. This patch tries to make more clean inodes when flushing dirty inodes in checkpoint. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 +++++- fs/f2fs/inode.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1dffe86651be..ed79757c36e0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -924,7 +924,11 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[DIRTY_META]); if (inode) { - update_inode_page(inode); + sync_inode_metadata(inode, 0); + + /* it's on eviction */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) + update_inode_page(inode); iput(inode); } }; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bfa512dde4ab..7b5e402f0a72 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -339,7 +339,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. 
*/ - if (update_inode_page(inode)) + if (update_inode_page(inode) && wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; } From 3f137dda709e78201a92067f7f23b1747302dd9b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 20 Oct 2016 19:09:57 -0700 Subject: [PATCH 1078/3220] f2fs: remove percpu_count due to performance regression commit 35782b233f37e48ecc469d9c7232f3f6a7fad41a upstream. This patch removes percpu_count usage due to performance regression in iozone. Fixes: 523be8a6b3 ("f2fs: use percpu_counter for page counters") Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 12 ++++++------ fs/f2fs/f2fs.h | 12 ++++++------ fs/f2fs/super.c | 16 +++++----------- 3 files changed, 17 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 5cab80dfa4dc..678733c78f55 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -313,17 +313,17 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n", + seq_printf(s, " - inmem: %4d, wb_bios: %4d\n", si->inmem_pages, si->wb_bios); - seq_printf(s, " - nodes: %4lld in %4d\n", + seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n", + seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); - seq_printf(s, " - datas: %4lld in files:%4d\n", + seq_printf(s, " - datas: %4d in files:%4d\n", si->ndirty_data, si->ndirty_files); - seq_printf(s, " - meta: %4lld in %4d\n", + seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - imeta: %4lld\n", + seq_printf(s, " - imeta: %4d\n", si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0d2502fdf892..932c53f441db 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -872,7 +872,7 @@ struct f2fs_sb_info { atomic_t nr_wb_bios; /* # of writeback bios */ /* # of pages, see count_type */ - struct percpu_counter nr_pages[NR_COUNT_TYPE]; + atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; @@ -1286,7 +1286,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_inc(&sbi->nr_pages[count_type]); + atomic_inc(&sbi->nr_pages[count_type]); if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) return; @@ -1303,7 +1303,7 @@ static inline void inode_inc_dirty_pages(struct inode *inode) static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_dec(&sbi->nr_pages[count_type]); + atomic_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) @@ -1319,7 +1319,7 @@ static inline void inode_dec_dirty_pages(struct inode *inode) static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { - return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); + return atomic_read(&sbi->nr_pages[count_type]); } static inline s64 get_dirty_pages(struct inode *inode) @@ -2239,8 +2239,8 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - s64 ndirty_node, ndirty_dent, 
ndirty_meta, ndirty_data, ndirty_imeta; - s64 inmem_pages; + int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 23190a94840b..6034d51fc5fc 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -696,10 +696,6 @@ static void f2fs_destroy_inode(struct inode *inode) static void destroy_percpu_info(struct f2fs_sb_info *sbi) { - int i; - - for (i = 0; i < NR_COUNT_TYPE; i++) - percpu_counter_destroy(&sbi->nr_pages[i]); percpu_counter_destroy(&sbi->alloc_valid_block_count); percpu_counter_destroy(&sbi->total_valid_inode_count); } @@ -1450,6 +1446,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; + int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1474,6 +1471,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); + for (i = 0; i < NR_COUNT_TYPE; i++) + atomic_set(&sbi->nr_pages[i], 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); mutex_init(&sbi->wio_mutex[NODE]); @@ -1489,13 +1489,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) static int init_percpu_info(struct f2fs_sb_info *sbi) { - int i, err; - - for (i = 0; i < NR_COUNT_TYPE; i++) { - err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); - if (err) - return err; - } + int err; err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); if (err) From b3441f8c71f2e30b96a76ca007d51c2c2d6be354 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 2 Nov 2016 14:52:15 +0100 Subject: [PATCH 1079/3220] f2fs: hide a maybe-uninitialized warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 230436b3ef3fd7d4a1da19edf5e87bb2d74e0fc2 upstream. gcc is unsure about the use of last_ofs_in_node, which might happen without a prior initialization: fs/f2fs//git/arm-soc/fs/f2fs/data.c: In function ‘f2fs_map_blocks’: fs/f2fs/data.c:799:54: warning: ‘last_ofs_in_node’ may be used uninitialized in this function [-Wmaybe-uninitialized] if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { As pointed out by Chao Yu, the code is actually correct: 'prealloc' is only set if last_ofs_in_node has been set, and the two always get updated together. This initializes last_ofs_in_node to dn.ofs_in_node for each new dnode at the start of the 'next_block' loop, which at that point is a correct initialization as well. I assume that compilers that correctly track the contents of the variables and do not warn about the condition also figure out that they can eliminate the extra assignment here.
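A minimal sketch of the pattern gcc trips over (hypothetical names, not the f2fs code): the guard and the value are only ever written together, so the later use is safe, but the compiler cannot prove it:

	/* hypothetical illustration of a -Wmaybe-uninitialized false positive */
	int demo(int n)
	{
		int guard = 0;
		int last;			/* no unconditional initialization */
		int i;

		for (i = 0; i < n; i++) {
			if (i & 1) {
				guard = 1;	/* always set together... */
				last = i;	/* ...with the value */
			}
		}
		if (guard)			/* safe, but gcc may still warn here */
			return last;
		return 0;
	}

Initializing the value next to the guard's setup, as the patch below does for last_ofs_in_node alongside ofs_in_node, silences the warning without changing behavior.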
Fixes: 46008c6d4232 ("f2fs: support in batch multi blocks preallocation") Signed-off-by: Arnd Bergmann Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e92cf046d9ff..f03c34d3797f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -708,7 +708,7 @@ next_dnode: } prealloc = 0; - ofs_in_node = dn.ofs_in_node; + last_ofs_in_node = ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: From d4ec990d259694a05fc3cb38ef0b70f6159c3eb8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 1 Dec 2016 10:44:44 -0800 Subject: [PATCH 1080/3220] fs/crypto: catch up 4.9-rc6 commit d117b9acaeada0b243f31e0fe83e111fcc9a6644 upstream. Merge tag 'ext4_for_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4 Pull ext4 fixes from Ted Ts'o: "A security fix (so a maliciously corrupted file system image won't panic the kernel) and some fixes for CONFIG_VMAP_STACK" Signed-off-by: Jaegeuk Kim --- fs/crypto/crypto.c | 26 ++++---- fs/crypto/fname.c | 132 ++++++++++++++++++--------------------- fs/crypto/keyinfo.c | 85 ++++++++++++++++--------- fs/crypto/policy.c | 4 ++ fs/f2fs/dir.c | 6 +- fs/f2fs/namei.c | 6 +- include/linux/fscrypto.h | 24 ------- 7 files changed, 141 insertions(+), 142 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2fc8c43ce531..2d40ab9edc9f 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -28,7 +28,6 @@ #include #include #include -#include static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -128,11 +127,11 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) EXPORT_SYMBOL(fscrypt_get_ctx); /** - * fscrypt_complete() - The completion callback for page encryption - * @req: The asynchronous encryption request context - * @res: The result of the encryption operation + * page_crypt_complete() - completion callback for page crypto + * @req: The asynchronous cipher request context + * @res: The result of the cipher operation */ -static void fscrypt_complete(struct crypto_async_request *req, int res) +static void page_crypt_complete(struct crypto_async_request *req, int res) { struct fscrypt_completion_result *ecr = req->data; @@ -152,7 +151,10 @@ static int do_page_crypto(struct inode *inode, struct page *src_page, struct page *dest_page, gfp_t gfp_flags) { - u8 xts_tweak[FS_XTS_TWEAK_SIZE]; + struct { + __le64 index; + u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)]; + } xts_tweak; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; @@ -170,19 +172,17 @@ static int do_page_crypto(struct inode *inode, skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - fscrypt_complete, &ecr); + page_crypt_complete, &ecr); - BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index)); - memcpy(xts_tweak, &index, sizeof(index)); - memset(&xts_tweak[sizeof(index)], 0, - FS_XTS_TWEAK_SIZE - sizeof(index)); + BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); + xts_tweak.index = cpu_to_le64(index); + memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, PAGE_SIZE, 0); sg_init_table(&src, 1); sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, - xts_tweak); + skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); 
else diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 5d6d49113efa..9b774f4b50c8 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -10,21 +10,16 @@ * This has not yet undergone a rigorous security audit. */ -#include -#include #include #include #include -static u32 size_round_up(size_t size, size_t blksize) -{ - return ((size + blksize - 1) / blksize) * blksize; -} - /** - * dir_crypt_complete() - + * fname_crypt_complete() - completion callback for filename crypto + * @req: The asynchronous cipher request context + * @res: The result of the cipher operation */ -static void dir_crypt_complete(struct crypto_async_request *req, int res) +static void fname_crypt_complete(struct crypto_async_request *req, int res) { struct fscrypt_completion_result *ecr = req->data; @@ -35,90 +30,80 @@ static void dir_crypt_complete(struct crypto_async_request *req, int res) } /** - * fname_encrypt() - + * fname_encrypt() - encrypt a filename * - * This function encrypts the input filename, and returns the length of the - * ciphertext. Errors are returned as negative numbers. We trust the caller to - * allocate sufficient memory to oname string. + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_encrypt(struct inode *inode, const struct qstr *iname, struct fscrypt_str *oname) { - u32 ciphertext_len; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; char iv[FS_CRYPTO_BLOCK_SIZE]; - struct scatterlist src_sg, dst_sg; + struct scatterlist sg; int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - char *workbuf, buf[32], *alloc_buf = NULL; - unsigned lim; + unsigned int lim; + unsigned int cryptlen; lim = inode->i_sb->s_cop->max_namelen(inode); if (iname->len <= 0 || iname->len > lim) return -EIO; - ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ? - FS_CRYPTO_BLOCK_SIZE : iname->len; - ciphertext_len = size_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len; + /* + * Copy the filename to the output buffer for encrypting in-place and + * pad it with the needed number of NUL bytes. 
+ */ + cryptlen = max_t(unsigned int, iname->len, FS_CRYPTO_BLOCK_SIZE); + cryptlen = round_up(cryptlen, padding); + cryptlen = min(cryptlen, lim); + memcpy(oname->name, iname->name, iname->len); + memset(oname->name + iname->len, 0, cryptlen - iname->len); - if (ciphertext_len <= sizeof(buf)) { - workbuf = buf; - } else { - alloc_buf = kmalloc(ciphertext_len, GFP_NOFS); - if (!alloc_buf) - return -ENOMEM; - workbuf = alloc_buf; - } + /* Initialize the IV */ + memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); - /* Allocate request */ + /* Set up the encryption request */ req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", __func__); - kfree(alloc_buf); + "%s: skcipher_request_alloc() failed\n", __func__); return -ENOMEM; } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - dir_crypt_complete, &ecr); + fname_crypt_complete, &ecr); + sg_init_one(&sg, oname->name, cryptlen); + skcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv); - /* Copy the input */ - memcpy(workbuf, iname->name, iname->len); - if (iname->len < ciphertext_len) - memset(workbuf + iname->len, 0, ciphertext_len - iname->len); - - /* Initialize IV */ - memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); - - /* Create encryption request */ - sg_init_one(&src_sg, workbuf, ciphertext_len); - sg_init_one(&dst_sg, oname->name, ciphertext_len); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + /* Do the encryption */ res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { + /* Request is being completed asynchronously; wait for it */ wait_for_completion(&ecr.completion); res = ecr.res; } - kfree(alloc_buf); skcipher_request_free(req); - if (res < 0) + if (res < 0) { printk_ratelimited(KERN_ERR "%s: Error (error code %d)\n", __func__, res); + return res; + } - oname->len = ciphertext_len; - return res; + oname->len = cryptlen; + return 0; } -/* - * fname_decrypt() - * This function decrypts the input filename, and returns - * the length of the plaintext. - * Errors are returned as negative numbers. - * We trust the caller to allocate sufficient memory to oname string. +/** + * fname_decrypt() - decrypt a filename + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_decrypt(struct inode *inode, const struct fscrypt_str *iname, @@ -146,7 +131,7 @@ static int fname_decrypt(struct inode *inode, } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - dir_crypt_complete, &ecr); + fname_crypt_complete, &ecr); /* Initialize IV */ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); @@ -168,7 +153,7 @@ static int fname_decrypt(struct inode *inode, } oname->len = strnlen(oname->name, iname->len); - return oname->len; + return 0; } static const char *lookup_table = @@ -231,9 +216,8 @@ u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) if (ci) padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - if (ilen < FS_CRYPTO_BLOCK_SIZE) - ilen = FS_CRYPTO_BLOCK_SIZE; - return size_round_up(ilen, padding); + ilen = max(ilen, (u32)FS_CRYPTO_BLOCK_SIZE); + return round_up(ilen, padding); } EXPORT_SYMBOL(fscrypt_fname_encrypted_size); @@ -279,6 +263,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user * space + * + * The caller must have allocated sufficient memory for the @oname string. 
+ * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, u32 hash, u32 minor_hash, @@ -287,13 +275,12 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, { const struct qstr qname = FSTR_TO_QSTR(iname); char buf[24]; - int ret; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (iname->len < FS_CRYPTO_BLOCK_SIZE) @@ -303,9 +290,9 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, return fname_decrypt(inode, iname, oname); if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { - ret = digest_encode(iname->name, iname->len, oname->name); - oname->len = ret; - return ret; + oname->len = digest_encode(iname->name, iname->len, + oname->name); + return 0; } if (hash) { memcpy(buf, &hash, 4); @@ -315,15 +302,18 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, } memcpy(buf + 8, iname->name + iname->len - 16, 16); oname->name[0] = '_'; - ret = digest_encode(buf, 24, oname->name + 1); - oname->len = ret + 1; - return ret + 1; + oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk * space + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, @@ -333,7 +323,7 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (inode->i_crypt_info) return fname_encrypt(inode, iname, oname); @@ -367,10 +357,10 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (dir->i_crypt_info) { ret = fscrypt_fname_alloc_buffer(dir, iname->len, &fname->crypto_buf); - if (ret < 0) + if (ret) return ret; ret = fname_encrypt(dir, iname, &fname->crypto_buf); - if (ret < 0) + if (ret) goto errout; fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 1ac263eddc4e..67fb6d8876d0 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -8,11 +8,8 @@ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. 
*/ -#include #include -#include #include -#include #include static void derive_crypt_complete(struct crypto_async_request *req, int rc) @@ -139,6 +136,38 @@ out: return res; } +static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, + const char **cipher_str_ret, int *keysize_ret) +{ + if (S_ISREG(inode->i_mode)) { + if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) { + *cipher_str_ret = "xts(aes)"; + *keysize_ret = FS_AES_256_XTS_KEY_SIZE; + return 0; + } + pr_warn_once("fscrypto: unsupported contents encryption mode " + "%d for inode %lu\n", + ci->ci_data_mode, inode->i_ino); + return -ENOKEY; + } + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) { + *cipher_str_ret = "cts(cbc(aes))"; + *keysize_ret = FS_AES_256_CTS_KEY_SIZE; + return 0; + } + pr_warn_once("fscrypto: unsupported filenames encryption mode " + "%d for inode %lu\n", + ci->ci_filename_mode, inode->i_ino); + return -ENOKEY; + } + + pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n", + (inode->i_mode & S_IFMT), inode->i_ino); + return -ENOKEY; +} + static void put_crypt_info(struct fscrypt_info *ci) { if (!ci) @@ -155,8 +184,8 @@ int get_crypt_info(struct inode *inode) struct fscrypt_context ctx; struct crypto_skcipher *ctfm; const char *cipher_str; - u8 raw_key[FS_MAX_KEY_SIZE]; - u8 mode; + int keysize; + u8 *raw_key = NULL; int res; res = fscrypt_initialize(); @@ -179,13 +208,19 @@ retry: if (res < 0) { if (!fscrypt_dummy_context_enabled(inode)) return res; + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; ctx.flags = 0; } else if (res != sizeof(ctx)) { return -EINVAL; } - res = 0; + + if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + + if (ctx.flags & ~FS_POLICY_FLAGS_VALID) + return -EINVAL; crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS); if (!crypt_info) @@ -198,27 +233,20 @@ retry: crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); - if (S_ISREG(inode->i_mode)) - mode = crypt_info->ci_data_mode; - else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - mode = crypt_info->ci_filename_mode; - else - BUG(); - switch (mode) { - case FS_ENCRYPTION_MODE_AES_256_XTS: - cipher_str = "xts(aes)"; - break; - case FS_ENCRYPTION_MODE_AES_256_CTS: - cipher_str = "cts(cbc(aes))"; - break; - default: - printk_once(KERN_WARNING - "%s: unsupported key mode %d (ino %u)\n", - __func__, mode, (unsigned) inode->i_ino); - res = -ENOKEY; + res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize); + if (res) goto out; - } + + /* + * This cannot be a stack buffer because it is passed to the scatterlist + * crypto API as part of key derivation. 
+ */ + res = -ENOMEM; + raw_key = kmalloc(FS_MAX_KEY_SIZE, GFP_NOFS); + if (!raw_key) + goto out; + if (fscrypt_dummy_context_enabled(inode)) { memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); goto got_key; @@ -253,11 +281,12 @@ got_key: crypt_info->ci_ctfm = ctfm; crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode)); + res = crypto_skcipher_setkey(ctfm, raw_key, keysize); if (res) goto out; - memzero_explicit(raw_key, sizeof(raw_key)); + kzfree(raw_key); + raw_key = NULL; if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) { put_crypt_info(crypt_info); goto retry; @@ -268,7 +297,7 @@ out: if (res == -ENOKEY) res = 0; put_crypt_info(crypt_info); - memzero_explicit(raw_key, sizeof(raw_key)); + kzfree(raw_key); return res; } diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index ed115acb5dee..6865663aac69 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -109,6 +109,8 @@ int fscrypt_process_policy(struct file *filp, if (ret) return ret; + inode_lock(inode); + if (!inode_has_encryption_context(inode)) { if (!S_ISDIR(inode->i_mode)) ret = -EINVAL; @@ -127,6 +129,8 @@ int fscrypt_process_policy(struct file *filp, ret = -EINVAL; } + inode_unlock(inode); + mnt_drop_write_file(filp); return ret; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 5594667c2f41..210082783d5a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -814,12 +814,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (f2fs_encrypted_inode(d->inode)) { int save_len = fstr->len; - int ret; + int err; - ret = fscrypt_fname_disk_to_usr(d->inode, + err = fscrypt_fname_disk_to_usr(d->inode, (u32)de->hash_code, 0, &de_name, fstr); - if (ret < 0) + if (err) return true; de_name = *fstr; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 7f2fdb154180..468b2dbe6d34 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -451,7 +451,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, ostr.name = sd->encrypted_path; ostr.len = disk_link.len; err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); - if (err < 0) + if (err) goto err_out; sd->len = cpu_to_le16(ostr.len); @@ -1047,7 +1047,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook goto errout; res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (res < 0) + if (res) goto errout; /* this is broken symlink case */ @@ -1059,7 +1059,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook paddr = pstr.name; /* Null-terminate the name */ - paddr[res] = '\0'; + paddr[pstr.len] = '\0'; put_page(cpage); return *cookie = paddr; diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index 76cff18bb032..ff8b11b26f31 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -111,23 +111,6 @@ struct fscrypt_completion_result { struct fscrypt_completion_result ecr = { \ COMPLETION_INITIALIZER((ecr).completion), 0 } -static inline int fscrypt_key_size(int mode) -{ - switch (mode) { - case FS_ENCRYPTION_MODE_AES_256_XTS: - return FS_AES_256_XTS_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_GCM: - return FS_AES_256_GCM_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_CBC: - return FS_AES_256_CBC_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_CTS: - return FS_AES_256_CTS_KEY_SIZE; - default: - BUG(); - } - return 0; -} - #define FS_FNAME_NUM_SCATTER_ENTRIES 4 #define FS_CRYPTO_BLOCK_SIZE 16 #define FS_FNAME_CRYPTO_DIGEST_SIZE 32 @@ 
-202,13 +185,6 @@ static inline bool fscrypt_valid_filenames_enc_mode(u32 mode) return (mode == FS_ENCRYPTION_MODE_AES_256_CTS); } -static inline u32 fscrypt_validate_encryption_key_size(u32 mode, u32 size) -{ - if (size == fscrypt_key_size(mode)) - return size; - return 0; -} - static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') From 7d2eab1921a170ba724bdaf5247d76336ed5472e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 29 Oct 2016 18:46:34 +0800 Subject: [PATCH 1081/3220] f2fs: report error of f2fs_fill_dentries commit ed6bd4b146527e7c6934e3582c47d7b857802676 upstream. Report errors from f2fs_fill_dentries to ->iterate_shared; otherwise, when an error occurs, the user may see only part of the dirents in the target directory without any hint. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 21 ++++++++++++--------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/inline.c | 6 ++++-- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 210082783d5a..4436079dbf0c 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -785,7 +785,7 @@ bool f2fs_empty_dir(struct inode *dir) return true; } -bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr) { unsigned char d_type = DT_UNKNOWN; @@ -820,7 +820,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, (u32)de->hash_code, 0, &de_name, fstr); if (err) - return true; + return err; de_name = *fstr; fstr->len = save_len; @@ -828,12 +828,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->ino), d_type)) - return true; + return 1; bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } - return false; + return 0; } static int f2fs_readdir(struct file *file, struct dir_context *ctx) @@ -872,17 +872,21 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); - if (err == -ENOENT) + if (err == -ENOENT) { + err = 0; continue; - else + } else { goto out; + } } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + err = f2fs_fill_dentries(ctx, &d, + n * NR_DENTRY_IN_BLOCK, &fstr); + if (err) { kunmap(dentry_page); f2fs_put_page(dentry_page, 1); break; @@ -892,10 +896,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } - err = 0; out: fscrypt_fname_free_buffer(&fstr); - return err < 0 ?
err : 0; } static int f2fs_dir_open(struct inode *inode, struct file *filp) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 932c53f441db..4b13d70d716c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2014,7 +2014,7 @@ void set_de_type(struct f2fs_dir_entry *, umode_t); unsigned char get_de_type(struct f2fs_dir_entry *); struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); -bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, +int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, unsigned int, struct fscrypt_str *); void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index b859d5e377ae..b85987703d1e 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -629,6 +629,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + int err; if (ctx->pos == NR_INLINE_DENTRY) return 0; @@ -641,11 +642,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); - if (!f2fs_fill_dentries(ctx, &d, 0, fstr)) + err = f2fs_fill_dentries(ctx, &d, 0, fstr); + if (!err) ctx->pos = NR_INLINE_DENTRY; f2fs_put_page(ipage, 1); - return 0; + return err < 0 ? err : 0; } int f2fs_inline_data_fiemap(struct inode *inode, From a91b9fe273685a11e958a38f373798847e44c41d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 31 Oct 2016 14:01:41 -0700 Subject: [PATCH 1082/3220] f2fs: avoid infinite loop in the EIO case on recover_orphan_inodes commit 099228000eff6b25e0f76b276043cd65cd4eba5a upstream. This patch should fix an infinite loop case below. F2FS-fs : inject IO error in f2fs_read_end_io+0xf3/0x120 [f2fs] F2FS-fs (nvme0n1p1): recover_orphan_inode: orphan failed (ino=39ac1a), run fsck to fix. ... [] sync_meta_pages+0xae/0x270 [f2fs] [] ? flush_sit_entries+0x8d/0x960 [f2fs] [] write_checkpoint+0x361/0xf20 [f2fs] [] ? trace_hardirqs_on+0xd/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_sync_fs+0x85/0x190 [f2fs] [] f2fs_balance_fs_bg+0x7e/0x1c0 [f2fs] [] f2fs_write_node_pages+0x34/0x320 [f2fs] [] do_writepages+0x21/0x30 [] __writeback_single_inode+0x61/0x760 [] ? _raw_spin_unlock+0x27/0x40 [] writeback_single_inode+0xd5/0x190 [] write_inode_now+0x99/0xc0 [] iput+0x1f6/0x2c0 [] f2fs_fill_super+0xe0e/0x1300 [f2fs] [] ? sget_userns+0x4f4/0x530 [] mount_bdev+0x182/0x1b0 [] ? f2fs_commit_super+0x100/0x100 [f2fs] [] f2fs_mount+0x15/0x20 [f2fs] [] mount_fs+0x38/0x170 [] vfs_kern_mount+0x6b/0x160 [] do_mount+0x1be/0xd60 [] ? copy_mount_options+0xb7/0x220 [] SyS_mount+0x94/0xd0 [] entry_SYSCALL_64_fastpath+0x23/0xc6 Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6034d51fc5fc..e007c011ec53 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1890,6 +1890,13 @@ free_node_inode: mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). 
+ */ + truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); free_nm: From 97df49a0c31a5d14f1053ea8c737173e3f38653a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:44:59 +0900 Subject: [PATCH 1083/3220] f2fs: Add missing break in switch-case commit 487df616dec33231c99294b906d720d256a2de16 upstream. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e007c011ec53..4fd34e7bcf60 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -420,6 +420,7 @@ static int parse_options(struct super_block *sb, char *options) break; case Opt_nodiscard: clear_opt(sbi, DISCARD); + break; case Opt_noheap: set_opt(sbi, NOHEAP); break; From 22bbc1efdb5ecb7ad98e5085eb7b42830503f432 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:00 +0900 Subject: [PATCH 1084/3220] f2fs: Use generic zoned block device terminology commit 0bfd7a091c19132489a0f977b8dbf9f6b5ae0a1c upstream. SMR stands for "Shingled Magnetic Recording" which makes sense only for hard disk drives (spinning rust). The ZBC/ZAC standards enable management of SMR disks, but solid state drives may also support those standards. So rename the HMSMR feature to BLKZONED to avoid a HDD centric terminology. For the same reason, rename f2fs_sb_mounted_hmsmr to f2fs_sb_mounted_blkzoned. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 6 +++--- fs/f2fs/super.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f03c34d3797f..a60efb8fdd3a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -111,7 +111,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, { if (!is_read_io(rw)) { atomic_inc(&sbi->nr_wb_bios); - if (f2fs_sb_mounted_hmsmr(sbi->sb) && + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4b13d70d716c..5a563a9f3a52 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -103,7 +103,7 @@ struct f2fs_mount_info { }; #define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_HMSMR 0x0002 +#define F2FS_FEATURE_BLKZONED 0x0002 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -2470,9 +2470,9 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); } -static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb) +static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) { - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR); + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4fd34e7bcf60..3574d0620dc4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -974,7 +974,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, EXTENT_CACHE); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (f2fs_sb_mounted_hmsmr(sbi->sb)) { + if (f2fs_sb_mounted_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); set_opt(sbi, DISCARD); } else { From 1529b8f943a1082e8be64d63c7206b9d72073e1a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:01 +0900 Subject: [PATCH 1085/3220] f2fs: Check zoned block feature for host-managed zoned 
block devices commit d1b959c8770260b611b9a1f0c5e8b12b7cb5b9d2 upstream. The F2FS_FEATURE_BLKZONED feature indicates that the drive was formatted with zone alignment optimization. This is optional for host-aware devices, but mandatory for host-managed zoned block devices. So check that the feature is set in this latter case. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3574d0620dc4..4187e3b9a83e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1639,6 +1639,26 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* + * The BLKZONED feature indicates that the drive was formatted with + * zone alignment optimization. This is optional for host-aware + * devices, but mandatory for host-managed zoned block devices. + */ +#ifndef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_ERR, + "Zoned block device support is not enabled\n"); + goto free_sb_buf; + } +#else + if (bdev_zoned_model(sb->s_bdev) == BLK_ZONED_HM && + !f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_ERR, + "Zoned block device feature not enabled\n"); + goto free_sb_buf; + } +#endif + default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); From ecc252e7a4d71b2c4197a70f085a3c2435ff8273 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:02 +0900 Subject: [PATCH 1086/3220] f2fs: Suppress discard warning message for zoned block devices commit 0ab0299835738cd407569401da1fef4c97b4419c upstream. For zoned block devices, discard is replaced by zone reset. So do not warn if the device does not support discard. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4187e3b9a83e..3e57ec837de9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -412,7 +412,7 @@ static int parse_options(struct super_block *sb, char *options) q = bdev_get_queue(sb->s_bdev); if (blk_queue_discard(q)) { set_opt(sbi, DISCARD); - } else { + } else if (!f2fs_sb_mounted_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but " "the device does not support discard"); From 4b1d4ef0b7dd50ee42c1de77b03dd4039ee25672 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:03 +0900 Subject: [PATCH 1087/3220] f2fs: Always enable discard for zoned block devices commit 96ba2decb4241aa2c6b61cfc8489d648769eff99 upstream. Zone write pointer reset acts as discard for zoned block devices. So if the zoned block device feature is enabled, always declare that discard is enabled, even if the device does not actually support the command. For the same reason, prevent the use of the "nodiscard" mount option.
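As a usage note for the parse_options() hunk below: with this change, mounting a blkzoned-formatted volume with "-o nodiscard" (e.g. mount -o nodiscard /dev/sdz1 /mnt, hypothetical device name) now fails up front with -EINVAL and logs "discard is required for zoned block devices", instead of quietly disabling a facility that zone management depends on.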
Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 14 +++++++------- fs/f2fs/super.c | 5 +++++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5a563a9f3a52..4fd31208965d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1159,13 +1159,6 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock(&sbi->cp_lock); } -static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) -{ - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); - - return blk_queue_discard(q); -} - static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -2475,6 +2468,13 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +{ + struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + + return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb); +} + static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) { clear_opt(sbi, ADAPTIVE); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3e57ec837de9..33676c1e35d7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -419,6 +419,11 @@ static int parse_options(struct super_block *sb, char *options) } break; case Opt_nodiscard: + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "discard is required for zoned block devices"); + return -EINVAL; + } clear_opt(sbi, DISCARD); break; case Opt_noheap: From d9d8c376e4929feda317453b75bffd020a4fbd31 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:04 +0900 Subject: [PATCH 1088/3220] f2fs: Do not allow adaptive mode for host-managed zoned block devices commit 3adc57e97792e4ac9f228bde802829e2e9840afe upstream. The LFS mode is mandatory for host-managed zoned block devices as update-in-place optimizations are not possible for segments in sequential zones. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 33676c1e35d7..6bc0810969b7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -518,6 +518,13 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 8 && !strncmp(name, "adaptive", 8)) { + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "adaptive mode is not allowed with " + "zoned block device feature"); + kfree(name); + return -EINVAL; + } set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); } else if (strlen(name) == 3 && !strncmp(name, "lfs", 3)) { From 018fc18e28098712f19e23712ce24310ba0efdf5 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:05 +0900 Subject: [PATCH 1089/3220] f2fs: Cache zoned block devices zone type commit 178053e2f1f9ccdb61ff6c2bd8644b53fc98e72e upstream. With the zoned block device feature enabled, section discard needs to do a zone reset for sections contained in sequential zones, and a regular discard (if supported) for sections stored in conventional zones. Avoid the need for a costly zone report to obtain a section's zone type when discarding it by caching the types of the device zones in the super block information. This cache is initialized at mount time for mounts with the zoned block device feature enabled.
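A worked example of the lookup this cache enables (hypothetical geometry; the helper is the get_blkz_type() added below):

	/* hypothetical geometry: 256 MiB zones, 4 KiB blocks */
	sbi->blocks_per_blkz = (256 << 20) >> 12;	/* 65536 blocks per zone */
	sbi->log_blocks_per_blkz = 16;			/* ilog2(65536) */
	/* e.g. block address 0x12345 falls in zone 0x12345 >> 16 = 0x1 */
	zno = blkaddr >> sbi->log_blocks_per_blkz;
	type = sbi->blkz_type[zno];			/* one cached byte per zone */

so the discard path never needs a blkdev_report_zones() round trip at runtime.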
Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 18 +++++++++++++ fs/f2fs/super.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4fd31208965d..c6dba704b0fe 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -803,6 +803,14 @@ struct f2fs_sb_info { u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; u8 key_prefix_size; #endif + +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + unsigned int blocks_per_blkz; /* F2FS blocks per zone */ + unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ + u8 *blkz_type; /* Array of zones type */ +#endif + /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ @@ -2468,6 +2476,16 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +#ifdef CONFIG_BLK_DEV_ZONED +static inline int get_blkz_type(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + + return sbi->blkz_type[zno]; +} +#endif + static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) { struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6bc0810969b7..d777a18df958 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1512,6 +1512,65 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) GFP_KERNEL); } +#ifdef CONFIG_BLK_DEV_ZONED +static int init_blkz_info(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + sector_t nr_sectors = bdev->bd_part->nr_sects; + sector_t sector = 0; + struct blk_zone *zones; + unsigned int i, nr_zones; + unsigned int n = 0; + int err = -EIO; + + if (!f2fs_sb_mounted_blkzoned(sbi->sb)) + return 0; + + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); + sbi->nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; + if (nr_sectors & (bdev_zone_size(bdev) - 1)) + sbi->nr_blkz++; + + sbi->blkz_type = kmalloc(sbi->nr_blkz, GFP_KERNEL); + if (!sbi->blkz_type) + return -ENOMEM; + +#define F2FS_REPORT_NR_ZONES 4096 + + zones = kcalloc(F2FS_REPORT_NR_ZONES, sizeof(struct blk_zone), + GFP_KERNEL); + if (!zones) + return -ENOMEM; + + /* Get block zones type */ + while (zones && sector < nr_sectors) { + + nr_zones = F2FS_REPORT_NR_ZONES; + err = blkdev_report_zones(bdev, sector, + zones, &nr_zones, + GFP_KERNEL); + if (err) + break; + if (!nr_zones) { + err = -EIO; + break; + } + + for (i = 0; i < nr_zones; i++) { + sbi->blkz_type[n] = zones[i].type; + sector += zones[i].len; + n++; + } + } + + kfree(zones); + + return err; +} +#endif + /* * Read f2fs raw super block. 
* Because we have two copies of super block, so read both of them @@ -1758,6 +1817,15 @@ try_onemore: init_ino_entry_info(sbi); +#ifdef CONFIG_BLK_DEV_ZONED + err = init_blkz_info(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS blkzone information"); + goto free_blkz; + } +#endif + /* setup f2fs internal modules */ err = build_segment_manager(sbi); if (err) { @@ -1936,6 +2004,10 @@ free_nm: destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); +#ifdef CONFIG_BLK_DEV_ZONED +free_blkz: + kfree(sbi->blkz_type); +#endif kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); From 0573aa06787e9f274bcce187687c0e64f0dd69d2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:06 +0900 Subject: [PATCH 1090/3220] f2fs: Reset sequential zones on zoned block devices commit f46e8809e88d113ba096bcfc772b5182ce00b941 upstream. When a zoned block device is mounted, discarding sections contained in sequential zones must reset the zone write pointer. For sections contained in conventional zones, the regular discard is used if the drive supports it. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ec4d74c26067..8e4863bd36f5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -584,6 +585,45 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +#ifdef CONFIG_BLK_DEV_ZONED +static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t sector = SECTOR_FROM_BLOCK(blkstart); + sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); + struct block_device *bdev = sbi->sb->s_bdev; + + if (nr_sects != bdev_zone_size(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "Unaligned discard attempted (sector %llu + %llu)", + (unsigned long long)sector, + (unsigned long long)nr_sects); + return -EIO; + } + + /* + * We need to know the type of the zone: for conventional zones, + * use regular discard if the drive supports it. For sequential + * zones, reset the zone write pointer. + */ + switch (get_blkz_type(sbi, blkstart)) { + + case BLK_ZONE_TYPE_CONVENTIONAL: + if (!blk_queue_discard(bdev_get_queue(bdev))) + return 0; + return blkdev_issue_discard(bdev, sector, nr_sects, + GFP_NOFS, 0); + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + return blkdev_reset_zones(bdev, sector, + nr_sects, GFP_NOFS); + default: + /* Unknown zone type: broken device ? */ + return -EIO; + } +} +#endif + static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { @@ -601,6 +641,11 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, sbi->discard_blks--; } trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sbi->sb)) + return f2fs_issue_discard_zone(sbi, blkstart, blklen); +#endif return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } From e96476298b9745df77f1a57d0bf93ff134b174b4 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:07 +0900 Subject: [PATCH 1091/3220] f2fs: Trace reset zone events commit 126606c7a99b32ba8265a51fab01533fe40c9ecc upstream. Similarly to the regular discard, trace zone reset events. 
Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 + include/trace/events/f2fs.h | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8e4863bd36f5..06b9d16a19f6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -615,6 +615,7 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, GFP_NOFS, 0); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: + trace_f2fs_issue_reset_zone(sbi->sb, blkstart); return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); default: diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 3a09bb4dc3b2..90d6ad49a9c5 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1110,6 +1110,27 @@ TRACE_EVENT(f2fs_issue_discard, (unsigned long long)__entry->blklen) ); +TRACE_EVENT(f2fs_issue_reset_zone, + + TP_PROTO(struct super_block *sb, block_t blkstart), + + TP_ARGS(sb, blkstart), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(block_t, blkstart) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->blkstart = blkstart; + ), + + TP_printk("dev = (%d,%d), reset zone at block = 0x%llx", + show_dev(__entry), + (unsigned long long)__entry->blkstart) +); + TRACE_EVENT(f2fs_issue_flush, TP_PROTO(struct super_block *sb, unsigned int nobarrier, From c82c6b15a1b0775c6f72ad43a4ed79d37b9c5352 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 2 Nov 2016 20:43:21 +0800 Subject: [PATCH 1092/3220] f2fs: record inode updating status correctly commit 60dcedc9972d136fab1600c919dd32080bcc26f7 upstream. We should record the updating status of an inode only while the inode is still linked; for an unlinked inode we need to clear its ino cache, otherwise after the ino is reused it will cause unneeded node page writes during ->fsync. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 7b5e402f0a72..af06bda51a54 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -377,6 +377,9 @@ void f2fs_evict_inode(struct inode *inode) goto no_delete; #endif + remove_ino_entry(sbi, inode->i_ino, APPEND_INO); + remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); i_size_write(inode, 0); @@ -409,10 +412,12 @@ no_delete: invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(inode, FI_APPEND_WRITE)) - add_ino_entry(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) - add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + if (inode->i_nlink) { + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) + add_ino_entry(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + } if (is_inode_flag_set(inode, FI_FREE_NID)) { alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); From b2942d459ad6e47c6bcdd55b8df0137f8841d29b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 4 Nov 2016 00:26:55 +0800 Subject: [PATCH 1093/3220] f2fs: fix wrong i_atime recovery commit 9f0552e078b8a25fcf9c3950a05ea1398616d2b9 upstream. We shouldn't update the in-memory i_atime with the on-disk i_mtime of an inode when recovering it. Shuoran found this bug, which had been hidden for a long time; the honour belongs to him.
Signed-off-by: Shuoran Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 2fc84a991325..d2ba4da08ec3 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -180,10 +180,10 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_mode = le16_to_cpu(raw->i_mode); f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); - inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); From 0ec5bcd0fe77090d0a8b91012553ff8f85dfa216 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 4 Nov 2016 14:33:57 -0700 Subject: [PATCH 1094/3220] f2fs: assign segments correctly for direct_io commit bdb7d964c4fb73078ab21c20e8c7b7bf7e641bb6 upstream. Previously, we assigned CURSEG_WARM_DATA for direct_io, but if we have two or four logs, we do not use that type at all. Let's fix it. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 06b9d16a19f6..4bdf1191a36f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1422,8 +1422,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct curseg_info *curseg; bool direct_io = (type == CURSEG_DIRECT_IO); - type = direct_io ? CURSEG_WARM_DATA : type; - + if (direct_io) { + if (sbi->active_logs <= 4) + type = CURSEG_HOT_DATA; + else + type = CURSEG_WARM_DATA; + } curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); From 67609b128649f686d29047cff009ad28b01eac16 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 4 Nov 2016 14:59:15 -0700 Subject: [PATCH 1095/3220] f2fs: remove checkpoint in f2fs_freeze commit b4b9d34c855ef383402cd1acefb1e33a515ae5f5 upstream. The generic freeze_super() calls sync_filesystems() before f2fs_freeze(). So, basically we don't need to do checkpoint in f2fs_freeze(). But, in xfs/068, it triggers circular locking problem below due to gc_mutex for checkpoint. ====================================================== [ INFO: possible circular locking dependency detected ] 4.9.0-rc1+ #132 Tainted: G OE ------------------------------------------------------- 1. wait for __sb_start_write() by [] dump_stack+0x85/0xc2 [] print_circular_bug+0x1cf/0x230 [] __lock_acquire+0x19e0/0x1bc0 [] lock_acquire+0x11b/0x220 [] ? f2fs_drop_inode+0x9b/0x160 [f2fs] [] __sb_start_write+0x130/0x200 [] ? f2fs_drop_inode+0x9b/0x160 [f2fs] [] f2fs_drop_inode+0x9b/0x160 [f2fs] [] iput+0x171/0x2c0 [] f2fs_sync_inode_meta+0x3f/0xf0 [f2fs] [] block_operations+0x84/0x110 [f2fs] [] write_checkpoint+0xe8/0xf20 [f2fs] [] ? trace_hardirqs_on+0xd/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] ? sched_clock+0x9/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_sync_fs+0x85/0x190 [f2fs] [] ? do_fsync+0x70/0x70 [] ? do_fsync+0x70/0x70 [] sync_fs_one_sb+0x20/0x30 [] iterate_supers+0xae/0x100 [] sys_sync+0x55/0x90 [] entry_SYSCALL_64_fastpath+0x23/0xc6 2. 
wait for sbi->gc_mutex by [] lock_acquire+0x11b/0x220 [] mutex_lock_nested+0x76/0x3f0 [] f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_freeze+0x1c/0x20 [f2fs] [] freeze_super+0xcf/0x190 [] do_vfs_ioctl+0x53c/0x6a0 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x23/0xc6 Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d777a18df958..7a0634b0bee8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -800,13 +800,17 @@ int f2fs_sync_fs(struct super_block *sb, int sync) static int f2fs_freeze(struct super_block *sb) { - int err; - if (f2fs_readonly(sb)) return 0; - err = f2fs_sync_fs(sb, 1); - return err; + /* IO error happened before */ + if (unlikely(f2fs_cp_error(F2FS_SB(sb)))) + return -EIO; + + /* must be clean, since sync_filesystem() was already called */ + if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) + return -EINVAL; + return 0; } static int f2fs_unfreeze(struct super_block *sb) @@ -2153,3 +2157,4 @@ module_exit(exit_f2fs_fs) MODULE_AUTHOR("Samsung Electronics's Praesto Team"); MODULE_DESCRIPTION("Flash Friendly File System"); MODULE_LICENSE("GPL"); + From e035bdfc8b8af673fc8bfe0c4b5aeff6f9d57a9d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 5 Nov 2016 11:12:40 +0800 Subject: [PATCH 1096/3220] Revert "f2fs: do not recover from previous remained wrong dnodes" commit d47b8715953ad0cf82bb0a9d30d7b11d83bc9c11 upstream. The i_times of an inode are set from the current system time, which can be configured through 'date', so it's not safe to judge a dnode block as garbage data or an unchanged inode depending on i_times. Now that we use the enhanced 'cp_ver + cp' crc method to verify valid dnode blocks, I expect that recovering an invalid dnode is almost impossible. This reverts commit 807b1e1c8e08452948495b1a9985ab46d329e5c2.
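The 'cp_ver + cp' crc method mentioned here stamps node blocks with the checkpoint version, folding the checkpoint CRC into its high bits, so a stale dnode left over from an earlier mount cannot pass for current data. A simplified sketch of the verification side (condensed from the f2fs code of this era; helper names as assumed from f2fs.h/node.h, not the exact hunk):

	static bool written_after_last_checkpoint(struct f2fs_sb_info *sbi,
						  struct page *page)
	{
		struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
		__u64 cp_ver = cur_cp_version(ckpt);

		/* fold the checkpoint CRC into the high 32 bits of the version */
		if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG))
			cp_ver |= (cur_cp_crc(ckpt) << 32);

		return cp_ver == cpver_of_node(page);
	}

With that check in place, the time-based is_same_inode() heuristic removed below adds nothing, and it can misfire whenever the system clock is set backwards.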
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d2ba4da08ec3..62523b217571 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -196,32 +196,6 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static bool is_same_inode(struct inode *inode, struct page *ipage) -{ - struct f2fs_inode *ri = F2FS_INODE(ipage); - struct timespec disk; - - if (!IS_INODE(ipage)) - return true; - - disk.tv_sec = le64_to_cpu(ri->i_ctime); - disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); - if (timespec_compare(&inode->i_ctime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_atime); - disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); - if (timespec_compare(&inode->i_atime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_mtime); - disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); - if (timespec_compare(&inode->i_mtime, &disk) > 0) - return false; - - return true; -} - static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { struct curseg_info *curseg; @@ -248,10 +222,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (entry) { - if (!is_same_inode(entry->inode, page)) - goto next; - } else { + if (!entry) { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) From 5065008a77ef2b08339b4cc1d1f11a645821dce3 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 7 Nov 2016 21:22:31 +0800 Subject: [PATCH 1097/3220] f2fs: return directly if block has been removed from the victim commit 206147112860418fa9363cf94ec2c172bdf4313a upstream. If a block has already been written to a new place, just return in the move-data process. This patch checks it again while holding the page lock.
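Design note on the change below: the block's validity was established when the GC victim was selected, but the GC path can block before it pins the data page, so check_valid_map() is consulted a second time once the page is held; if the block was reclaimed or rewritten in the meantime, the move is simply skipped instead of copying stale data.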
Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 72a0ca08f901..744031194934 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -544,7 +544,8 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx) +static void move_encrypted_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -564,6 +565,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) if (!page) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -643,7 +647,8 @@ out: f2fs_put_page(page, 1); } -static void move_data_page(struct inode *inode, block_t bidx, int gc_type) +static void move_data_page(struct inode *inode, block_t bidx, int gc_type, + unsigned int segno, int off) { struct page *page; @@ -651,6 +656,9 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) if (IS_ERR(page)) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; @@ -792,9 +800,9 @@ next_step: start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - move_encrypted_block(inode, start_bidx); + move_encrypted_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type); + move_data_page(inode, start_bidx, gc_type, segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); From 00e5a211f9c0e5f2672165c7ee12aadc467d8a2b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 12:31:40 -0800 Subject: [PATCH 1098/3220] f2fs: revert segment allocation for direct IO commit 6ae1be13e85f4c42c8ca371fda50ae39eebbfd96 upstream. Now we don't need to be so careful about storage alignment for dio, since storage has become quite fast, and we'd better avoid any misalignment first.
Revert: 38aa0889b250 (f2fs: align direct_io'ed data to section) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +----- fs/f2fs/f2fs.h | 1 - fs/f2fs/segment.c | 36 +++++++++--------------------------- 3 files changed, 10 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a60efb8fdd3a..beb747132858 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -583,7 +583,6 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; - int seg = CURSEG_WARM_DATA; pgoff_t fofs; blkcnt_t count = 1; @@ -601,11 +600,8 @@ alloc: get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page) - seg = CURSEG_DIRECT_IO; - allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, seg); + &sum, CURSEG_WARM_DATA); set_data_blkaddr(dn); /* update i_size */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c6dba704b0fe..4d4bfbaeb788 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -639,7 +639,6 @@ enum { CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ NO_CHECK_TYPE, - CURSEG_DIRECT_IO, /* to use for the direct IO path */ }; struct flush_cmd { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4bdf1191a36f..19ab2e63d8d7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1267,25 +1267,21 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, stat_inc_seg_type(sbi, curseg); } -static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type) -{ - struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int old_segno; - - old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); - locate_dirty_segment(sbi, old_segno); -} - void allocate_new_segments(struct f2fs_sb_info *sbi) { + struct curseg_info *curseg; + unsigned int old_segno; int i; if (test_opt(sbi, LFS)) return; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segments(sbi, i); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + curseg = CURSEG_I(sbi, i); + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); + locate_dirty_segment(sbi, old_segno); + } } static const struct segment_allocation default_salloc_ops = { @@ -1419,25 +1415,11 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_summary *sum, int type) { struct sit_info *sit_i = SIT_I(sbi); - struct curseg_info *curseg; - bool direct_io = (type == CURSEG_DIRECT_IO); - - if (direct_io) { - if (sbi->active_logs <= 4) - type = CURSEG_HOT_DATA; - else - type = CURSEG_WARM_DATA; - } - curseg = CURSEG_I(sbi, type); + struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); - /* direct_io'ed data is aligned to the segment for better performance */ - if (direct_io && curseg->next_blkoff && - !has_not_enough_free_secs(sbi, 0, 0)) - __allocate_new_segments(sbi, type); - *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* From f9baf967bddd70a2f3d67793a16ac51c640a3bdd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 12:08:22 -0800 Subject: [PATCH 1099/3220] f2fs: allow dio read for LFS mode commit e57e9ae5b179a6b243c42bf6d9549d1595c27089 upstream. We can allow dio reads for LFS mode, while doing buffered writes for dio writes. 
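The gate itself is a one-line condition; a small standalone sketch of the intent (hypothetical names, not the f2fs signatures): direct reads allocate no blocks, so they cannot break the LFS append-only write discipline, while direct writes fall back to the buffered path:

    #include <stdbool.h>
    #include <stdio.h>

    enum rw { RW_READ, RW_WRITE };

    /* returns true when the request must fall back to buffered IO */
    static bool dio_must_buffer(enum rw rw, bool lfs_mode)
    {
        return rw == RW_WRITE && lfs_mode;  /* reads pass through untouched */
    }

    int main(void)
    {
        printf("LFS dio read  buffered? %d\n", dio_must_buffer(RW_READ, true));
        printf("LFS dio write buffered? %d\n", dio_must_buffer(RW_WRITE, true));
        return 0;
    }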
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index beb747132858..f59c91765b1d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1735,7 +1735,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return 0; - if (test_opt(F2FS_I_SB(inode), LFS)) + if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) return 0; if (trace_android_fs_dataread_start_enabled() && From 07f010798cc238d227dad56cc52e956094914099 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 6 Oct 2016 19:02:05 -0700 Subject: [PATCH 1100/3220] f2fs: support multiple devices commit 3c62be17d4f562f43fe1d03b48194399caa35aa5 upstream. This patch implements multiple devices support for f2fs. Given multiple devices by mkfs.f2fs, f2fs shows them entirely as one big volume under one f2fs instance. Internal block management is very simple, but we will modify block allocation and background GC policy to boost IO speed by exploiting them accoording to each device speed. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 55 ++++++++++++++-- fs/f2fs/f2fs.h | 29 +++++++-- fs/f2fs/segment.c | 112 +++++++++++++++++++++++--------- fs/f2fs/super.c | 138 +++++++++++++++++++++++++++++++--------- include/linux/f2fs_fs.h | 10 ++- 5 files changed, 274 insertions(+), 70 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f59c91765b1d..31e9c64e0f1d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -88,6 +88,46 @@ static void f2fs_write_end_io(struct bio *bio) bio_put(bio); } +/* + * Return true, if pre_bio's bdev is same as its target device. + */ +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + struct block_device *bdev = sbi->sb->s_bdev; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (FDEV(i).start_blk <= blk_addr && + FDEV(i).end_blk >= blk_addr) { + blk_addr -= FDEV(i).start_blk; + bdev = FDEV(i).bdev; + break; + } + } + if (bio) { + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + } + return bdev; +} + +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) + return i; + return 0; +} + +static bool __same_bdev(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev; +} + /* * Low-level block read/write IO operations. */ @@ -98,8 +138,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio = f2fs_bio_alloc(npages); - bio->bi_bdev = sbi->sb->s_bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; bio->bi_private = is_read ? 
NULL : sbi; @@ -269,7 +308,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - io->fio.rw != fio->rw)) + (io->fio.rw != fio->rw) || + !__same_bdev(sbi, fio->new_blkaddr, io->bio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { @@ -956,7 +996,6 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct fscrypt_ctx *ctx = NULL; - struct block_device *bdev = sbi->sb->s_bdev; struct bio *bio; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { @@ -974,8 +1013,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, fscrypt_release_ctx(ctx); return ERR_PTR(-ENOMEM); } - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr); + f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; bio->bi_private = ctx; @@ -1069,7 +1107,8 @@ got_it: * This page will go to BIO. Do we need to send this * BIO off first? */ - if (bio && (last_block_in_bio != block_nr - 1)) { + if (bio && (last_block_in_bio != block_nr - 1 || + !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { submit_and_realloc: __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); bio = NULL; @@ -1737,6 +1776,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return 0; if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) return 0; + if (F2FS_I_SB(inode)->s_ndevs) + return 0; if (trace_android_fs_dataread_start_enabled() && (iov_iter_rw(iter) == READ)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4d4bfbaeb788..04f6dddc6d91 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -756,6 +756,20 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +#define FDEV(i) (sbi->devs[i]) +#define RDEV(i) (raw_super->devs[i]) +struct f2fs_dev_info { + struct block_device *bdev; + char path[MAX_PATH_LEN]; + unsigned int total_segments; + block_t start_blk; + block_t end_blk; +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + u8 *blkz_type; /* Array of zones type */ +#endif +}; + enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ @@ -804,10 +818,8 @@ struct f2fs_sb_info { #endif #ifdef CONFIG_BLK_DEV_ZONED - unsigned int nr_blkz; /* Total number of zones */ unsigned int blocks_per_blkz; /* F2FS blocks per zone */ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ - u8 *blkz_type; /* Array of zones type */ #endif /* for node-related operations */ @@ -924,6 +936,8 @@ struct f2fs_sb_info { /* For shrinker support */ struct list_head s_list; + int s_ndevs; /* number of devices */ + struct f2fs_dev_info *devs; /* for device list */ struct mutex umount_mutex; unsigned int shrinker_run_no; @@ -2190,6 +2204,9 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_io_info *); +struct block_device *f2fs_target_device(struct f2fs_sb_info *, + block_t, struct bio *); +int f2fs_target_device_index(struct f2fs_sb_info *, block_t); void set_data_blkaddr(struct dnode_of_data *); void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); @@ -2477,11 +2494,15 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) #ifdef CONFIG_BLK_DEV_ZONED static 
inline int get_blkz_type(struct f2fs_sb_info *sbi, - block_t blkaddr) + struct block_device *bdev, block_t blkaddr) { unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + int i; - return sbi->blkz_type[zno]; + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).bdev == bdev) + return FDEV(i).blkz_type[zno]; + return -EINVAL; } #endif diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 19ab2e63d8d7..30d0e9a76c62 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -401,6 +401,32 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } +static int __submit_flush_wait(struct block_device *bdev) +{ + struct bio *bio = f2fs_bio_alloc(0); + int ret; + + bio->bi_bdev = bdev; + ret = submit_bio_wait(WRITE_FLUSH, bio); + bio_put(bio); + return ret; +} + +static int submit_flush_wait(struct f2fs_sb_info *sbi) +{ + int ret = __submit_flush_wait(sbi->sb->s_bdev); + int i; + + if (sbi->s_ndevs && !ret) { + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(FDEV(i).bdev); + if (ret) + break; + } + } + return ret; +} + static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; @@ -411,24 +437,18 @@ repeat: return 0; if (!llist_empty(&fcc->issue_list)) { - struct bio *bio; struct flush_cmd *cmd, *next; int ret; - bio = f2fs_bio_alloc(0); - fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); - bio->bi_bdev = sbi->sb->s_bdev; - ret = submit_bio_wait(WRITE_FLUSH, bio); - + ret = submit_flush_wait(sbi); llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; complete(&cmd->wait); } - bio_put(bio); fcc->dispatch_list = NULL; } @@ -449,14 +469,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return 0; if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { - struct bio *bio = f2fs_bio_alloc(0); int ret; atomic_inc(&fcc->submit_flush); - bio->bi_bdev = sbi->sb->s_bdev; - ret = submit_bio_wait(WRITE_FLUSH, bio); + ret = submit_flush_wait(sbi); atomic_dec(&fcc->submit_flush); - bio_put(bio); return ret; } @@ -586,18 +603,24 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) } #ifdef CONFIG_BLK_DEV_ZONED -static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, - block_t blkstart, block_t blklen) +static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t sector = SECTOR_FROM_BLOCK(blkstart); sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); - struct block_device *bdev = sbi->sb->s_bdev; + sector_t sector; + int devi = 0; - if (nr_sects != bdev_zone_size(bdev)) { + if (sbi->s_ndevs) { + devi = f2fs_target_device_index(sbi, blkstart); + blkstart -= FDEV(devi).start_blk; + } + sector = SECTOR_FROM_BLOCK(blkstart); + + if (sector % bdev_zone_size(bdev) || nr_sects != bdev_zone_size(bdev)) { f2fs_msg(sbi->sb, KERN_INFO, - "Unaligned discard attempted (sector %llu + %llu)", - (unsigned long long)sector, - (unsigned long long)nr_sects); + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ? FDEV(devi).path: "", + blkstart, blklen); return -EIO; } @@ -606,7 +629,7 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, * use regular discard if the drive supports it. For sequential * zones, reset the zone write pointer. 
*/ - switch (get_blkz_type(sbi, blkstart)) { + switch (get_blkz_type(sbi, bdev, blkstart)) { case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) @@ -625,29 +648,60 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } #endif -static int f2fs_issue_discard(struct f2fs_sb_info *sbi, - block_t blkstart, block_t blklen) +static int __issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) { sector_t start = SECTOR_FROM_BLOCK(blkstart); sector_t len = SECTOR_FROM_BLOCK(blklen); + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sbi->sb) && + bdev_zoned_model(bdev) != BLK_ZONED_NONE) + return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); +#endif + return blkdev_issue_discard(bdev, start, len, GFP_NOFS, 0); +} + +static int f2fs_issue_discard(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t start = blkstart, len = 0; + struct block_device *bdev; struct seg_entry *se; unsigned int offset; block_t i; + int err = 0; + + bdev = f2fs_target_device(sbi, blkstart, NULL); + + for (i = blkstart; i < blkstart + blklen; i++, len++) { + if (i != start) { + struct block_device *bdev2 = + f2fs_target_device(sbi, i, NULL); + + if (bdev2 != bdev) { + err = __issue_discard_async(sbi, bdev, + start, len); + if (err) + return err; + bdev = bdev2; + start = i; + len = 0; + } + } - for (i = blkstart; i < blkstart + blklen; i++) { se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); if (!f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); -#ifdef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_mounted_blkzoned(sbi->sb)) - return f2fs_issue_discard_zone(sbi, blkstart, blklen); -#endif - return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); + if (len) + err = __issue_discard_async(sbi, bdev, start, len); + + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + return err; } static void __add_discard_entry(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7a0634b0bee8..2d332a16de71 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -713,6 +713,19 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi) percpu_counter_destroy(&sbi->total_valid_inode_count); } +static void destroy_device_list(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + blkdev_put(FDEV(i).bdev, FMODE_EXCL); +#ifdef CONFIG_BLK_DEV_ZONED + kfree(FDEV(i).blkz_type); +#endif + } + kfree(sbi->devs); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -773,6 +786,8 @@ static void f2fs_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->raw_super); + destroy_device_list(sbi); + destroy_percpu_info(sbi); kfree(sbi); } @@ -1517,9 +1532,9 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) } #ifdef CONFIG_BLK_DEV_ZONED -static int init_blkz_info(struct f2fs_sb_info *sbi) +static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) { - struct block_device *bdev = sbi->sb->s_bdev; + struct block_device *bdev = FDEV(devi).bdev; sector_t nr_sectors = bdev->bd_part->nr_sects; sector_t sector = 0; struct blk_zone *zones; @@ -1530,15 +1545,21 @@ static int init_blkz_info(struct f2fs_sb_info *sbi) if (!f2fs_sb_mounted_blkzoned(sbi->sb)) return 0; + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != + SECTOR_TO_BLOCK(bdev_zone_size(bdev))) + return -EINVAL; 
sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != + __ilog2_u32(sbi->blocks_per_blkz)) + return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); - sbi->nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> - sbi->log_blocks_per_blkz; + FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; if (nr_sectors & (bdev_zone_size(bdev) - 1)) - sbi->nr_blkz++; + FDEV(devi).nr_blkz++; - sbi->blkz_type = kmalloc(sbi->nr_blkz, GFP_KERNEL); - if (!sbi->blkz_type) + FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); + if (!FDEV(devi).blkz_type) return -ENOMEM; #define F2FS_REPORT_NR_ZONES 4096 @@ -1563,7 +1584,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi) } for (i = 0; i < nr_zones; i++) { - sbi->blkz_type[n] = zones[i].type; + FDEV(devi).blkz_type[n] = zones[i].type; sector += zones[i].len; n++; } @@ -1667,6 +1688,77 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } +static int f2fs_scan_devices(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int i; + + for (i = 0; i < MAX_DEVICES; i++) { + if (!RDEV(i).path[0]) + return 0; + + if (i == 0) { + sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) * + MAX_DEVICES, GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; + } + + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + sbi->sb->s_mode, sbi->sb->s_type); + if (IS_ERR(FDEV(i).bdev)) + return PTR_ERR(FDEV(i).bdev); + + /* to release errored devices */ + sbi->s_ndevs = i + 1; + +#ifdef CONFIG_BLK_DEV_ZONED + if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && + !f2fs_sb_mounted_blkzoned(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Zoned block device feature not enabled\n"); + return -EINVAL; + } + if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { + if (init_blkz_info(sbi, i)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Failed to initialize F2FS blkzone information"); + return -EINVAL; + } + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk, + bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ? 
+ "Host-aware" : "Host-managed"); + continue; + } +#endif + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk); + } + return 0; +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; @@ -1725,15 +1817,7 @@ try_onemore: "Zoned block device support is not enabled\n"); goto free_sb_buf; } -#else - if (bdev_zoned_model(sb->s_bdev) == BLK_ZONED_HM && - !f2fs_sb_mounted_blkzoned(sb)) { - f2fs_msg(sb, KERN_ERR, - "Zoned block device feature not enabled\n"); - goto free_sb_buf; - } #endif - default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -1803,6 +1887,13 @@ try_onemore: goto free_meta_inode; } + /* Initialize device list */ + err = f2fs_scan_devices(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to find devices"); + goto free_devices; + } + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, @@ -1821,15 +1912,6 @@ try_onemore: init_ino_entry_info(sbi); -#ifdef CONFIG_BLK_DEV_ZONED - err = init_blkz_info(sbi); - if (err) { - f2fs_msg(sb, KERN_ERR, - "Failed to initialize F2FS blkzone information"); - goto free_blkz; - } -#endif - /* setup f2fs internal modules */ err = build_segment_manager(sbi); if (err) { @@ -2008,10 +2090,8 @@ free_nm: destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); -#ifdef CONFIG_BLK_DEV_ZONED -free_blkz: - kfree(sbi->blkz_type); -#endif +free_devices: + destroy_device_list(sbi); kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index e46e7d10312b..3e5972ef5019 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -52,10 +52,17 @@ #define VERSION_LEN 256 #define MAX_VOLUME_NAME 512 +#define MAX_PATH_LEN 64 +#define MAX_DEVICES 8 /* * For superblock */ +struct f2fs_device { + __u8 path[MAX_PATH_LEN]; + __le32 total_segments; +} __packed; + struct f2fs_super_block { __le32 magic; /* Magic Number */ __le16 major_ver; /* Major Version */ @@ -94,7 +101,8 @@ struct f2fs_super_block { __le32 feature; /* defined features */ __u8 encryption_level; /* versioning level for encryption */ __u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ - __u8 reserved[871]; /* valid reserved region */ + struct f2fs_device devs[MAX_DEVICES]; /* device list */ + __u8 reserved[327]; /* valid reserved region */ } __packed; /* From bd8e41540a703c57994fd4398753dbb9b6679d3e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 16:31:56 -0800 Subject: [PATCH 1101/3220] f2fs: use err for f2fs_preallocate_blocks commit a7de608691f766cd148971a71d4f13aa1692d4c8 upstream. This patch has no functional change. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 26 +++++++++++++------------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 35 +++++++++++++++++++---------------- 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 31e9c64e0f1d..87e07665832e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -653,11 +653,11 @@ alloc: return 0; } -ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct f2fs_map_blocks map; - ssize_t ret = 0; + int err = 0; map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); @@ -669,19 +669,19 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; if (iocb->ki_flags & IOCB_DIRECT) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; } if (!f2fs_has_inline_data(inode)) return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - return ret; + return err; } /* @@ -858,19 +858,19 @@ static int __get_data_block(struct inode *inode, sector_t iblock, pgoff_t *next_pgofs) { struct f2fs_map_blocks map; - int ret; + int err; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; map.m_next_pgofs = next_pgofs; - ret = f2fs_map_blocks(inode, &map, create, flag); - if (!ret) { + err = f2fs_map_blocks(inode, &map, create, flag); + if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; bh->b_size = map.m_len << inode->i_blkbits; } - return ret; + return err; } static int get_data_block(struct inode *inode, sector_t iblock, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 04f6dddc6d91..8dc378d82f67 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2212,7 +2212,7 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); -ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); +int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ce38a350fb38..fbfcd809baec 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1324,15 +1324,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset, pgoff_t pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; - int ret; + int err; - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) - return ret; + err = inode_newsize_ok(inode, (len + offset)); + if (err) + return err; - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; f2fs_balance_fs(sbi, true); @@ -1344,12 +1344,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (off_end) map.m_len++; - ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - if (ret) 
{ + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (err) { pgoff_t last_off; if (!map.m_len) - return ret; + return err; last_off = map.m_lblk + map.m_len - 1; @@ -1363,7 +1363,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) f2fs_i_size_write(inode, new_size); - return ret; + return err; } static long f2fs_fallocate(struct file *file, int mode, @@ -2267,12 +2267,15 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { - ret = f2fs_preallocate_blocks(iocb, from); - if (!ret) { - blk_start_plug(&plug); - ret = __generic_file_write_iter(iocb, from); - blk_finish_plug(&plug); + int err = f2fs_preallocate_blocks(iocb, from); + + if (err) { + inode_unlock(inode); + return err; } + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + blk_finish_plug(&plug); } inode_unlock(inode); From 8b2c7581e85f6b3464763952ef54c82691b71c4e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 16:46:40 -0800 Subject: [PATCH 1102/3220] f2fs: fix redundant block allocation commit c040ff9d69fd1d782fe577ba9e35c1f5798158ae upstream. In direct_IO path of f2fs_file_write_iter(), 1. f2fs_preallocate_blocks(F2FS_GET_BLOCK_PRE_DIO) -> allocate LBA X 2. f2fs_direct_IO() -> return 0; Then, f2fs_write_data_page() will allocate another LBA X+1. This makes EIO triggered by HM-SMR. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 87e07665832e..ed1ac24ba7c4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -653,6 +653,13 @@ alloc: return 0; } +static inline bool __force_buffered_io(struct inode *inode, int rw) +{ + return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || + F2FS_I_SB(inode)->s_ndevs); +} + int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); @@ -672,7 +679,10 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) err = f2fs_convert_inline_inode(inode); if (err) return err; - return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + return f2fs_map_blocks(inode, &map, 1, + __force_buffered_io(inode, WRITE) ? + F2FS_GET_BLOCK_PRE_AIO : + F2FS_GET_BLOCK_PRE_DIO); } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { err = f2fs_convert_inline_inode(inode); @@ -1772,11 +1782,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (err) return err; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return 0; - if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) - return 0; - if (F2FS_I_SB(inode)->s_ndevs) + if (__force_buffered_io(inode, rw)) return 0; if (trace_android_fs_dataread_start_enabled() && From 79ba046b115eaa4c0f30104bccb6f41ae710e952 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 14 Nov 2016 17:38:35 -0800 Subject: [PATCH 1103/3220] f2fs: avoid BG_GC in f2fs_balance_fs commit 7702bdbe505a22380dd958e2ee35124c7c414806 upstream. If many threads hit has_not_enough_free_secs() in f2fs_balance_fs() at the same time, all the threads would do FG_GC or BG_GC. In this critical path, we totally don't need to do BG_GC at all. Let's avoid that. 
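A toy sketch of the resulting calling convention (names and structure are illustrative, not the f2fs signatures): only the allocation critical path passes background=false, so contention there no longer fans out into useless background cleaning:

    #include <stdbool.h>
    #include <stdio.h>

    enum gc_type { BG_GC, FG_GC };

    static int run_gc(bool sync, bool background)
    {
        enum gc_type type = sync ? FG_GC : BG_GC;

        if (type == BG_GC && !background) {
            /* critical allocation path: nothing useful to do, bail out */
            return 0;
        }
        printf("running %s\n", type == FG_GC ? "FG_GC" : "BG_GC");
        return 1;
    }

    int main(void)
    {
        run_gc(false, false);  /* f2fs_balance_fs(): skips BG_GC */
        run_gc(false, true);   /* background GC thread: BG_GC allowed */
        run_gc(true, false);   /* urgent reclaim: FG_GC still runs */
        return 0;
    }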
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 7 +++++-- fs/f2fs/segment.c | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8dc378d82f67..687ab43a6cd8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2235,7 +2235,7 @@ int f2fs_migrate_page(struct address_space *, struct page *, struct page *, int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int, struct inode *); -int f2fs_gc(struct f2fs_sb_info *, bool); +int f2fs_gc(struct f2fs_sb_info *, bool, bool); void build_gc_manager(struct f2fs_sb_info *); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fbfcd809baec..84f4572ae959 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1853,7 +1853,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync, true); out: mnt_drop_write_file(filp); return ret; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 744031194934..54d06c21af07 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -82,7 +82,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC))) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -905,7 +905,7 @@ next: return sec_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) { unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; @@ -946,6 +946,9 @@ gc_more: if (ret) goto stop; } + } else if (gc_type == BG_GC && !background) { + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + goto stop; } if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 30d0e9a76c62..27e1b7c56e4c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -364,7 +364,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false); + f2fs_gc(sbi, false, false); } } From df3f20f12b9fcf4c93706ac24edcb6a3ff9f58a5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 14 Nov 2016 18:20:10 -0800 Subject: [PATCH 1104/3220] f2fs: fix wrong written_valid_blocks counting commit c79b7ff1d3c7710c23d8828a69d8cbc5597ad19f upstream. Previously, written_valid_blocks was taken from ckpt->valid_block_count. But if the last checkpoint contains some NEW_ADDR blocks due to a power cut, we can get a wrong value. Fix it to derive the number from the actual written block counts in the sit entries.
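The idea of the fix can be shown with a toy model (made-up numbers, not the real sit layout): rebuild the counter by summing the per-segment valid block counts that the sit entries actually record, instead of trusting one checkpoint header field:

    #include <stdio.h>

    /* toy per-segment valid block counts, as sit entries would record them */
    static const unsigned int valid_blocks[] = { 512, 0, 37, 0, 481 };
    #define NSEGS (sizeof(valid_blocks) / sizeof(valid_blocks[0]))

    int main(void)
    {
        unsigned long long written = 0;

        /* sum what is actually marked valid on disk; a single header count
         * could still include NEW_ADDR blocks left by a power cut */
        for (unsigned int seg = 0; seg < NSEGS; seg++)
            written += valid_blocks[seg];

        printf("written_valid_blocks = %llu\n", written);
        return 0;
    }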
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 27e1b7c56e4c..58cae4a541a7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2176,7 +2176,6 @@ out: static int build_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; char *src_bitmap, *dst_bitmap; @@ -2243,7 +2242,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; - sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count); + sit_i->written_valid_blocks = 0; sit_i->sit_bitmap = dst_bitmap; sit_i->bitmap_size = bitmap_size; sit_i->dirty_sentries = 0; @@ -2397,6 +2396,9 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) struct seg_entry *sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); + else + SIT_I(sbi)->written_valid_blocks += + sentry->valid_blocks; } /* set use the current segments */ From 9e266223b33dac867e29e7d0393e363ff86e092a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 16 Nov 2016 10:41:20 +0800 Subject: [PATCH 1105/3220] f2fs: don't wait writeback for datas during checkpoint commit 36951b38d13ac7cce9fcf89e0e01c22ed0d05688 upstream. Normally, while committing a checkpoint, we wait on all pages to be written back no matter whether a page holds data or metadata, so in a scenario where lots of data IO is submitted together with metadata, we may suffer long latency waiting for writeback during checkpoint. Indeed, we only care about persistence for pages with metadata, not pages with data, since file system consistency is only related to metadata. So, to avoid that long latency, let's recognize and reference metadata in submitted IOs and wait on writeback only for metadata.
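The classification can be summarized with a small standalone sketch (a simplified predicate; the real __is_cp_guaranteed() inspects the page's mapping and inode): pages the checkpoint must persist are counted separately, and the checkpoint drains only that counter:

    #include <stdbool.h>
    #include <stdio.h>

    enum wb_type { WB_DATA, WB_CP_DATA };
    static int nr_wb[2];

    /* simplified version of the rule: meta/node inodes, directories and
     * cold data must survive the checkpoint; plain file data need not */
    static enum wb_type classify(bool meta_or_node, bool dir, bool cold)
    {
        return (meta_or_node || dir || cold) ? WB_CP_DATA : WB_DATA;
    }

    int main(void)
    {
        nr_wb[classify(true, false, false)]++;   /* node page in flight */
        nr_wb[classify(false, false, false)]++;  /* regular data page */

        /* wait_on_all_pages_writeback() now only drains the CP counter */
        printf("checkpoint waits on %d page(s), ignores %d\n",
               nr_wb[WB_CP_DATA], nr_wb[WB_DATA]);
        return 0;
    }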
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 35 +++++++++++++++++++++++++++++------ fs/f2fs/debug.c | 7 ++++--- fs/f2fs/f2fs.h | 9 ++++++--- fs/f2fs/file.c | 2 -- fs/f2fs/gc.c | 2 -- fs/f2fs/segment.c | 1 - 7 files changed, 40 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ed79757c36e0..889317e07122 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1005,7 +1005,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_read(&sbi->nr_wb_bios)) + if (!get_pages(sbi, F2FS_WB_CP_DATA)) break; io_schedule_timeout(5*HZ); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ed1ac24ba7c4..fa254daac970 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -30,6 +30,26 @@ #include #include +static bool __is_cp_guaranteed(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct f2fs_sb_info *sbi; + + if (!mapping) + return false; + + inode = mapping->host; + sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_NODE_INO(sbi) || + S_ISDIR(inode->i_mode) || + is_cold_data(page)) + return true; + return false; +} + static void f2fs_read_end_io(struct bio *bio) { struct bio_vec *bvec; @@ -72,6 +92,7 @@ static void f2fs_write_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; + enum count_type type = WB_DATA_TYPE(page); fscrypt_pullback_bio_page(&page, true); @@ -79,9 +100,11 @@ static void f2fs_write_end_io(struct bio *bio) set_bit(AS_EIO, &page->mapping->flags); f2fs_stop_checkpoint(sbi, true); } + dec_page_count(sbi, type); + clear_cold_data(page); end_page_writeback(page); } - if (atomic_dec_and_test(&sbi->nr_wb_bios) && + if (!get_pages(sbi, F2FS_WB_CP_DATA) && wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); @@ -149,7 +172,6 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, struct bio *bio, enum page_type type) { if (!is_read_io(rw)) { - atomic_inc(&sbi->nr_wb_bios); if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); @@ -305,6 +327,11 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) verify_block_addr(sbi, fio->old_blkaddr); verify_block_addr(sbi, fio->new_blkaddr); + bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; + + if (!is_read) + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || @@ -318,8 +345,6 @@ alloc_new: io->fio = *fio; } - bio_page = fio->encrypted_page ? 
fio->encrypted_page : fio->page; - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); @@ -1331,7 +1356,6 @@ done: if (err && err != -ENOENT) goto redirty_out; - clear_cold_data(page); out: inode_dec_dirty_pages(inode); if (err) @@ -1745,7 +1769,6 @@ static int f2fs_write_end(struct file *file, goto unlock_out; set_page_dirty(page); - clear_cold_data(page); if (pos + copied > i_size_read(inode)) f2fs_i_size_write(inode, pos + copied); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 678733c78f55..fbd5184140d0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -50,7 +50,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_bios = atomic_read(&sbi->nr_wb_bios); + si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); + si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -313,8 +314,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb_bios: %4d\n", - si->inmem_pages, si->wb_bios); + seq_printf(s, " - inmem: %4d, wb_cp_data: %4d, wb_data: %4d\n", + si->inmem_pages, si->nr_wb_cp_data, si->nr_wb_data); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 687ab43a6cd8..d6119ea3b86d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -702,6 +702,7 @@ struct f2fs_sm_info { * f2fs monitors the number of several block types such as on-writeback, * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ +#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? 
F2FS_WB_CP_DATA : F2FS_WB_DATA) enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, @@ -709,6 +710,8 @@ enum count_type { F2FS_DIRTY_META, F2FS_INMEM_PAGES, F2FS_DIRTY_IMETA, + F2FS_WB_CP_DATA, + F2FS_WB_DATA, NR_COUNT_TYPE, }; @@ -888,7 +891,6 @@ struct f2fs_sb_info { block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ - atomic_t nr_wb_bios; /* # of writeback bios */ /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; @@ -1302,7 +1304,8 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_inc(&sbi->nr_pages[count_type]); - if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || + count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA) return; set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -2261,7 +2264,7 @@ struct f2fs_stat_info { unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; - int bg_gc, wb_bios; + int bg_gc, nr_wb_cp_data, nr_wb_data; int inline_xattr, inline_inode, inline_dir, orphans; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 84f4572ae959..bab65f0a5bb5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -95,8 +95,6 @@ mapped: if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); - /* if gced page is attached, don't write to cold segment */ - clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 54d06c21af07..6390d45c1b68 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -690,8 +690,6 @@ retry: congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - - clear_cold_data(page); } out: f2fs_put_page(page, 1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 58cae4a541a7..23e8892c4e60 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -288,7 +288,6 @@ static int __commit_inmem_pages(struct inode *inode, /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - clear_cold_data(page); submit_bio = true; } unlock_page(page); From beaab6afb4c86fd7daa47577220d597cdc183a46 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 16 Nov 2016 17:26:24 +0800 Subject: [PATCH 1106/3220] f2fs: fix an infinite loop when flush nodes in cp commit d40a43af0a57a017eba9ad2679183791587ceb6a upstream. Thread A: write_checkpoint -> block_operations -> blk_start_plug -> sync_node_pages. Thread B: f2fs_do_sync_file -> fsync_node_pages -> f2fs_wait_on_page_writeback. Thread A waits for the global F2FS_DIRTY_NODES count to drop to zero; it has started a plug list, and some requests have already been added to that list. Thread B locks one dirty node page and waits for the page to be written back. But that page is sitting in thread A's plug list with the PG_writeback flag set. Thread A keeps running and its plug list never gets a chance to finish, so this is a deadlock between the cp and fsync paths. This patch adds a wait on page writeback before setting the node page dirty, to avoid the problem.
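The shape of the fix, namely blocking until the in-flight writeback finishes before dirtying the page again, can be sketched with a condition variable; under_writeback and the helpers below are illustrative stand-ins for PG_writeback and f2fs_wait_on_page_writeback(), not kernel code:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
    static bool under_writeback;      /* stand-in for PG_writeback */

    /* stand-in for f2fs_wait_on_page_writeback(): sleep until the bio that
     * may still be parked in another thread's plug list has completed */
    static void wait_on_writeback(void)
    {
        pthread_mutex_lock(&m);
        while (under_writeback)
            pthread_cond_wait(&done, &m);
        pthread_mutex_unlock(&m);
    }

    static void end_writeback(void)   /* bio completion path */
    {
        pthread_mutex_lock(&m);
        under_writeback = false;
        pthread_cond_signal(&done);
        pthread_mutex_unlock(&m);
    }

    int main(void)
    {
        end_writeback();      /* pretend the plugged bio finally completed */
        wait_on_writeback();  /* only now is it safe to set_page_dirty() */
        printf("page re-dirtied safely\n");
        return 0;
    }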
Signed-off-by: Yunlei He Signed-off-by: Pengyang Hou Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 389be7f6e07c..59cc29e6b73c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1409,6 +1409,7 @@ continue_unlock: "Retry to write fsync mark: ino=%u, idx=%lx", ino, last_page->index); lock_page(last_page); + f2fs_wait_on_page_writeback(last_page, NODE, true); set_page_dirty(last_page); unlock_page(last_page); goto retry; From 031017c6f9922c9c7b7819e876531de3f7ecb065 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 17 Nov 2016 20:53:11 +0800 Subject: [PATCH 1107/3220] f2fs: fix to account total free nid correctly commit 04d47e673863c637a2b44ad34a558aeb5d0a727e upstream. Thread A Thread B Thread C - f2fs_create - f2fs_new_inode - f2fs_lock_op - alloc_nid alloc last nid - f2fs_unlock_op - f2fs_create - f2fs_new_inode - f2fs_lock_op - alloc_nid as node count still not be increased, we will loop in alloc_nid - f2fs_write_node_pages - f2fs_balance_fs_bg - f2fs_sync_fs - write_checkpoint - block_operations - f2fs_lock_all - f2fs_lock_op While creating new inode, we do not allocate and account nid atomically, so that when there is almost no free nids left, we may encounter deadloop like above stack. In order to avoid that, reuse nm_i::available_nids for accounting free nids and make nid allocation and counting being atomical during node creation. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 34 +++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d6119ea3b86d..973ca74404de 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -560,7 +560,7 @@ enum nid_list { struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ - nid_t available_nids; /* maximum available node ids */ + nid_t available_nids; /* # of available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 59cc29e6b73c..edacbabb92cf 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1885,11 +1885,13 @@ retry: if (time_to_inject(sbi, FAULT_ALLOC_NID)) return false; #endif - if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) - return false; - spin_lock(&nm_i->nid_list_lock); + if (unlikely(nm_i->available_nids == 0)) { + spin_unlock(&nm_i->nid_list_lock); + return false; + } + /* We should not use stale free nids created by build_free_nids */ if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); @@ -1900,6 +1902,7 @@ retry: __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); i->state = NID_ALLOC; __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); + nm_i->available_nids--; spin_unlock(&nm_i->nid_list_lock); return true; } @@ -1951,6 +1954,9 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) i->state = NID_NEW; __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); } + + nm_i->available_nids++; + spin_unlock(&nm_i->nid_list_lock); if (need_free) @@ -2150,6 +2156,19 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&ne->ni, &raw_ne); } + + /* + * if a free nat in journal has not been used after 
last + * checkpoint, we should remove it from available nids, + * since later we will add it again. + */ + if (!get_nat_flag(ne, IS_DIRTY) && + le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) { + spin_lock(&nm_i->nid_list_lock); + nm_i->available_nids--; + spin_unlock(&nm_i->nid_list_lock); + } + __set_nat_cache_dirty(nm_i, ne); } update_nats_in_cursum(journal, -i); @@ -2222,8 +2241,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - if (nat_get_blkaddr(ne) == NULL_ADDR) + if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); + spin_lock(&NM_I(sbi)->nid_list_lock); + NM_I(sbi)->available_nids++; + spin_unlock(&NM_I(sbi)->nid_list_lock); + } } if (to_journal) @@ -2298,7 +2321,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ - nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; + nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - + F2FS_RESERVED_NODE_NUM; nm_i->nid_cnt[FREE_NID_LIST] = 0; nm_i->nid_cnt[ALLOC_NID_LIST] = 0; nm_i->nat_cnt = 0; From 8f5fcb8034c939bb9e6c03fe748a08df85a8503a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 17 Nov 2016 20:53:31 +0800 Subject: [PATCH 1108/3220] f2fs: fix fdatasync commit 281518c694a5228d6c46fac83529fb3e2c331281 upstream. For the two cases below, we can't guarantee data consistency: a) 1. xfs_io "pwrite 0 4195328" "fsync" 2. xfs_io "pwrite 4195328 1024" "fdatasync" 3. godown 4. umount & mount --> the isize we updated before fdatasync won't be recovered b) 1. xfs_io "pwrite -S 0xcc 0 4202496" "fsync" 2. xfs_io "fpunch 4194304 4096" "fdatasync" 3. godown 4. umount & mount --> the dnode we punched before fdatasync won't be recovered The reason is that fdatasync is normally not aware of modifications to a file's metadata, e.g. isize changes or dnode updates, so in ->fsync we skip flushing node pages for the above cases, resulting in the fdatasynced data being lost during recovery. Since we have introduced the DIRTY_META global list in sbi for selectively tracking dirty inodes, in fdatasync we can choose to flush nodes depending on the dirty state of the current inode in that list.
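A small sketch of the new decision (toy types; the real code tests list_empty(&F2FS_I(inode)->gdirty_list) under the DIRTY_META inode lock): fdatasync may skip the node flush only when the inode carries no dirty metadata at all:

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_inode {
        bool on_gdirty_list;  /* inode sits on the DIRTY_META list */
    };

    static bool can_skip_node_flush(const struct toy_inode *inode, bool dsync)
    {
        if (dsync)
            return !inode->on_gdirty_list;
        /* plain fsync keeps its older, stricter conditions (omitted here) */
        return false;
    }

    int main(void)
    {
        struct toy_inode appended = { .on_gdirty_list = true };  /* isize grew */
        struct toy_inode clean    = { .on_gdirty_list = false };

        printf("fdatasync after append: skip node flush = %d\n",
               can_skip_node_flush(&appended, true));
        printf("fdatasync, clean metadata: skip node flush = %d\n",
               can_skip_node_flush(&clean, true));
        return 0;
    }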
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 ++++++++++- fs/f2fs/file.c | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 973ca74404de..16bedd87022d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1763,8 +1763,17 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) set_inode_flag(inode, FI_AUTO_RECOVER); } -static inline bool f2fs_skip_inode_update(struct inode *inode) +static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool ret; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) return false; return F2FS_I(inode)->last_disk_size == i_size_read(inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bab65f0a5bb5..7fd8e7cffe9b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -209,7 +209,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, } /* if the inode is dirty, let's recover all the time */ - if (!datasync && !f2fs_skip_inode_update(inode)) { + if (!f2fs_skip_inode_update(inode, datasync)) { f2fs_write_inode(inode, NULL); goto go_write; } From 7ab9a6acd086ecdc9a5e86785b7abb3076118ba1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 16 Nov 2016 15:09:48 -0800 Subject: [PATCH 1109/3220] f2fs: do not recover i_size if it's valid commit 3a3a5ead7b6d2c9a29f493791ba23f264052db34 upstream. If i_size is already valid during roll_forward recovery, we should not update it according to the block alignment. Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 62523b217571..687c176f0b56 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -425,7 +425,7 @@ retry_dn: continue; } - if ((start + 1) << PAGE_SHIFT > i_size_read(inode)) + if (i_size_read(inode) <= (start << PAGE_SHIFT)) f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); /* From 7ce8cbc7faa97d91231383dc662316226e2dcd15 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 16 Nov 2016 18:53:16 -0800 Subject: [PATCH 1110/3220] f2fs: fix wrong AUTO_RECOVER condition commit 97dd26ad834739d4e4ea35fd7ab5f92824de4cbb upstream. If i_size is not aligned to the f2fs's block size, we should not skip inode update during fsync. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 16bedd87022d..fa66e5baa58a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1774,7 +1774,8 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) spin_unlock(&sbi->inode_lock[DIRTY_META]); return ret; } - if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + i_size_read(inode) & PAGE_MASK) return false; return F2FS_I(inode)->last_disk_size == i_size_read(inode); } From 23eb5177568d70646669871d2268106947630990 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 18 Nov 2016 22:21:13 +0800 Subject: [PATCH 1111/3220] f2fs: drop duplicate header timer.h commit b4ceec29219e340178baa9c5f17bf97a42951cc8 upstream. Drop duplicate header timer.h from segment.c. 
Signed-off-by: Geliang Tang Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 23e8892c4e60..ba715d60c738 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "f2fs.h" #include "segment.h" From a680707a3f4161761e35effaedbdba02ee7ee069 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Nov 2016 22:27:41 +0800 Subject: [PATCH 1112/3220] f2fs: fix incorrect free inode count in ->statfs commit b08b12d2ddc85b977a0531470cf6a7158289aaaf upstream. While calculating inode count that we can create at most in the left space, we should consider space which data/node blocks occupied, since we create data/node mixly in main area. So fix the wrong calculation in ->statfs. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2d332a16de71..a288456c17c0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -852,7 +852,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = user_block_count - valid_user_blocks(sbi); buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - buf->f_ffree = buf->f_files - valid_inode_count(sbi); + buf->f_ffree = min(buf->f_files - valid_node_count(sbi), + buf->f_bavail); buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; From 55342cce61558f36270ba7056f29260b2e769d7e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 22 Nov 2016 15:20:16 +0100 Subject: [PATCH 1113/3220] f2fs: fix 32-bit build commit 19c526515f6b998039d5d71fea879d255f173746 upstream. The addition of multiple-device support broke CONFIG_BLK_DEV_ZONED on 32-bit machines because of a 64-bit division: fs/f2fs/f2fs.o: In function `__issue_discard_async': extent_cache.c:(.text.__issue_discard_async+0xd4): undefined reference to `__aeabi_uldivmod' Fortunately, bdev_zone_size() is guaranteed to return a power-of-two number, so we can replace the % operator with a cheaper bit mask. Fixes: 792b84b74b54 ("f2fs: support multiple devices") Signed-off-by: Arnd Bergmann Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ba715d60c738..4f557e5e789d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -614,7 +614,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } sector = SECTOR_FROM_BLOCK(blkstart); - if (sector % bdev_zone_size(bdev) || nr_sects != bdev_zone_size(bdev)) { + if (sector & (bdev_zone_size(bdev) - 1) || + nr_sects != bdev_zone_size(bdev)) { f2fs_msg(sbi->sb, KERN_INFO, "(%d) %s: Unaligned discard attempted (block %x + %x)", devi, sbi->s_ndevs ? FDEV(devi).path: "", From 9f495d826bf3dc4894f3cb4ef61af77102a38d9a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 28 Nov 2016 15:33:38 -0800 Subject: [PATCH 1114/3220] f2fs: do not activate auto_recovery for fallocated i_size commit 26787236b36660baf4d136281d40b5bb33a570ec upstream. If a file needs to keep its i_size by fallocate, we need to turn off auto recovery during roll-forward recovery. This will resolve the below scenario. 1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 4096" -c "fsync" 2. xfs_io -f /mnt/f2fs/file -c "falloc -k 4096 4096" -c "fsync" 3. md5sum /mnt/f2fs/file; 4. godown /mnt/f2fs/ 5. umount /mnt/f2fs/ 6. mount -t f2fs /dev/sdx /mnt/f2fs 7. 
md5sum /mnt/f2fs/file Reported-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 38 +++++++++++++++++++++----------------- fs/f2fs/file.c | 2 ++ fs/f2fs/recovery.c | 11 ++++++++--- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fa66e5baa58a..4f1046be0d74 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -449,6 +449,7 @@ struct f2fs_map_blocks { #define FADVISE_LOST_PINO_BIT 0x02 #define FADVISE_ENCRYPT_BIT 0x04 #define FADVISE_ENC_NAME_BIT 0x08 +#define FADVISE_KEEP_SIZE_BIT 0x10 #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) @@ -461,6 +462,8 @@ struct f2fs_map_blocks { #define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) +#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) #define DEF_DIR_LEVEL 0 @@ -1763,23 +1766,6 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) set_inode_flag(inode, FI_AUTO_RECOVER); } -static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) -{ - if (dsync) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool ret; - - spin_lock(&sbi->inode_lock[DIRTY_META]); - ret = list_empty(&F2FS_I(inode)->gdirty_list); - spin_unlock(&sbi->inode_lock[DIRTY_META]); - return ret; - } - if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || - i_size_read(inode) & PAGE_MASK) - return false; - return F2FS_I(inode)->last_disk_size == i_size_read(inode); -} - static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; @@ -1932,6 +1918,24 @@ static inline void clear_file(struct inode *inode, int type) f2fs_mark_inode_dirty_sync(inode, true); } +static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) +{ + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool ret; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + file_keep_isize(inode) || + i_size_read(inode) & PAGE_MASK) + return false; + return F2FS_I(inode)->last_disk_size == i_size_read(inode); +} + static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7fd8e7cffe9b..57b6dbcbbd88 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1403,6 +1403,8 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 687c176f0b56..981a9584b62f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -187,6 +187,8 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + F2FS_I(inode)->i_advise = raw->i_advise; + if (file_enc_name(inode)) name = ""; else @@ -425,7 +427,8 @@ retry_dn: continue; } - if (i_size_read(inode) <= (start << PAGE_SHIFT)) + if (!file_keep_isize(inode) && + 
(i_size_read(inode) <= (start << PAGE_SHIFT))) f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); /* @@ -478,8 +481,10 @@ err: f2fs_put_dnode(&dn); out: f2fs_msg(sbi->sb, KERN_NOTICE, - "recover_data: ino = %lx, recovered = %d blocks, err = %d", - inode->i_ino, recovered, err); + "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d", + inode->i_ino, + file_keep_isize(inode) ? "keep" : "recover", + recovered, err); return err; } From 9a82dd23e414b084ed191d8506b99cb85f401028 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 Nov 2016 19:13:43 -0800 Subject: [PATCH 1115/3220] f2fs: return AOP_WRITEPAGE_ACTIVATE for writepage commit 0002b61bdaac732bcff364a18f5bd57c95def0a5 upstream. We should use AOP_WRITEPAGE_ACTIVATE when we bypass writing pages. Signed-off-by: Chao Yu Signed-off-by: Miao Xie Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fa254daac970..283fc9de4762 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1376,6 +1376,8 @@ out: redirty_out: redirty_page_for_writepage(wbc, page); + if (!err) + return AOP_WRITEPAGE_ACTIVATE; unlock_page(page); return err; } @@ -1471,6 +1473,15 @@ continue_unlock: ret = mapping->a_ops->writepage(page, wbc); if (unlikely(ret)) { + /* + * keep nr_to_write, since vfs uses this to + * get # of written pages. + */ + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + continue; + } done_index = page->index + 1; done = 1; break; From 036ed1b8ebbbe2db084f83bb8e03d83a0d3d6ee6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Dec 2016 15:11:32 -0800 Subject: [PATCH 1116/3220] Revert "f2fs: use percpu_counter for # of dirty pages in inode" commit 204706c7accfabb67b97eef9f9a28361b6201199 upstream. This reverts commit 1beba1b3a953107c3ff5448ab4e4297db4619c76. The percpu_counter doesn't provide atomicity on a single core and consumes more DRAM. That incurs an fs_mark test failure due to ENOMEM. Cc: stable@vger.kernel.org # 4.7+ Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 10 +++++----- fs/f2fs/file.c | 2 +- fs/f2fs/super.c | 7 +------ 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4f1046be0d74..c7eb2ff398ce 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -479,7 +479,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - struct percpu_counter dirty_pages; /* # of dirty pages */ + atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ @@ -1316,7 +1316,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_pages(struct inode *inode) { - percpu_counter_inc(&F2FS_I(inode)->dirty_pages); + atomic_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } @@ -1332,7 +1332,7 @@ static inline void inode_dec_dirty_pages(struct inode *inode) !S_ISLNK(inode->i_mode)) return; - percpu_counter_dec(&F2FS_I(inode)->dirty_pages); + atomic_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } @@ -1342,9 +1342,9 @@ static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) return atomic_read(&sbi->nr_pages[count_type]); } -static inline s64 get_dirty_pages(struct inode *inode) +static inline int get_dirty_pages(struct inode *inode) { - return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages); + return atomic_read(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 57b6dbcbbd88..5c0500813efe 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1537,7 +1537,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, - "Unexpected flush for atomic writes: ino=%lu, npages=%lld", + "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a288456c17c0..ce09191891f8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -571,13 +571,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); - if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) { - kmem_cache_free(f2fs_inode_cachep, fi); - return NULL; - } - /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; + atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); @@ -703,7 +699,6 @@ static void f2fs_i_callback(struct rcu_head *head) static void f2fs_destroy_inode(struct inode *inode) { - percpu_counter_destroy(&F2FS_I(inode)->dirty_pages); call_rcu(&inode->i_rcu, f2fs_i_callback); } From d1c2c35718474b52d19a58acffe7a698d6dc764c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 11:37:14 -0800 Subject: [PATCH 1117/3220] f2fs: call sync_fs when f2fs is idle commit f455c8a5f0a24090e99249eb7280012376adec2c upstream. The sync_fs in f2fs_balance_fs_bg must avoid interrupting current user requests. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4f557e5e789d..b95f07559d90 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -381,12 +381,15 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) else build_free_nids(sbi, false); + if (!is_idle(sbi)) + return; + /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || !available_free_memory(sbi, INO_ENTRIES) || excess_prefree_segs(sbi) || excess_dirty_nats(sbi) || - (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + f2fs_time_over(sbi, CP_TIME)) { if (test_opt(sbi, DATA_FLUSH)) { struct blk_plug plug; From 5b80a5e2bef9d5291c832cd367dd038eeaa753d4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 13:56:04 -0800 Subject: [PATCH 1118/3220] f2fs: detect wrong layout commit 2040fce83fe17763b07c97c1f691da2bb85e4135 upstream. Previous versions of mkfs.f2fs inappropriately allowed small partitions, so f2fs should detect that as well. Refer to this commit in f2fs-tools:
mkfs.f2fs: detect small partition by overprovision ratio and # of segments Reported-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 ++ fs/f2fs/super.c | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 89ab4301ef02..9d44ce83acb2 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -18,6 +18,8 @@ #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ #define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ +#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ + /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ce09191891f8..07f4ba444733 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1453,6 +1453,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int ovp_segments, reserved_segments; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1464,6 +1465,16 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; + ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); + reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); + + if (unlikely(fsmeta < F2FS_MIN_SEGMENTS || + ovp_segments == 0 || reserved_segments == 0)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong layout: check mkfs.f2fs version"); + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; From 80ea4ddbb01c6e1f89432bf6544dcebb2e691cf2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 17:25:32 -0800 Subject: [PATCH 1119/3220] f2fs: free meta pages if sanity check for ckpt fails commit a2125ff7dd1ed3a2a53cdc1f8f9c9cec9cfaa7ab upstream. This fixes missing freeing of meta pages in the error case. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 889317e07122..640f28576e88 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -768,7 +768,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) /* Sanity checking of checkpoint */ if (sanity_check_ckpt(sbi)) - goto fail_no_cp; + goto free_fail_no_cp; if (cur_page == cp1) sbi->cur_cp_pack = 1; @@ -796,6 +796,9 @@ done: f2fs_put_page(cp2, 1); return 0; +free_fail_no_cp: + f2fs_put_page(cp1, 1); + f2fs_put_page(cp2, 1); fail_no_cp: kfree(sbi->ckpt); return -EINVAL; From be4b8492172a2f44774ac81384e60cdc396a6d6f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 7 Dec 2016 16:23:32 -0800 Subject: [PATCH 1120/3220] f2fs: fix to access nullified flush_cmd_control pointer commit 5eba8c5d1fb3af28b2073ba5228d4998196c1bcc upstream. f2fs_sync_file() can race with a read-only remount: remount_ro -> f2fs_readonly -> destroy_flush_cmd_control frees fcc, after which f2fs_issue_flush() finds no fcc pointer! So, this patch doesn't free fcc in this case, but just stops its kernel thread which sends flush commands.
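The shape of the fix deserves a sketch: teardown first clears the thread handle inside the still-allocated control structure, so a racing issuer either finds a live thread or falls back, but never dereferences freed memory. Below is a small userspace model of that pattern under stated assumptions; flush_ctl, thread_live and destroy_flush_ctl() are invented names, with pthread_join() standing in for kthread_stop().

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>

struct flush_ctl {
	pthread_t thread;
	bool thread_live;        /* ~ fcc->f2fs_issue_flush != NULL */
};

static struct flush_ctl *ctl;    /* ~ SM_I(sbi)->cmd_control_info */

static void *flush_thread(void *arg)
{
	return NULL;             /* a real thread would drain flush requests */
}

/* remount_ro passes free_struct = false: stop the thread, keep ctl */
static void destroy_flush_ctl(bool free_struct)
{
	if (ctl && ctl->thread_live) {
		ctl->thread_live = false;        /* issuers now fall back */
		pthread_join(ctl->thread, NULL); /* ~ kthread_stop() */
	}
	if (free_struct) {                       /* only at unmount */
		free(ctl);
		ctl = NULL;
	}
}

int main(void)
{
	ctl = calloc(1, sizeof(*ctl));
	pthread_create(&ctl->thread, NULL, flush_thread, NULL);
	ctl->thread_live = true;

	destroy_flush_ctl(false);   /* remount read-only: ctl survives */
	printf("ctl still valid: %d\n", ctl != NULL);
	destroy_flush_ctl(true);    /* unmount: now it is freed */
	return 0;
}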
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 33 +++++++++++++++++++++++++-------- fs/f2fs/super.c | 5 +++-- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c7eb2ff398ce..3ef2d93ab936 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2150,7 +2150,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); -void destroy_flush_cmd_control(struct f2fs_sb_info *); +void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b95f07559d90..a288de069164 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -486,8 +486,13 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); - wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + if (fcc->f2fs_issue_flush) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->submit_flush); + } else { + llist_del_all(&fcc->issue_list); + atomic_set(&fcc->submit_flush, 0); + } return cmd.ret; } @@ -498,6 +503,11 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc; int err = 0; + if (SM_I(sbi)->cmd_control_info) { + fcc = SM_I(sbi)->cmd_control_info; + goto init_thread; + } + fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; @@ -505,6 +515,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->cmd_control_info = fcc; +init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { @@ -517,14 +528,20 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) return err; } -void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; - if (fcc && fcc->f2fs_issue_flush) - kthread_stop(fcc->f2fs_issue_flush); - kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + if (fcc && fcc->f2fs_issue_flush) { + struct task_struct *flush_thread = fcc->f2fs_issue_flush; + + fcc->f2fs_issue_flush = NULL; + kthread_stop(flush_thread); + } + if (free) { + kfree(fcc); + SM_I(sbi)->cmd_control_info = NULL; + } } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, @@ -2658,7 +2675,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; - destroy_flush_cmd_control(sbi); + destroy_flush_cmd_control(sbi, true); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 07f4ba444733..e6d8d011786c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1103,8 +1103,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if flush_merge is not passed in mount option. 
*/ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { - destroy_flush_cmd_control(sbi); - } else if (!SM_I(sbi)->cmd_control_info) { + clear_opt(sbi, FLUSH_MERGE); + destroy_flush_cmd_control(sbi, false); + } else { err = create_flush_cmd_control(sbi); if (err) goto restore_gc; From 02dcb93a1d1707c7776e92cf302a33178916ce2a Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Sun, 11 Dec 2016 15:35:15 +0800 Subject: [PATCH 1121/3220] f2fs: fix a missing size change in f2fs_setattr commit c0ed4405a99ec9be2a0f062eaafc002d8d26c99f upstream. This patch fixes a missing size change in f2fs_setattr. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5c0500813efe..5808d5c709a7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -678,6 +678,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; + bool size_changed = false; err = inode_change_ok(inode, attr); if (err) @@ -708,6 +709,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } inode->i_mtime = inode->i_ctime = current_time(inode); } + + size_changed = true; } __setattr_copy(inode, attr); @@ -720,8 +723,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } } - /* update attributes only */ - f2fs_mark_inode_dirty_sync(inode, false); + /* file size may changed here */ + f2fs_mark_inode_dirty_sync(inode, size_changed); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); From a886cc1d3a501af4c867c3728d4a5594fc80f4c7 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 3 Aug 2017 13:28:22 +0200 Subject: [PATCH 1122/3220] UPSTREAM: ipv6: fib: Unlink replaced routes from their nodes When a route is deleted, its node pointer is set to NULL to indicate it's no longer linked to its node. Do the same for routes that are replaced. This will later allow us to test if a route is still in the FIB by checking its node pointer instead of its reference count. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller Cherry-pick from: 7483cea79957312e9f8e9cf760a1bc5d6c507113 Bug: 64978549 Change-Id: Ibfa54cf918084138b6b19437e9ef86bfaea5deae --- net/ipv6/ip6_fib.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c23e02a7ccb0..bf3824b59597 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -909,6 +909,7 @@ add: fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->rt6i_nsiblings; + iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (fn->rr_ptr == iter) fn->rr_ptr = NULL; @@ -923,6 +924,7 @@ add: break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->dst.rt6_next; + iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (fn->rr_ptr == iter) fn->rr_ptr = NULL; From 642da1dadeb8bdbeab8acd138afbdc041a723ae9 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 25 Sep 2017 08:55:09 -0700 Subject: [PATCH 1123/3220] FROMLIST: binder: fix use-after-free in binder_transaction() (from https://patchwork.kernel.org/patch/9978801/) User-space normally keeps the node alive when creating a transaction since it has a reference to the target. The local strong ref keeps it alive if the sending process dies before the target process processes the transaction. If the source process is malicious or has a reference counting bug, this can fail.
In this case, when we attempt to decrement the node in the failure path, the node has already been freed. This is fixed by taking a tmpref on the node while constructing the transaction. To avoid re-acquiring the node lock and inner proc lock to increment the proc's tmpref, a helper is used that does the ref increments on both the node and proc. Bug: 66899329 Change-Id: Iad40e1e0bccee88234900494fb52a510a37fe8d7 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 95 ++++++++++++++++++++++++++++------------ 1 file changed, 67 insertions(+), 28 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 7c9f9dde8397..32a2b2f44691 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2744,6 +2744,48 @@ static bool binder_proc_transaction(struct binder_transaction *t, return true; } +/** + * binder_get_node_refs_for_txn() - Get required refs on node for txn + * @node: struct binder_node for which to get refs + * @proc: returns @node->proc if valid + * @error: if no @proc then returns BR_DEAD_REPLY + * + * User-space normally keeps the node alive when creating a transaction + * since it has a reference to the target. The local strong ref keeps it + * alive if the sending process dies before the target process processes + * the transaction. If the source process is malicious or has a reference + * counting bug, relying on the local strong ref can fail. + * + * Since user-space can cause the local strong ref to go away, we also take + * a tmpref on the node to ensure it survives while we are constructing + * the transaction. We also need a tmpref on the proc while we are + * constructing the transaction, so we take that here as well. + * + * Return: The target_node with refs taken or NULL if no @node->proc is NULL. + * Also sets @proc if valid. 
If the @node->proc is NULL indicating that the + * target proc has died, @error is set to BR_DEAD_REPLY + */ +static struct binder_node *binder_get_node_refs_for_txn( + struct binder_node *node, + struct binder_proc **procp, + uint32_t *error) +{ + struct binder_node *target_node = NULL; + + binder_node_inner_lock(node); + if (node->proc) { + target_node = node; + binder_inc_node_nilocked(node, 1, 0, NULL); + binder_inc_node_tmpref_ilocked(node); + node->proc->tmp_ref++; + *procp = node->proc; + } else + *error = BR_DEAD_REPLY; + binder_node_inner_unlock(node); + + return target_node; +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, @@ -2846,43 +2888,35 @@ static void binder_transaction(struct binder_proc *proc, ref = binder_get_ref_olocked(proc, tr->target.handle, true); if (ref) { - binder_inc_node(ref->node, 1, 0, NULL); - target_node = ref->node; + target_node = binder_get_node_refs_for_txn( + ref->node, &target_proc, + &return_error); + } else { + binder_user_error("%d:%d got transaction to invalid handle\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; } binder_proc_unlock(proc); - if (target_node == NULL) { - binder_user_error("%d:%d got transaction to invalid handle\n", - proc->pid, thread->pid); - return_error = BR_FAILED_REPLY; - return_error_param = -EINVAL; - return_error_line = __LINE__; - goto err_invalid_target_handle; - } } else { mutex_lock(&context->context_mgr_node_lock); target_node = context->binder_context_mgr_node; - if (target_node == NULL) { + if (target_node) + target_node = binder_get_node_refs_for_txn( + target_node, &target_proc, + &return_error); + else return_error = BR_DEAD_REPLY; - mutex_unlock(&context->context_mgr_node_lock); - return_error_line = __LINE__; - goto err_no_context_mgr_node; - } - binder_inc_node(target_node, 1, 0, NULL); mutex_unlock(&context->context_mgr_node_lock); } - e->to_node = target_node->debug_id; - binder_node_lock(target_node); - target_proc = target_node->proc; - if (target_proc == NULL) { - binder_node_unlock(target_node); - return_error = BR_DEAD_REPLY; + if (!target_node) { + /* + * return_error is set above + */ + return_error_param = -EINVAL; return_error_line = __LINE__; goto err_dead_binder; } - binder_inner_proc_lock(target_proc); - target_proc->tmp_ref++; - binder_inner_proc_unlock(target_proc); - binder_node_unlock(target_node); + e->to_node = target_node->debug_id; if (security_binder_transaction(proc->tsk, target_proc->tsk) < 0) { return_error = BR_FAILED_REPLY; @@ -3241,6 +3275,8 @@ static void binder_transaction(struct binder_proc *proc, if (target_thread) binder_thread_dec_tmpref(target_thread); binder_proc_dec_tmpref(target_proc); + if (target_node) + binder_dec_node_tmpref(target_node); /* * write barrier to synchronize with initialization * of log entry @@ -3260,6 +3296,8 @@ err_bad_parent: err_copy_data_failed: trace_binder_transaction_failed_buffer_release(t->buffer); binder_transaction_buffer_release(target_proc, t->buffer, offp); + if (target_node) + binder_dec_node_tmpref(target_node); target_node = NULL; t->buffer->transaction = NULL; binder_alloc_free_buf(&target_proc->alloc, t->buffer); @@ -3274,13 +3312,14 @@ err_bad_call_stack: err_empty_call_stack: err_dead_binder: err_invalid_target_handle: -err_no_context_mgr_node: if (target_thread) binder_thread_dec_tmpref(target_thread); if (target_proc) binder_proc_dec_tmpref(target_proc); - if (target_node) + if (target_node) { binder_dec_node(target_node, 
1, 0); + binder_dec_node_tmpref(target_node); + } binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d transaction failed %d/%d, size %lld-%lld line %d\n", From 6b1f845ef3460e721da7aeeecc37e41229285f85 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 09:55:38 -0800 Subject: [PATCH 1124/3220] f2fs: remove wrongly backported code Kconfig and dentry RCU mode fixes. Fixes: c1286ff41c2f610df ("f2fs: backport from 'for-f2fs-4.9'") Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 1 - fs/f2fs/namei.c | 3 --- 2 files changed, 4 deletions(-) diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 1852d99df97b..378c221d68a9 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -2,7 +2,6 @@ config F2FS_FS tristate "F2FS filesystem support" depends on BLOCK select CRYPTO - select KEYS select CRYPTO_CRC32 help F2FS is based on Log-structured File System (LFS), which supports diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 468b2dbe6d34..523bf073642e 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1014,9 +1014,6 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook u32 max_size = inode->i_sb->s_blocksize; int res; - if (!dentry) - return ERR_PTR(-ECHILD); - res = fscrypt_get_encryption_info(inode); if (res) return ERR_PTR(res); From dc45fd9e28345dccdfef354d4ba16e7d406f3301 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 18:24:54 -0800 Subject: [PATCH 1125/3220] f2fs: resolve op and op_flags conflicts commit 70fd76140a6cb63262bd47b68d57b42e889c10ee upstream. This patch backports ("block,fs: use REQ_* flags directly"). Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 11 +++++--- fs/f2fs/data.c | 54 ++++++++++++++++++++----------------- fs/f2fs/f2fs.h | 24 +++++++++++++++-- fs/f2fs/gc.c | 12 ++++++--- fs/f2fs/inline.c | 3 ++- fs/f2fs/node.c | 12 +++++---- fs/f2fs/segment.c | 9 ++++--- fs/f2fs/trace.c | 7 ++--- include/trace/events/f2fs.h | 19 ++++++++----- 9 files changed, 98 insertions(+), 53 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 640f28576e88..2ed785e5ffbb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -64,14 +64,15 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO, + .op = REQ_OP_READ, + .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, }; if (unlikely(!is_meta)) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; repeat: page = f2fs_grab_cache_page(mapping, index, false); if (!page) { @@ -158,13 +159,15 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, + .op = REQ_OP_READ, + .op_flags = sync ?
(REQ_SYNC | REQ_META | REQ_PRIO) : + REQ_RAHEAD, .encrypted_page = NULL, }; struct blk_plug plug; if (unlikely(type == META_POR)) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 283fc9de4762..2eddf1daf995 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -168,15 +168,15 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, return bio; } -static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, - struct bio *bio, enum page_type type) +static inline void __submit_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type) { - if (!is_read_io(rw)) { + if (!is_read_io(bio_op(bio))) { if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } - submit_bio(rw, bio); + submit_bio(0, bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) @@ -186,12 +186,14 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - if (is_read_io(fio->rw)) + if (is_read_io(fio->op)) trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio); else trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - __submit_bio(io->sbi, fio->rw, io->bio, fio->type); + bio_set_op_attrs(io->bio, fio->op, fio->op_flags); + + __submit_bio(io->sbi, io->bio, fio->type); io->bio = NULL; } @@ -257,10 +259,10 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; - if (test_opt(sbi, NOBARRIER)) - io->fio.rw = WRITE_FLUSH | REQ_META | REQ_PRIO; - else - io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; + io->fio.op = REQ_OP_WRITE; + io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; + if (!test_opt(sbi, NOBARRIER)) + io->fio.op_flags |= REQ_FUA; } __submit_merged_bio(io); out: @@ -302,14 +304,15 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw)); + bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->op)); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; } + bio_set_op_attrs(bio, fio->op, fio->op_flags); - __submit_bio(fio->sbi, fio->rw, bio, fio->type); + __submit_bio(fio->sbi, bio, fio->type); return 0; } @@ -318,7 +321,7 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io; - bool is_read = is_read_io(fio->rw); + bool is_read = is_read_io(fio->op); struct page *bio_page; io = is_read ? 
&sbi->read_io : &sbi->write_io[btype]; @@ -335,7 +338,7 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - (io->fio.rw != fio->rw) || + (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || !__same_bdev(sbi, fio->new_blkaddr, io->bio))) __submit_merged_bio(io); alloc_new: @@ -463,7 +466,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) } struct page *get_read_data_page(struct inode *inode, pgoff_t index, - int rw, bool for_write) + int op_flags, bool for_write) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -473,7 +476,8 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = rw, + .op = REQ_OP_READ, + .op_flags = op_flags, .encrypted_page = NULL, }; @@ -541,7 +545,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, READ_SYNC, false); + page = get_read_data_page(inode, index, REQ_SYNC, false); if (IS_ERR(page)) return page; @@ -567,7 +571,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, READ_SYNC, for_write); + page = get_read_data_page(inode, index, REQ_SYNC, for_write); if (IS_ERR(page)) return page; @@ -1145,7 +1149,7 @@ got_it: if (bio && (last_block_in_bio != block_nr - 1 || !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { submit_and_realloc: - __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } if (bio == NULL) { @@ -1154,6 +1158,7 @@ submit_and_realloc: bio = NULL; goto set_error_page; } + bio_set_op_attrs(bio, REQ_OP_READ, 0); } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -1168,7 +1173,7 @@ set_error_page: goto next_page; confused: if (bio) { - __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } unlock_page(page); @@ -1178,7 +1183,7 @@ next_page: } BUG_ON(pages && !list_empty(pages)); if (bio) - __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); return 0; } @@ -1296,7 +1301,8 @@ static int f2fs_write_data_page(struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? 
WRITE_SYNC : WRITE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, }; @@ -1728,14 +1734,14 @@ repeat: err = PTR_ERR(bio); goto fail; } - + bio->bi_rw = READ_SYNC; if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); err = -EFAULT; goto fail; } - __submit_bio(sbi, READ_SYNC, bio, DATA); + __submit_bio(sbi, bio, DATA); lock_page(page); if (unlikely(page->mapping != mapping)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3ef2d93ab936..d0c7decdd3ac 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -24,6 +24,7 @@ #include #include #include +#include #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) @@ -112,6 +113,24 @@ struct f2fs_mount_info { #define F2FS_CLEAR_FEATURE(sb, mask) \ F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) +/* bio stuffs */ +#define REQ_OP_READ READ +#define REQ_OP_WRITE WRITE +#define bio_op(bio) ((bio)->bi_rw & 1) + +static inline void bio_set_op_attrs(struct bio *bio, unsigned op, + unsigned op_flags) +{ + bio->bi_rw = op | op_flags; +} + +static inline int wbc_to_write_flags(struct writeback_control *wbc) +{ + if (wbc->sync_mode == WB_SYNC_ALL) + return REQ_SYNC; + return 0; +} + /** * wq_has_sleeper - check if there are any waiting processes * @wq: wait queue head @@ -746,14 +765,15 @@ enum page_type { struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ - int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ + int op; /* contains REQ_OP_ */ + int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ }; -#define is_read_io(rw) (((rw) & 1) == READ) +#define is_read_io(rw) (rw == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6390d45c1b68..d3a36e4b442c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -550,7 +550,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = READ_SYNC, + .op = REQ_OP_READ, + .op_flags = REQ_SYNC, .encrypted_page = NULL, }; struct dnode_of_data dn; @@ -627,7 +628,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, /* allocate block address */ f2fs_wait_on_page_writeback(dn.node_page, NODE, true); - fio.rw = WRITE_SYNC; + fio.op = REQ_OP_WRITE; + fio.op_flags = REQ_SYNC | REQ_NOIDLE; fio.new_blkaddr = newaddr; f2fs_submit_page_mbio(&fio); @@ -668,7 +670,8 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = WRITE_SYNC, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE, .page = page, .encrypted_page = NULL, }; @@ -767,7 +770,8 @@ next_step: start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, - start_bidx + ofs_in_node, READA, true); + start_bidx + ofs_in_node, REQ_RAHEAD, + true); if (IS_ERR(data_page)) { iput(inode); continue; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index b85987703d1e..fb5d7d1f34aa 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -128,7 +128,8 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) struct f2fs_io_info fio = { .sbi = 
F2FS_I_SB(dn->inode), .type = DATA, - .rw = WRITE_SYNC | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index edacbabb92cf..26a745c544fc 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1068,14 +1068,15 @@ fail: * 0: f2fs_put_page(page, 0) * LOCKED_PAGE or error: f2fs_put_page(page, 1) */ -static int read_node_page(struct page *page, int rw) +static int read_node_page(struct page *page, int op_flags) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct node_info ni; struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, - .rw = rw, + .op = REQ_OP_READ, + .op_flags = op_flags, .page = page, .encrypted_page = NULL, }; @@ -1116,7 +1117,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) if (!apage) return; - err = read_node_page(apage, READA); + err = read_node_page(apage, REQ_RAHEAD); f2fs_put_page(apage, err ? 1 : 0); } @@ -1134,7 +1135,7 @@ repeat: if (!page) return ERR_PTR(-ENOMEM); - err = read_node_page(page, READ_SYNC); + err = read_node_page(page, REQ_SYNC); if (err < 0) { f2fs_put_page(page, 1); return ERR_PTR(err); @@ -1575,7 +1576,8 @@ static int f2fs_write_node_page(struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a288de069164..70aec4a8de13 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -257,7 +257,8 @@ static int __commit_inmem_pages(struct inode *inode, struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = WRITE_SYNC | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .encrypted_page = NULL, }; bool submit_bio = false; @@ -407,6 +408,7 @@ static int __submit_flush_wait(struct block_device *bdev) struct bio *bio = f2fs_bio_alloc(0); int ret; + bio->bi_rw = REQ_OP_WRITE; bio->bi_bdev = bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); bio_put(bio); @@ -1544,7 +1546,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = WRITE_SYNC | REQ_META | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_META | REQ_PRIO, .old_blkaddr = page->index, .new_blkaddr = page->index, .page = page, @@ -1552,7 +1555,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; set_page_writeback(page); f2fs_submit_page_mbio(&fio); diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 562ce0821559..73b4e1d1912a 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -25,11 +25,11 @@ static inline void __print_last_io(void) if (!last_io.len) return; - trace_printk("%3x:%3x %4x %-16s %2x %5x %12x %4x\n", + trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n", last_io.major, last_io.minor, last_io.pid, "----------------", last_io.type, - last_io.fio.rw, + last_io.fio.op, last_io.fio.op_flags, last_io.fio.new_blkaddr, last_io.len); memset(&last_io, 0, sizeof(last_io)); @@ -101,7 +101,8 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) if (last_io.major == major && last_io.minor == minor && last_io.pid == pid && last_io.type == __file_type(inode, pid) && - last_io.fio.rw == fio->rw && + last_io.fio.op == fio->op && + last_io.fio.op_flags == fio->op_flags && 
last_io.fio.new_blkaddr + last_io.len == fio->new_blkaddr) { last_io.len++; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 90d6ad49a9c5..7ad46e8a89e6 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -59,7 +59,8 @@ TRACE_DEFINE_ENUM(CP_DISCARD); #define F2FS_BIO_MASK(t) (t & (READA | WRITE_FLUSH_FUA)) #define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO)) -#define show_bio_type(type) show_bio_base(type), show_bio_extra(type) +#define show_bio_type(op, op_flags) \ + show_bio_base((op|op_flags)), show_bio_extra((op|op_flags)) #define show_bio_base(type) \ __print_symbolic(F2FS_BIO_MASK(type), \ @@ -734,7 +735,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __field(pgoff_t, index) __field(block_t, old_blkaddr) __field(block_t, new_blkaddr) - __field(int, rw) + __field(int, op) + __field(int, op_flags) __field(int, type) ), @@ -744,7 +746,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->index = page->index; __entry->old_blkaddr = fio->old_blkaddr; __entry->new_blkaddr = fio->new_blkaddr; - __entry->rw = fio->rw; + __entry->op = fio->op; + __entry->op_flags = fio->op_flags; __entry->type = fio->type; ), @@ -754,7 +757,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, (unsigned long)__entry->index, (unsigned long long)__entry->old_blkaddr, (unsigned long long)__entry->new_blkaddr, - show_bio_type(__entry->rw), + show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type)) ); @@ -785,7 +788,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_STRUCT__entry( __field(dev_t, dev) - __field(int, rw) + __field(int, op) + __field(int, op_flags) __field(int, type) __field(sector_t, sector) __field(unsigned int, size) @@ -793,7 +797,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->rw = fio->rw; + __entry->op = fio->op; + __entry->op_flags = fio->op_flags; __entry->type = fio->type; __entry->sector = bio->bi_iter.bi_sector; __entry->size = bio->bi_iter.bi_size; @@ -801,7 +806,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_printk("dev = (%d,%d), %s%s, %s, sector = %lld, size = %u", show_dev(__entry), - show_bio_type(__entry->rw), + show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type), (unsigned long long)__entry->sector, __entry->size) From 401c465b81a78fb2a531d37a63804a4eb8ee2027 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 16:41:25 -0800 Subject: [PATCH 1126/3220] f2fs: support async discard based on v4.9 commit 275b66b09e85cf0520dc610dd89706952751a473 upstream. This patch is based on commit 275b66b09e85 (f2fs: support async discard). 
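The core bookkeeping of the async-discard scheme in this patch: every issued discard gets a wait-list entry holding a completion, the bio's end-io callback completes the entry, and the checkpoint path later walks the list and waits on each entry before declaring the space reusable. Here is a compact userspace model of that flow, not the kernel implementation; a POSIX semaphore stands in for struct completion, and all names are illustrative.

#include <pthread.h>
#include <semaphore.h>
#include <stdlib.h>
#include <stdio.h>

struct discard_entry {
	sem_t done;                  /* ~ struct completion */
	int error;
	struct discard_entry *next;
};

static struct discard_entry *wait_list;

/* end-io path: record the result and complete the entry */
static void *device_finishes_discard(void *arg)
{
	struct discard_entry *e = arg;

	e->error = 0;                /* pretend the discard succeeded */
	sem_post(&e->done);          /* ~ complete(&be->event) */
	return NULL;
}

/* issue path: queue the entry, kick off the "I/O", return immediately */
static void issue_discard_async(void)
{
	struct discard_entry *e = calloc(1, sizeof(*e));
	pthread_t t;

	sem_init(&e->done, 0, 0);
	e->next = wait_list;
	wait_list = e;
	pthread_create(&t, NULL, device_finishes_discard, e);
	pthread_detach(t);
}

/* checkpoint path: wait for everything issued so far, then reclaim */
static void wait_all_discards(void)
{
	while (wait_list) {
		struct discard_entry *e = wait_list;

		sem_wait(&e->done);  /* ~ wait_for_completion_io() */
		if (e->error)
			fprintf(stderr, "discard failed: %d\n", e->error);
		wait_list = e->next;
		sem_destroy(&e->done);
		free(e);
	}
}

int main(void)
{
	issue_discard_async();
	issue_discard_async();
	wait_all_discards();         /* checkpoint blocks here, not at issue */
	return 0;
}

The design choice is that latency moves from the issuer to the checkpoint: user I/O never waits on a discard, and the checkpoint provides the single synchronization point before blocks are reused.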
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 +- fs/f2fs/f2fs.h | 3 +- fs/f2fs/segment.c | 183 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 181 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2ed785e5ffbb..d485bea3d6bb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1255,6 +1255,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, prefree_segments(sbi)); flush_sit_entries(sbi, cpc); clear_prefree_segments(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); unblock_operations(sbi); goto out; } @@ -1273,10 +1274,12 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); - if (err) + if (err) { release_discard_addrs(sbi); - else + } else { clear_prefree_segments(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); + } unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d0c7decdd3ac..883d3ab388c1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -127,7 +127,7 @@ static inline void bio_set_op_attrs(struct bio *bio, unsigned op, static inline int wbc_to_write_flags(struct writeback_control *wbc) { if (wbc->sync_mode == WB_SYNC_ALL) - return REQ_SYNC; + return REQ_SYNC | REQ_NOIDLE; return 0; } @@ -2174,6 +2174,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); +void f2fs_wait_all_discard_bio(struct f2fs_sb_info *); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *, bool); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 70aec4a8de13..13bea6e5120e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -26,6 +26,7 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; +static struct kmem_cache *bio_entry_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; @@ -622,6 +623,162 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, + struct bio *bio) +{ + struct list_head *wait_list = &(SM_I(sbi)->wait_list); + struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + + INIT_LIST_HEAD(&be->list); + be->bio = bio; + init_completion(&be->event); + list_add_tail(&be->list, wait_list); + + return be; +} + +void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) +{ + struct list_head *wait_list = &(SM_I(sbi)->wait_list); + struct bio_entry *be, *tmp; + + list_for_each_entry_safe(be, tmp, wait_list, list) { + struct bio *bio = be->bio; + int err; + + wait_for_completion_io(&be->event); + err = be->error; + if (err == -EOPNOTSUPP) + err = 0; + + if (err) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard failed, ret: %d", err); + + bio_put(bio); + list_del(&be->list); + kmem_cache_free(bio_entry_slab, be); + } +} + +static void f2fs_submit_bio_wait_endio(struct bio *bio) +{ + struct bio_entry *be = (struct bio_entry *)bio->bi_private; + + be->error = bio->bi_error; + complete(&be->event); +} + +/* copied from block/blk-lib.c in 4.10-rc1 */ +static int __blkdev_issue_discard(struct 
block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, int flags, + struct bio **biop) +{ + struct request_queue *q = bdev_get_queue(bdev); + struct bio *bio = *biop; + unsigned int granularity; + int op = REQ_WRITE | REQ_DISCARD; + int alignment; + sector_t bs_mask; + + if (!q) + return -ENXIO; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (flags & BLKDEV_DISCARD_SECURE) { + if (!blk_queue_secdiscard(q)) + return -EOPNOTSUPP; + op |= REQ_SECURE; + } + + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; + + while (nr_sects) { + unsigned int req_sects; + sector_t end_sect, tmp; + + /* Make sure bi_size doesn't overflow */ + req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9); + + /** + * If splitting a request, and the next starting sector would be + * misaligned, stop the discard at the previous aligned sector. + */ + end_sect = sector + req_sects; + tmp = end_sect; + if (req_sects < nr_sects && + sector_div(tmp, granularity) != alignment) { + end_sect = end_sect - alignment; + sector_div(end_sect, granularity); + end_sect = end_sect * granularity + alignment; + req_sects = end_sect - sector; + } + + if (bio) { + int ret = submit_bio_wait(0, bio); + bio_put(bio); + if (ret) + return ret; + } + bio = f2fs_bio_alloc(0); + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio_set_op_attrs(bio, op, 0); + + bio->bi_iter.bi_size = req_sects << 9; + nr_sects -= req_sects; + sector = end_sect; + + /* + * We can loop for a long time in here, if someone does + * full device discards (like mkfs). Be nice and allow + * us to schedule out to avoid softlocking if preempt + * is disabled. 
+ */ + cond_resched(); + } + + *biop = bio; + return 0; +} + +/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ +static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + struct bio *bio = NULL; + int err; + + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + + if (sbi->s_ndevs) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } + err = __blkdev_issue_discard(bdev, + SECTOR_FROM_BLOCK(blkstart), + SECTOR_FROM_BLOCK(blklen), + GFP_NOFS, 0, &bio); + if (!err && bio) { + struct bio_entry *be = __add_bio_entry(sbi, bio); + + bio->bi_private = be; + bio->bi_end_io = f2fs_submit_bio_wait_endio; + submit_bio(REQ_SYNC, bio); + } + + return err; +} + #ifdef CONFIG_BLK_DEV_ZONED static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) @@ -655,8 +812,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) return 0; - return blkdev_issue_discard(bdev, sector, nr_sects, - GFP_NOFS, 0); + return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: trace_f2fs_issue_reset_zone(sbi->sb, blkstart); @@ -672,15 +828,12 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, static int __issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t start = SECTOR_FROM_BLOCK(blkstart); - sector_t len = SECTOR_FROM_BLOCK(blklen); - #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_mounted_blkzoned(sbi->sb) && bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return blkdev_issue_discard(bdev, start, len, GFP_NOFS, 0); + return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, @@ -720,8 +873,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, if (len) err = __issue_discard_async(sbi, bdev, start, len); - - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); return err; } @@ -822,11 +973,14 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct list_head *head = &(SM_I(sbi)->discard_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct blk_plug plug; unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason == CP_DISCARD); + blk_start_plug(&plug); + mutex_lock(&dirty_i->seglist_lock); while (1) { @@ -875,6 +1029,8 @@ skip: SM_I(sbi)->nr_discards -= entry->len; kmem_cache_free(discard_entry_slab, entry); } + + blk_finish_plug(&plug); } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -2551,6 +2707,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; INIT_LIST_HEAD(&sm_info->discard_list); + INIT_LIST_HEAD(&sm_info->wait_list); sm_info->nr_discards = 0; sm_info->max_discards = 0; @@ -2694,10 +2851,15 @@ int __init create_segment_manager_caches(void) if (!discard_entry_slab) goto fail; + bio_entry_slab = f2fs_kmem_cache_create("bio_entry", + sizeof(struct bio_entry)); + if (!bio_entry_slab) + goto destroy_discard_entry; + sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", 
sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destroy_discard_entry; + goto destroy_bio_entry; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2707,6 +2869,8 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); +destroy_bio_entry: + kmem_cache_destroy(bio_entry_slab); destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: @@ -2716,6 +2880,7 @@ fail: void destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); + kmem_cache_destroy(bio_entry_slab); kmem_cache_destroy(discard_entry_slab); kmem_cache_destroy(inmem_entry_slab); } From 2ed473dc9155c4e031cefdb75c6edfc3d4a42817 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 13 Dec 2016 17:23:37 +0800 Subject: [PATCH 1127/3220] f2fs: remove unused values in recover_fsync_data commit fed24668482e07421b8e746a4886e7725434050a upstream. This patch removes unused values in recover_fsync_data(). Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 983c35da6bce..4a3c48c24c10 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -552,10 +552,8 @@ next: int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; struct list_head dir_list; - block_t blkaddr; int err; int ret = 0; bool need_writecp = false; @@ -571,8 +569,6 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) /* prevent checkpoint */ mutex_lock(&sbi->cp_mutex); - blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list); if (err || list_empty(&inode_list)) From 1b05b5e173183cfdfc8791c34f04e8de2edb42f7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 13 Dec 2016 18:54:59 +0800 Subject: [PATCH 1128/3220] f2fs: don't cache nat entry if out of memory commit 5c9e418436f3445d7cc4f3ba2964f231a4b33f17 upstream. If we run out of memory in cache_nat_entry, it's better to avoid looping to allocate memory for the cached nat entry; in a low-memory scenario, this avoids unneeded latency on the node-block read path.
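The allocation policy behind this change is a pattern worth isolating: paths that must not lose an update keep a retrying, no-fail allocator, while the node-block read path makes a single attempt and simply skips caching under memory pressure. A minimal sketch of the two-path helper follows, in userspace terms; malloc() stands in for f2fs_kmem_cache_alloc()/kmem_cache_alloc(), and the radix-tree insert is omitted.

#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdbool.h>

struct nat_entry_model {
	long nid;
};

static struct nat_entry_model *grab_nat_entry(long nid, bool no_fail)
{
	struct nat_entry_model *new;

	if (no_fail) {
		/* writer path: keep retrying, as the kernel's
		 * f2fs_kmem_cache_alloc() does, since failure here
		 * would lose a mandatory update */
		do {
			new = malloc(sizeof(*new));
		} while (!new);
	} else {
		/* read path: one attempt; on failure just skip the
		 * cache and take the slower uncached lookup later */
		new = malloc(sizeof(*new));
		if (!new)
			return NULL;
	}
	memset(new, 0, sizeof(*new));
	new->nid = nid;
	return new;
}

int main(void)
{
	struct nat_entry_model *e = grab_nat_entry(42, false);

	printf("cached: %s\n", e ? "yes" : "no (skipped under pressure)");
	free(e);
	return 0;
}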
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 26a745c544fc..b01b01cfc39e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -245,12 +245,24 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) return need_update; } -static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) +static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, + bool no_fail) { struct nat_entry *new; - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); - f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); + if (no_fail) { + new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); + f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); + } else { + new = kmem_cache_alloc(nat_entry_slab, GFP_NOFS); + if (!new) + return NULL; + if (radix_tree_insert(&nm_i->nat_root, nid, new)) { + kmem_cache_free(nat_entry_slab, new); + return NULL; + } + } + memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); nat_reset_flag(new); @@ -267,8 +279,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, e = __lookup_nat_cache(nm_i, nid); if (!e) { - e = grab_nat_entry(nm_i, nid); - node_info_from_raw_nat(&e->ni, ne); + e = grab_nat_entry(nm_i, nid, false); + if (e) + node_info_from_raw_nat(&e->ni, ne); } else { f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || nat_get_blkaddr(e) != @@ -286,7 +299,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { - e = grab_nat_entry(nm_i, ni->nid); + e = grab_nat_entry(nm_i, ni->nid, true); copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { @@ -2155,7 +2168,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - ne = grab_nat_entry(nm_i, nid); + ne = grab_nat_entry(nm_i, nid, true); node_info_from_raw_nat(&ne->ni, &raw_ne); } From a1c90b43fc83873b86b4637ba7293bea5d1b7eec Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 16 Dec 2016 11:18:15 +0300 Subject: [PATCH 1129/3220] f2fs: remove unneeded condition commit 07fe8d44409f88be8f4a4e8f22b47ee709a22657 upstream. We checked that "inode" is not an error pointer earlier so there is no need to check again here. Signed-off-by: Dan Carpenter Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 523bf073642e..ca9e2f85eae8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -321,9 +321,9 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (err) goto err_out; } - if (!IS_ERR(inode) && f2fs_encrypted_inode(dir) && - (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && - !fscrypt_has_permitted_context(dir, inode)) { + if (f2fs_encrypted_inode(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { bool nokey = f2fs_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode); err = nokey ? -ENOKEY : -EPERM; From 75487d02a75b16fc40c9640459f09604185cd77b Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 19 Dec 2016 20:10:48 +0800 Subject: [PATCH 1130/3220] f2fs: fix a problem of using memory after free commit 7855eba4d6102f811b6dd142d6c749f53b591fa3 upstream. 
This patch fixes a use-after-free problem in the function __try_merge_extent_node. Fixes: 0f825ee6e873 ("f2fs: add new interfaces for extent tree") Cc: Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 4db44da7ef69..e02c3d88dc9a 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -352,11 +352,12 @@ static struct extent_node *__try_merge_extent_node(struct inode *inode, } if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { - if (en) - __release_extent_node(sbi, et, prev_ex); next_ex->ei.fofs = ei->fofs; next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; + if (en) + __release_extent_node(sbi, et, prev_ex); + en = next_ex; } From ff9199293b056f202f447e07bb4160581ae48cac Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 20 Dec 2016 11:11:35 +0800 Subject: [PATCH 1131/3220] f2fs: add a case of no need to read a page in write begin commit 746e2403927efbd7c7f2e796314e3cfb3cfabaa4 upstream. If the range we write covers all the valid data in the last page, we do not need to read it. Signed-off-by: Yunlei He [Jaegeuk Kim: nullify the remaining area (fix: xfstests/f2fs/001)] Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2eddf1daf995..d7c20880e53e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1723,6 +1723,11 @@ repeat: if (len == PAGE_SIZE || PageUptodate(page)) return 0; + if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode)) { + zero_user_segment(page, len, PAGE_SIZE); + return 0; + } + if (blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); @@ -1777,7 +1782,7 @@ static int f2fs_write_end(struct file *file, * let generic_perform_write() try to copy data again through copied=0. */ if (!PageUptodate(page)) { - if (unlikely(copied != PAGE_SIZE)) + if (unlikely(copied != len)) copied = 0; else SetPageUptodate(page); From 574da11960668d46dc8fb2d36050bf1aecf782ea Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 20 Dec 2016 21:57:42 +0800 Subject: [PATCH 1132/3220] f2fs: use rb_entry_safe commit ed0b56209fe79a1309653d4b03f5c3147f580f6b upstream. Use rb_entry_safe() instead of open-coding it. Signed-off-by: Geliang Tang Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index e02c3d88dc9a..6ed6424807b6 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -311,28 +311,24 @@ static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, tmp_node = parent; if (parent && fofs > en->ei.fofs) tmp_node = rb_next(parent); - *next_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); tmp_node = parent; if (parent && fofs < en->ei.fofs) tmp_node = rb_prev(parent); - *prev_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); return NULL; lookup_neighbors: if (fofs == en->ei.fofs) { /* lookup prev node for merging backward later */ tmp_node = rb_prev(&en->rb_node); - *prev_ex = tmp_node ?
- rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); } if (fofs == en->ei.fofs + en->ei.len - 1) { /* lookup next node for merging frontward later */ tmp_node = rb_next(&en->rb_node); - *next_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); } return en; } @@ -493,9 +489,8 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (!next_en) { struct rb_node *node = rb_next(&en->rb_node); - next_en = node ? - rb_entry(node, struct extent_node, rb_node) - : NULL; + next_en = rb_entry_safe(node, struct extent_node, + rb_node); } if (parts) From c70e14cdaf37a57230a1f6bd671651d4161ce040 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 22 Dec 2016 11:46:24 +0800 Subject: [PATCH 1133/3220] f2fs: fix a missing discard prefree segments commit 650d3c4e56e1e92ee6e004648c9deb243e5963e0 upstream. If userspace issues an fstrim with a range that does not involve prefree segments, f2fs will reuse these segments without discarding them. This patch fixes it. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 13bea6e5120e..f4e41f997ae3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -996,9 +996,13 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) dirty_i->nr_dirty[PRE] -= end - start; - if (force || !test_opt(sbi, DISCARD)) + if (!test_opt(sbi, DISCARD)) continue; + if (force && start >= cpc->trim_start && + (end - 1) <= cpc->trim_end) + continue; + if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); @@ -2343,8 +2347,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, sit_i->dirty_sentries); out: if (cpc->reason == CP_DISCARD) { + __u64 trim_start = cpc->trim_start; + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) add_discard_addrs(sbi, cpc); + + cpc->trim_start = trim_start; } mutex_unlock(&sit_i->sentry_lock); From d4e5223d818323b401fdefaff076a0e67c44894d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Dec 2016 11:51:32 -0800 Subject: [PATCH 1134/3220] f2fs: reassign new segment for mode=lfs commit 9d52a504db6db9e4e254576130aa867838daff55 upstream. Otherwise we can leave a wrong curseg->next_blkoff behind, resulting in fsck failure. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f4e41f997ae3..e6d3f3d4b028 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1508,9 +1508,6 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) unsigned int old_segno; int i; - if (test_opt(sbi, LFS)) - return; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { curseg = CURSEG_I(sbi, i); old_segno = curseg->segno; From 8ef4f0ca7b4d0cefe0f0b42af6106e6510f39ae6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Dec 2016 12:13:03 -0800 Subject: [PATCH 1135/3220] f2fs: add submit_bio tracepoint commit 554b5125f5cfca6653461fd52bad24d4ef35ec29 upstream. This patch adds a final submit_bio() tracepoint.
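As an aside (not part of the patch), the renamed prepare_* events and the new submit_* events can be watched through the usual tracefs interface, assuming tracefs is mounted at the standard debugfs location:

 # cd /sys/kernel/debug/tracing
 # echo 1 > events/f2fs/f2fs_prepare_write_bio/enable
 # echo 1 > events/f2fs/f2fs_submit_write_bio/enable
 # cat trace_pipe

Every prepare event should eventually be paired with a submit event for the same sector once __submit_bio() actually issues the bio.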
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 14 +++++++----- include/trace/events/f2fs.h | 45 ++++++++++++++++++++++++------------- 2 files changed, 39 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d7c20880e53e..ccb6fb142d56 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -176,6 +176,10 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } + if (is_read_io(bio_op(bio))) + trace_f2fs_submit_read_bio(sbi->sb, type, bio); + else + trace_f2fs_submit_write_bio(sbi->sb, type, bio); submit_bio(0, bio); } @@ -186,13 +190,13 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - if (is_read_io(fio->op)) - trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio); - else - trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - bio_set_op_attrs(io->bio, fio->op, fio->op_flags); + if (is_read_io(fio->op)) + trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); + else + trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio); + __submit_bio(io->sbi, io->bio, fio->type); io->bio = NULL; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 7ad46e8a89e6..217691582dd4 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -779,12 +779,11 @@ DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_mbio, TP_CONDITION(page->mapping) ); -DECLARE_EVENT_CLASS(f2fs__submit_bio, +DECLARE_EVENT_CLASS(f2fs__bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), TP_STRUCT__entry( __field(dev_t, dev) @@ -797,9 +796,9 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->op = fio->op; - __entry->op_flags = fio->op_flags; - __entry->type = fio->type; + __entry->op = bio_op(bio); + __entry->op_flags = bio->bi_rw; + __entry->type = type; __entry->sector = bio->bi_iter.bi_sector; __entry->size = bio->bi_iter.bi_size; ), @@ -812,22 +811,38 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, __entry->size) ); -DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_write_bio, +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_write_bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); -DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_read_bio, +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_read_bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), + + TP_CONDITION(bio) +); + +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_read_bio, + + TP_PROTO(struct super_block *sb, int type, struct bio *bio), + + TP_ARGS(sb, type, bio), + + TP_CONDITION(bio) +); + +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_write_bio, + + TP_PROTO(struct super_block *sb, int type, struct bio *bio), + + TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); From b3fcb70064065bd1aa2083d98a5ec711d099a1ee Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 14 Dec 2016 10:12:56 -0800 Subject: [PATCH 1136/3220] f2fs: support IO alignment for DATA and NODE writes commit 0a595ebaaa6b53a2226d3fee2a2fd616ea5ba378 upstream. 
This patch implements IO alignment by filling dummy blocks in DATA and NODE write bios. If we can guarantee, for example, 32KB or 64KB for such IOs, we can eliminate the underlying dummy page problem which the FTL conducts in order to close MLC or TLC partially written pages. Note that, - it requires "-o mode=lfs". - IO size should be a power of 2 and must not exceed BIO_MAX_PAGES (256). - read IO is still 4KB. - do a checkpoint at fsync, if a dummy NODE page was written. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 55 +++++++++++++++++++++++++++++++++++++++-- fs/f2fs/f2fs.h | 4 ++- fs/f2fs/segment.c | 9 +++++-- fs/f2fs/segment.h | 3 +++ fs/f2fs/super.c | 13 +++++++++- include/linux/f2fs_fs.h | 6 +++++ 6 files changed, 84 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ccb6fb142d56..6c8465d26bb1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -94,6 +94,17 @@ static void f2fs_write_end_io(struct bio *bio) struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); + if (IS_DUMMY_WRITTEN_PAGE(page)) { + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + unlock_page(page); + mempool_free(page, sbi->write_io_dummy); + + if (unlikely(bio->bi_error)) + f2fs_stop_checkpoint(sbi, true); + continue; + } + fscrypt_pullback_bio_page(&page, true); if (unlikely(bio->bi_error)) { @@ -172,10 +183,42 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { if (!is_read_io(bio_op(bio))) { + unsigned int start; + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); + + if (type != DATA && type != NODE) + goto submit_io; + + start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; + start %= F2FS_IO_SIZE(sbi); + + if (start == 0) + goto submit_io; + + /* fill dummy pages */ + for (; start < F2FS_IO_SIZE(sbi); start++) { + struct page *page = + mempool_alloc(sbi->write_io_dummy, + GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL); + f2fs_bug_on(sbi, !page); + + SetPagePrivate(page); + set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); + lock_page(page); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + f2fs_bug_on(sbi, 1); + } + /* + * In the NODE case, we lose next block address chain. So, we + * need to do checkpoint in f2fs_sync_file. + */ + if (type == NODE) + set_sbi_flag(sbi, SBI_NEED_CP); } +submit_io: if (is_read_io(bio_op(bio))) trace_f2fs_submit_read_bio(sbi->sb, type, bio); else @@ -320,13 +363,14 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return 0; } -void f2fs_submit_page_mbio(struct f2fs_io_info *fio) +int f2fs_submit_page_mbio(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io; bool is_read = is_read_io(fio->op); struct page *bio_page; + int err = 0; io = is_read ?
&sbi->read_io : &sbi->write_io[btype]; @@ -347,6 +391,12 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { + if ((fio->type == DATA || fio->type == NODE) && + fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { + err = -EAGAIN; + dec_page_count(sbi, WB_DATA_TYPE(bio_page)); + goto out_fail; + } io->bio = __bio_alloc(sbi, fio->new_blkaddr, BIO_MAX_PAGES, is_read); io->fio = *fio; @@ -360,9 +410,10 @@ alloc_new: io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); - +out_fail: up_write(&io->io_rwsem); trace_f2fs_submit_page_mbio(fio->page, fio); + return err; } static void __set_data_blkaddr(struct dnode_of_data *dn) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 883d3ab388c1..f9a739ffca0f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -859,6 +859,8 @@ struct f2fs_sb_info { struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ + int write_io_size_bits; /* Write IO size bits */ + mempool_t *write_io_dummy; /* Dummy pages */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -2241,7 +2243,7 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, struct page *, nid_t, enum page_type, int); void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); -void f2fs_submit_page_mbio(struct f2fs_io_info *); +int f2fs_submit_page_mbio(struct f2fs_io_info *); struct block_device *f2fs_target_device(struct f2fs_sb_info *, block_t, struct bio *); int f2fs_target_device_index(struct f2fs_sb_info *, block_t); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e6d3f3d4b028..a7bb97826445 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1684,15 +1684,20 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio->page, fio->type); + int err; if (fio->type == NODE || fio->type == DATA) mutex_lock(&fio->sbi->wio_mutex[fio->type]); - +reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type); /* writeout dirty page into bdev */ - f2fs_submit_page_mbio(fio); + err = f2fs_submit_page_mbio(fio); + if (err == -EAGAIN) { + fio->old_blkaddr = fio->new_blkaddr; + goto reallocate; + } if (fio->type == NODE || fio->type == DATA) mutex_unlock(&fio->sbi->wio_mutex[fio->type]); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 9d44ce83acb2..08f1455c812c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -186,9 +186,12 @@ struct segment_allocation { * the page is atomically written, and it is in inmem_pages list. 
*/ #define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) +#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) #define IS_ATOMIC_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) +#define IS_DUMMY_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) struct inmem_pages { struct list_head list; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e6d8d011786c..fb9f6c09fa11 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1764,6 +1764,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) FDEV(i).total_segments, FDEV(i).start_blk, FDEV(i).end_blk); } + f2fs_msg(sbi->sb, KERN_INFO, + "IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi)); return 0; } @@ -1881,12 +1883,19 @@ try_onemore: if (err) goto free_options; + if (F2FS_IO_SIZE(sbi) > 1) { + sbi->write_io_dummy = + mempool_create_page_pool(F2FS_IO_SIZE(sbi) - 1, 0); + if (!sbi->write_io_dummy) + goto free_options; + } + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); - goto free_options; + goto free_io_dummy; } err = get_valid_checkpoint(sbi); @@ -2104,6 +2113,8 @@ free_devices: free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); +free_io_dummy: + mempool_destroy(sbi->write_io_dummy); free_options: destroy_percpu_info(sbi); kfree(options); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 3e5972ef5019..cf54a312993f 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -36,6 +36,12 @@ #define F2FS_NODE_INO(sbi) (sbi->node_ino_num) #define F2FS_META_INO(sbi) (sbi->meta_ino_num) +#define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */ +#define F2FS_IO_SIZE_KB(sbi) (1 << ((sbi)->write_io_size_bits + 2)) /* KB */ +#define F2FS_IO_SIZE_BYTES(sbi) (1 << ((sbi)->write_io_size_bits + 12)) /* B */ +#define F2FS_IO_SIZE_BITS(sbi) ((sbi)->write_io_size_bits) /* power of 2 */ +#define F2FS_IO_SIZE_MASK(sbi) (F2FS_IO_SIZE(sbi) - 1) + /* This flag is used by node and meta inodes, and by recovery */ #define GFP_F2FS_ZERO (GFP_NOFS | __GFP_ZERO) #define GFP_F2FS_HIGH_ZERO (GFP_NOFS | __GFP_ZERO | __GFP_HIGHMEM) From 7b214391b292efcfe6d55202498766f20875f5b1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Dec 2016 17:09:19 -0800 Subject: [PATCH 1137/3220] f2fs: get io size bit from mount option commit ec91538dccd44329ad83d3aae1aa6a8389b5c75f upstream. This patch adds to set io_size_bits from mount option. Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 2 ++ fs/f2fs/super.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 753dd4f96afe..d99faced79cb 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -157,6 +157,8 @@ data_flush Enable data flushing before checkpoint in order to mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. +io_bits=%u Set the bit size of write IO requests. It should be set + with "mode=lfs". 
================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fb9f6c09fa11..3b169927408e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -101,6 +101,7 @@ enum { Opt_noinline_data, Opt_data_flush, Opt_mode, + Opt_io_size_bits, Opt_fault_injection, Opt_lazytime, Opt_nolazytime, @@ -133,6 +134,7 @@ static match_table_t f2fs_tokens = { {Opt_noinline_data, "noinline_data"}, {Opt_data_flush, "data_flush"}, {Opt_mode, "mode=%s"}, + {Opt_io_size_bits, "io_bits=%u"}, {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, @@ -535,6 +537,17 @@ static int parse_options(struct super_block *sb, char *options) } kfree(name); break; + case Opt_io_size_bits: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg > __ilog2_u32(BIO_MAX_PAGES)) { + f2fs_msg(sb, KERN_WARNING, + "Not support %d, larger than %d", + 1 << arg, BIO_MAX_PAGES); + return -EINVAL; + } + sbi->write_io_size_bits = arg; + break; case Opt_fault_injection: if (args->from && match_int(args, &arg)) return -EINVAL; @@ -558,6 +571,13 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } + + if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { + f2fs_msg(sb, KERN_ERR, + "Should set mode=lfs with %uKB-sized IO", + F2FS_IO_SIZE_KB(sbi)); + return -EINVAL; + } return 0; } @@ -918,6 +938,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (test_opt(sbi, LFS)) seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); + if (F2FS_IO_SIZE_BITS(sbi)) + seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); return 0; } From 97a43c7059c1323ac34693b1c7539e7a42740ee6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 28 Dec 2016 13:55:09 -0800 Subject: [PATCH 1138/3220] f2fs: show the max number of atomic operations commit 26a28a0c1eb756ba18bfb1f93309c4b4406b9cd9 upstream. This patch adds to show the max number of atomic operations which are conducting concurrently. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 7 +++++++ fs/f2fs/f2fs.h | 17 +++++++++++++++++ fs/f2fs/file.c | 8 ++++++-- fs/f2fs/segment.c | 1 + 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fbd5184140d0..29cdf0c1da1d 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -50,6 +50,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); + si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; @@ -256,6 +258,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_dir); seq_printf(s, " - Orphan Inode: %u\n", si->orphans); + seq_printf(s, " - Atomic write count: %4d (Max. 
%4d)\n", + si->aw_cnt, si->max_aw_cnt); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -414,6 +418,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->inplace_count, 0); + atomic_set(&sbi->aw_cnt, 0); + atomic_set(&sbi->max_aw_cnt, 0); + mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); mutex_unlock(&f2fs_stat_mutex); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f9a739ffca0f..19e054b1c4f8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -951,6 +951,8 @@ struct f2fs_sb_info { atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ + atomic_t aw_cnt; /* # of atomic writes */ + atomic_t max_aw_cnt; /* max # of atomic writes */ int bg_gc; /* background gc calls */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif @@ -2303,6 +2305,7 @@ struct f2fs_stat_info { int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; int inline_xattr, inline_inode, inline_dir, orphans; + int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -2374,6 +2377,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) ((sbi)->block_count[(curseg)->alloc_type]++) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) +#define stat_inc_atomic_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)); +#define stat_dec_atomic_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)); +#define stat_update_max_atomic_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ + } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ @@ -2427,6 +2441,9 @@ void f2fs_destroy_root_stats(void); #define stat_dec_inline_inode(inode) #define stat_inc_inline_dir(inode) #define stat_dec_inline_dir(inode) +#define stat_inc_atomic_write(inode) +#define stat_dec_atomic_write(inode) +#define stat_update_max_atomic_write(inode) #define stat_inc_seg_type(sbi, curseg) #define stat_inc_block_count(sbi, curseg) #define stat_inc_inplace_blocks(sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5808d5c709a7..d7eacef08797 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1546,6 +1546,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) clear_inode_flag(inode, FI_ATOMIC_FILE); out: + stat_inc_atomic_write(inode); + stat_update_max_atomic_write(inode); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1575,9 +1577,11 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) set_inode_flag(inode, FI_ATOMIC_FILE); goto err_out; } + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + stat_dec_atomic_write(inode); + } else { + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } - - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); err_out: inode_unlock(inode); mnt_drop_write_file(filp); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a7bb97826445..353ec85b3835 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -243,6 +243,7 @@ void drop_inmem_pages(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); 
clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_write(inode); mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); From dde5a6f8fd97e05a437995f121cc2f50353bef9c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 28 Dec 2016 17:31:15 -0800 Subject: [PATCH 1139/3220] f2fs: don't allow encrypted operations without keys commit 363fa4e078cbdc97a172c19d19dc04b41b52ebc8 upstream. This patch fixes the renaming bug on encrypted filenames, which was pointed out by (ext4: don't allow encrypted operations without keys) Cc: Theodore Ts'o Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index ca9e2f85eae8..db3079cd665d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -660,6 +660,12 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (f2fs_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && !fscrypt_has_permitted_context(new_dir, old_inode)) { err = -EPERM; @@ -840,6 +846,12 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int old_nlink = 0, new_nlink = 0; int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (f2fs_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && (old_dir != new_dir) && (!fscrypt_has_permitted_context(new_dir, old_inode) || From 66e2310bf98024a658fcdd10d52f827d302ef91e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 3 Jan 2017 17:19:30 -0800 Subject: [PATCH 1140/3220] f2fs: drop exist_data for inline_data when truncated to 0 commit bb95d9ab2a9d4afd03b59a603cccb2c601f68b78 upstream. A test program gets two different SEEK_DATA results for a newly created file and an existing file on an f2fs filesystem.
F2FS filesystem, (the first "test1" is a new file) SEEK_DATA size != 0 (offset = 8192) SEEK_DATA size != 0 (offset = 4096) PNFS filesystem, (the first "test1" is a new file) SEEK_DATA size != 0 (offset = 4096) SEEK_DATA size != 0 (offset = 4096) int main(int argc, char **argv) { char *filename = argv[1]; int offset = 1, i = 0, fd = -1; if (argc < 2) { printf("Usage: %s f2fsfilename\n", argv[0]); return -1; } /* if (!access(filename, F_OK) || errno != ENOENT) { printf("Needs a new file for test, %m\n"); return -1; }*/ fd = open(filename, O_RDWR | O_CREAT, 0777); if (fd < 0) { printf("Create test file %s failed, %m\n", filename); return -1; } for (i = 0; i < 20; i++) { offset = 1 << i; ftruncate(fd, 0); lseek(fd, offset, SEEK_SET); write(fd, "test", 5); /* Get the alloc size by seek data equal zero*/ if (lseek(fd, 0, SEEK_DATA)) { printf("SEEK_DATA size != 0 (offset = %d)\n", offset); break; } } close(fd); return 0; } Reported-and-Tested-by: Kinglong Mee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d7eacef08797..9da13847cda4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -571,6 +571,8 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) if (f2fs_has_inline_data(inode)) { if (truncate_inline_inode(ipage, from)) set_page_dirty(ipage); + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); f2fs_put_page(ipage, 1); truncate_page = true; goto out; From 132263ddad34592c1c469f5ed1e192c6df6b9a13 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Dec 2016 14:07:53 -0800 Subject: [PATCH 1141/3220] f2fs: relax async discard commands more commit 4e6a8d9b224f886362ea6e8f6046b541437c944f upstream. This patch relaxes async discard commands to avoid waiting for their end_io during checkpoint. Instead of waiting for them during checkpoint, the wait is done when actually reusing the blocks. Test on an initial partition of an NVMe drive.
# time fstrim /mnt/test Before : 6.158s After : 4.822s Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 ++----- fs/f2fs/f2fs.h | 4 +++- fs/f2fs/segment.c | 24 +++++++++++++++++++----- fs/f2fs/super.c | 3 +++ 4 files changed, 27 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d485bea3d6bb..2ed785e5ffbb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1255,7 +1255,6 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, prefree_segments(sbi)); flush_sit_entries(sbi, cpc); clear_prefree_segments(sbi, cpc); - f2fs_wait_all_discard_bio(sbi); unblock_operations(sbi); goto out; } @@ -1274,12 +1273,10 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); - if (err) { + if (err) release_discard_addrs(sbi); - } else { + else clear_prefree_segments(sbi, cpc); - f2fs_wait_all_discard_bio(sbi); - } unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 19e054b1c4f8..3409392dde9c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -250,6 +250,8 @@ struct discard_entry { struct bio_entry { struct list_head list; + block_t lstart; + block_t len; struct bio *bio; struct completion event; int error; @@ -2178,7 +2180,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); -void f2fs_wait_all_discard_bio(struct f2fs_sb_info *); +void f2fs_wait_discard_bio(struct f2fs_sb_info *, block_t); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *, bool); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 353ec85b3835..fa3d4f8db389 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -625,20 +625,23 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) } static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, - struct bio *bio) + struct bio *bio, block_t lstart, block_t len) { struct list_head *wait_list = &(SM_I(sbi)->wait_list); struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); INIT_LIST_HEAD(&be->list); be->bio = bio; + be->lstart = lstart; + be->len = len; init_completion(&be->event); list_add_tail(&be->list, wait_list); return be; } -void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) +/* This should be covered by global mutex, &sit_i->sentry_lock */ +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct list_head *wait_list = &(SM_I(sbi)->wait_list); struct bio_entry *be, *tmp; @@ -647,7 +650,15 @@ void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) struct bio *bio = be->bio; int err; - wait_for_completion_io(&be->event); + if (!completion_done(&be->event)) { + if ((be->lstart <= blkaddr && + blkaddr < be->lstart + be->len) || + blkaddr == NULL_ADDR) + wait_for_completion_io(&be->event); + else + continue; + } + err = be->error; if (err == -EOPNOTSUPP) err = 0; @@ -756,6 +767,7 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { struct bio *bio = NULL; + block_t lblkstart = blkstart; int err; trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); 
@@ -770,13 +782,13 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, SECTOR_FROM_BLOCK(blklen), GFP_NOFS, 0, &bio); if (!err && bio) { - struct bio_entry *be = __add_bio_entry(sbi, bio); + struct bio_entry *be = __add_bio_entry(sbi, bio, + lblkstart, blklen); bio->bi_private = be; bio->bi_end_io = f2fs_submit_bio_wait_endio; submit_bio(REQ_SYNC, bio); } - return err; } @@ -1655,6 +1667,8 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + f2fs_wait_discard_bio(sbi, *new_blkaddr); + /* * __add_sum_entry should be resided under the curseg_mutex * because, this function updates a summary entry in the diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3b169927408e..84d5686c4aa4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -770,6 +770,9 @@ static void f2fs_put_super(struct super_block *sb) write_checkpoint(sbi, &cpc); } + /* be sure to wait for any on-going discard commands */ + f2fs_wait_discard_bio(sbi, NULL_ADDR); + /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); From d051ccbd1bfcd04283be206edbe6e5e9dbfe6cb8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Dec 2016 16:58:54 -0800 Subject: [PATCH 1142/3220] f2fs: avoid needless checkpoint in f2fs_trim_fs commit 0333ad4e4f49e14217256e1db1134a70cf60b2de upstream. f2fs_trim_fs() doesn't need to do a checkpoint if there are only newly allocated data blocks, which don't change critical checkpoint data such as nat and sit entries. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2ed785e5ffbb..886b96c12c31 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1249,14 +1249,15 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_merged_bios(sbi); /* this is the case of multiple fstrims without any changes */ - if (cpc->reason == CP_DISCARD && !is_sbi_flag_set(sbi, SBI_IS_DIRTY)) { - f2fs_bug_on(sbi, NM_I(sbi)->dirty_nat_cnt); - f2fs_bug_on(sbi, SIT_I(sbi)->dirty_sentries); - f2fs_bug_on(sbi, prefree_segments(sbi)); - flush_sit_entries(sbi, cpc); - clear_prefree_segments(sbi, cpc); - unblock_operations(sbi); - goto out; + if (cpc->reason == CP_DISCARD) { + if (NM_I(sbi)->dirty_nat_cnt == 0 && + SIT_I(sbi)->dirty_sentries == 0 && + prefree_segments(sbi) == 0) { + flush_sit_entries(sbi, cpc); + clear_prefree_segments(sbi, cpc); + unblock_operations(sbi); + goto out; + } } /* From 556f5ba3497207ee447f1a163133278e2a8f8376 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Dec 2016 22:06:15 -0800 Subject: [PATCH 1143/3220] f2fs: return fs_trim if there is no candidate commit 25290fa5591d81767713db304e0d567bf991786f upstream. If there is no candidate for a discard command during f2fs_trim_fs, let's return without a checkpoint.
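For reference, this trim path is reached from userspace through the FITRIM ioctl, which the VFS routes to f2fs_trim_fs(). A minimal sketch of the caller side (the mount point path is illustrative, not from the patch):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range = {
		.start = 0,
		.len = (__u64)-1,	/* trim the whole filesystem */
		.minlen = 0,
	};
	int fd = open("/mnt/f2fs", O_RDONLY);	/* hypothetical mount point */

	if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	/* the kernel writes back how many bytes were trimmed */
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	return 0;
}

With this patch, a second run over an unchanged range returns quickly because exist_trim_candidates() finds nothing to discard and the checkpoint is skipped.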
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 28 +++++++++++++++++++++++----- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 886b96c12c31..fbf04d4d7964 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1250,6 +1250,11 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* this is the case of multiple fstrims without any changes */ if (cpc->reason == CP_DISCARD) { + if (!exist_trim_candidates(sbi, cpc)) { + unblock_operations(sbi); + goto out; + } + if (NM_I(sbi)->dirty_nat_cnt == 0 && SIT_I(sbi)->dirty_sentries == 0 && prefree_segments(sbi) == 0) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3409392dde9c..3eb53e3a8eae 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2186,6 +2186,7 @@ void release_discard_addrs(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); +bool exist_trim_candidates(struct f2fs_sb_info *, struct cp_control *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); void update_meta_page(struct f2fs_sb_info *, void *, block_t); void write_meta_page(struct f2fs_sb_info *, struct page *); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fa3d4f8db389..12f8d5ab7ccf 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -914,7 +914,8 @@ done: SM_I(sbi)->nr_discards += end - start; } -static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) +static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, + bool check_only) { int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); int max_blocks = sbi->blocks_per_seg; @@ -928,12 +929,12 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) int i; if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) - return; + return false; if (!force) { if (!test_opt(sbi, DISCARD) || !se->valid_blocks || SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards) - return; + return false; } /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ @@ -951,8 +952,12 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) && (end - start) < cpc->trim_minlen) continue; + if (check_only) + return true; + __add_discard_entry(sbi, cpc, se, start, end); } + return false; } void release_discard_addrs(struct f2fs_sb_info *sbi) @@ -1533,6 +1538,19 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; +bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + __u64 trim_start = cpc->trim_start; + + mutex_lock(&SIT_I(sbi)->sentry_lock); + for (; trim_start <= cpc->trim_end; trim_start++) + if (add_discard_addrs(sbi, cpc, true)) + break; + mutex_unlock(&SIT_I(sbi)->sentry_lock); + + return trim_start <= cpc->trim_end; +} + int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); @@ -2329,7 +2347,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* add discard candidates */ if (cpc->reason != CP_DISCARD) { cpc->trim_start = segno; - add_discard_addrs(sbi, cpc); + add_discard_addrs(sbi, cpc, false); } if (to_journal) { @@ -2367,7 +2385,7 @@ out: __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) - 
add_discard_addrs(sbi, cpc); + add_discard_addrs(sbi, cpc, false); cpc->trim_start = trim_start; } From 7129702a487ad1a4689bdf41c3657f829472c18c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:49:42 +0800 Subject: [PATCH 1144/3220] f2fs: clean up with list_{first, last}_entry commit 939afa943c5290a3b92f01612a792af17bc98115 upstream. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/data.c | 4 ++-- fs/f2fs/node.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index fbf04d4d7964..45ef3b6bfb04 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -892,7 +892,7 @@ retry: F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); return 0; } - fi = list_entry(head->next, struct f2fs_inode_info, dirty_list); + fi = list_first_entry(head, struct f2fs_inode_info, dirty_list); inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[type]); if (inode) { @@ -925,7 +925,7 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) spin_unlock(&sbi->inode_lock[DIRTY_META]); return 0; } - fi = list_entry(head->next, struct f2fs_inode_info, + fi = list_first_entry(head, struct f2fs_inode_info, gdirty_list); inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[DIRTY_META]); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6c8465d26bb1..f1001c871cb6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1144,7 +1144,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, prefetchw(&page->flags); if (pages) { - page = list_entry(pages->prev, struct page, lru); + page = list_last_entry(pages, struct page, lru); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) @@ -1262,7 +1262,7 @@ static int f2fs_read_data_pages(struct file *file, struct list_head *pages, unsigned nr_pages) { struct inode *inode = file->f_mapping->host; - struct page *page = list_entry(pages->prev, struct page, lru); + struct page *page = list_last_entry(pages, struct page, lru); trace_f2fs_readpages(inode, page, nr_pages); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e7997e240366..9278b21ee073 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -174,7 +174,7 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_entry(nm_i->nid_list[FREE_NID_LIST].next, + fnid = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], struct free_nid, list); *nid = fnid->nid; spin_unlock(&nm_i->nid_list_lock); From dc8b8cea1e70b6649498a9850745c394e39d290c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:50:26 +0800 Subject: [PATCH 1145/3220] f2fs: introduce FI_ATOMIC_COMMIT commit 5fe457430e554a2f5188f13c1a2e36ad845640c5 upstream. This patch introduces a new flag to indicate that an inode is in the state of committing atomic writes, so that we can keep the atomic write status of the inode during the commit and skip GCing pages of the atomic write inode; that avoids randomly GCed data being mixed into the current transaction, so transaction isolation can be kept.
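For context, the atomic-write sequence that toggles these flags is driven from userspace roughly as below (a hedged sketch; the ioctl numbers are restated from fs/f2fs/f2fs.h of this era since no uapi header exported them at the time):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define F2FS_IOCTL_MAGIC		0xf5
#define F2FS_IOC_START_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 1)
#define F2FS_IOC_COMMIT_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 2)

/* update a file so that either all writes land or none do */
int atomic_update(int fd, const void *buf, size_t len)
{
	if (ioctl(fd, F2FS_IOC_START_ATOMIC_WRITE) < 0)	/* sets FI_ATOMIC_FILE */
		return -1;
	if (write(fd, buf, len) != (ssize_t)len)	/* data is staged in inmem pages */
		return -1;
	/* FI_ATOMIC_COMMIT is held inside commit_inmem_pages() during this call */
	return ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE);
}

GC checks f2fs_is_atomic_file() and now skips such inodes entirely, so the staged transaction is never mixed with migrated blocks.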
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/file.c | 11 ++++++----- fs/f2fs/gc.c | 6 ++++++ fs/f2fs/segment.c | 10 +++++++--- 5 files changed, 26 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f1001c871cb6..4ac72a3f920a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2017,7 +2017,7 @@ static int f2fs_set_data_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); - if (f2fs_is_atomic_file(inode)) { + if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { register_inmem_page(inode, page); return 1; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3eb53e3a8eae..807855d37c63 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1706,6 +1706,7 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ + FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ FI_VOLATILE_FILE, /* indicate volatile file */ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ @@ -1895,6 +1896,11 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } +static inline bool f2fs_is_commit_atomic_write(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_ATOMIC_COMMIT); +} + static inline bool f2fs_is_volatile_file(struct inode *inode) { return is_inode_flag_set(inode, FI_VOLATILE_FILE); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9da13847cda4..e4e5d76d80b0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1573,14 +1573,15 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) goto err_out; if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); ret = commit_inmem_pages(inode); - if (ret) { - set_inode_flag(inode, FI_ATOMIC_FILE); + if (ret) goto err_out; - } + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); - stat_dec_atomic_write(inode); + if (!ret) { + clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_write(inode); + } } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d3a36e4b442c..7f0c3e02408c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -569,6 +569,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; + if (f2fs_is_atomic_file(inode)) + goto out; + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -661,6 +664,9 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; + if (f2fs_is_atomic_file(inode)) + goto out; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 12f8d5ab7ccf..6a870677d58a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -242,12 +242,12 @@ void drop_inmem_pages(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); - clear_inode_flag(inode, FI_ATOMIC_FILE); - stat_dec_atomic_write(inode); - mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); mutex_unlock(&fi->inmem_lock); + + clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_write(inode); } static int __commit_inmem_pages(struct inode *inode, @@ -316,6 +316,8 @@ int commit_inmem_pages(struct 
inode *inode) f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); + set_inode_flag(inode, FI_ATOMIC_COMMIT); + mutex_lock(&fi->inmem_lock); err = __commit_inmem_pages(inode, &revoke_list); if (err) { @@ -337,6 +339,8 @@ int commit_inmem_pages(struct inode *inode) } mutex_unlock(&fi->inmem_lock); + clear_inode_flag(inode, FI_ATOMIC_COMMIT); + f2fs_unlock_op(sbi); return err; } From dd5804b2146ef7a3c8e4017be426e07ec25a49fa Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:51:01 +0800 Subject: [PATCH 1146/3220] f2fs: check in-memory block bitmap commit 355e78913c0d57492076d545b6f44b94fec2bf6b upstream. This patch adds a mirror for valid block bitmap, and use it to detect in-memory bitmap corruption which may be caused by bit-transition of cache or memory overflow. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 32 ++++++++++++++++++++++++++++++-- fs/f2fs/segment.h | 6 ++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6a870677d58a..aae1c2ea7a1d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1101,14 +1101,32 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) + if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) { +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir)) + f2fs_bug_on(sbi, 1); + else + WARN_ON(1); +#else f2fs_bug_on(sbi, 1); +#endif + } if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } else { - if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) + if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { +#ifdef CONFIG_F2FS_CHECK_FS + if (!f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir)) + f2fs_bug_on(sbi, 1); + else + WARN_ON(1); +#else f2fs_bug_on(sbi, 1); +#endif + } if (f2fs_discard_en(sbi) && f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; @@ -2432,6 +2450,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) !sit_i->sentries[start].ckpt_valid_map) return -ENOMEM; +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sentries[start].cur_valid_map_mir + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].cur_valid_map_mir) + return -ENOMEM; +#endif + if (f2fs_discard_en(sbi)) { sit_i->sentries[start].discard_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -2861,6 +2886,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) if (sit_i->sentries) { for (start = 0; start < MAIN_SEGS(sbi); start++) { kfree(sit_i->sentries[start].cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(sit_i->sentries[start].cur_valid_map_mir); +#endif kfree(sit_i->sentries[start].ckpt_valid_map); kfree(sit_i->sentries[start].discard_map); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 08f1455c812c..9af95194db06 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -164,6 +164,9 @@ struct seg_entry { unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ +#ifdef CONFIG_F2FS_CHECK_FS + unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */ +#endif /* * # of valid blocks and the validity bitmap stored in the the last * checkpoint pack. This information is used by the SSR mode. 
@@ -320,6 +323,9 @@ static inline void seg_info_from_raw_sit(struct seg_entry *se, se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs); memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#ifdef CONFIG_F2FS_CHECK_FS + memcpy(se->cur_valid_map_mir, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#endif se->type = GET_SIT_TYPE(rs); se->mtime = le64_to_cpu(rs->mtime); } From 5c53448ff2e90f63fbb7bc955fe67e66d5974dc1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:52:01 +0800 Subject: [PATCH 1147/3220] f2fs: check in-memory nat version bitmap commit 599a09b2c1ac222e6aad0c22515d1ccde7c3b702 upstream. This patch adds a mirror for nat version bitmap, and use it to detect in-memory bitmap corruption which may be caused by bit-transition of cache or memory overflow. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/node.c | 11 +++++++++++ fs/f2fs/node.h | 15 +++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 807855d37c63..d4783d9cf4e0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -607,6 +607,9 @@ struct f2fs_nm_info { /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ +#ifdef CONFIG_F2FS_CHECK_FS + char *nat_bitmap_mir; /* NAT bitmap mirror */ +#endif int bitmap_size; /* bitmap size */ }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b01b01cfc39e..bc67dc323f7e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2366,6 +2366,14 @@ static int init_node_manager(struct f2fs_sb_info *sbi) GFP_KERNEL); if (!nm_i->nat_bitmap) return -ENOMEM; + +#ifdef CONFIG_F2FS_CHECK_FS + nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size, + GFP_KERNEL); + if (!nm_i->nat_bitmap_mir) + return -ENOMEM; +#endif + return 0; } @@ -2440,6 +2448,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) up_write(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(nm_i->nat_bitmap_mir); +#endif sbi->nm_info = NULL; kfree(nm_i); } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 9278b21ee073..29ff783eb9c3 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -186,6 +186,12 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr) { struct f2fs_nm_info *nm_i = NM_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(nm_i->nat_bitmap, nm_i->nat_bitmap_mir, + nm_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size); } @@ -203,6 +209,12 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) (seg_off << sbi->log_blocks_per_seg << 1) + (block_off & (sbi->blocks_per_seg - 1))); +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_bit(block_off, nm_i->nat_bitmap) != + f2fs_test_bit(block_off, nm_i->nat_bitmap_mir)) + f2fs_bug_on(sbi, 1); +#endif + if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -228,6 +240,9 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) unsigned int block_off = NAT_BLOCK_OFFSET(start_nid); f2fs_change_bit(block_off, nm_i->nat_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, nm_i->nat_bitmap_mir); +#endif } static inline nid_t ino_of_node(struct page *node_page) From fb2e2f44afe51eb5b13cb15c7d722296224d6cde Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:52:34 +0800 Subject: [PATCH 1148/3220] f2fs: check in-memory sit version bitmap commit 
ae27d62e6befd3cac4ffa702e644cc52019642e8 upstream. This patch adds a mirror for sit version bitmap, and use it to detect in-memory bitmap corruption which may be caused by bit-transition of cache or memory overflow. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 ++++++++++++---- fs/f2fs/segment.h | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index aae1c2ea7a1d..c39bbffb0cac 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2421,7 +2421,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; - char *src_bitmap, *dst_bitmap; + char *src_bitmap; unsigned int bitmap_size; /* allocate memory for SIT information */ @@ -2483,17 +2483,22 @@ static int build_sit_info(struct f2fs_sb_info *sbi) bitmap_size = __bitmap_size(sbi, SIT_BITMAP); src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); - dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); - if (!dst_bitmap) + sit_i->sit_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap) return -ENOMEM; +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sit_bitmap_mir = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap_mir) + return -ENOMEM; +#endif + /* init SIT information */ sit_i->s_ops = &default_salloc_ops; sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; sit_i->written_valid_blocks = 0; - sit_i->sit_bitmap = dst_bitmap; sit_i->bitmap_size = bitmap_size; sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; @@ -2901,6 +2906,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = NULL; kfree(sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(sit_i->sit_bitmap_mir); +#endif kfree(sit_i); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 9af95194db06..5cb5755c75d9 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -209,6 +209,9 @@ struct sit_info { block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ char *sit_bitmap; /* SIT bitmap pointer */ +#ifdef CONFIG_F2FS_CHECK_FS + char *sit_bitmap_mir; /* SIT bitmap mirror */ +#endif unsigned int bitmap_size; /* SIT bitmap size */ unsigned long *tmp_map; /* bitmap for temporal use */ @@ -423,6 +426,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, void *dst_addr) { struct sit_info *sit_i = SIT_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(sit_i->sit_bitmap, sit_i->sit_bitmap_mir, + sit_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size); } @@ -643,6 +652,12 @@ static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, check_seg_range(sbi, start); +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_bit(offset, sit_i->sit_bitmap) != + f2fs_test_bit(offset, sit_i->sit_bitmap_mir)) + f2fs_bug_on(sbi, 1); +#endif + /* calculate sit block address */ if (f2fs_test_bit(offset, sit_i->sit_bitmap)) blk_addr += sit_i->sit_blocks; @@ -668,6 +683,9 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) unsigned int block_off = SIT_BLOCK_OFFSET(start); f2fs_change_bit(block_off, sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, sit_i->sit_bitmap_mir); +#endif } static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi) From 
eee3f1f5105a3984bb0572750c2107051ffb2d90 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 9 Jan 2017 14:13:03 -0800 Subject: [PATCH 1149/3220] f2fs: clean up flush/discard command namings commit b01a92019cac30398ef75b560d2668b399f4e393 upstream. This patch simply cleans up the names for flush/discard commands. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 2 +- fs/f2fs/f2fs.h | 20 +++++----- fs/f2fs/segment.c | 98 +++++++++++++++++++++++------------------------ 3 files changed, 59 insertions(+), 61 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 29cdf0c1da1d..883f1ea9e0b6 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -194,7 +194,7 @@ get_cache: si->cache_mem += sizeof(struct f2fs_gc_kthread); /* build merge flush thread */ - if (SM_I(sbi)->cmd_control_info) + if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); /* free nids */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d4783d9cf4e0..167c5f841b5f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -248,13 +248,12 @@ struct discard_entry { int len; /* # of consecutive blocks of the discard */ }; -struct bio_entry { - struct list_head list; - block_t lstart; - block_t len; - struct bio *bio; - struct completion event; - int error; +struct discard_cmd { + struct list_head list; /* command list */ + struct completion wait; /* compleation */ + block_t lstart; /* logical start address */ + block_t len; /* length */ + struct bio *bio; /* bio */ }; /* for the list of fsync inodes, used only during recovery */ @@ -701,8 +700,8 @@ struct f2fs_sm_info { unsigned int rec_prefree_segments; /* for small discard management */ - struct list_head discard_list; /* 4KB discard list */ - struct list_head wait_list; /* linked with issued discard bio */ + struct list_head discard_entry_list; /* 4KB discard entry list */ + struct list_head discard_cmd_list; /* discard cmd list */ int nr_discards; /* # of discards in the list */ int max_discards; /* max. 
discards to be issued */ @@ -716,8 +715,7 @@ struct f2fs_sm_info { unsigned int min_fsync_blocks; /* threshold for fsync */ /* for flush command control */ - struct flush_cmd_control *cmd_control_info; - + struct flush_cmd_control *fcc_info; }; /* diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c39bbffb0cac..289b3facd2d8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -26,7 +26,7 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; -static struct kmem_cache *bio_entry_slab; +static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; @@ -439,7 +439,7 @@ static int submit_flush_wait(struct f2fs_sb_info *sbi) static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; wait_queue_head_t *q = &fcc->flush_wait_queue; repeat: if (kthread_should_stop()) @@ -468,7 +468,7 @@ repeat: int f2fs_issue_flush(struct f2fs_sb_info *sbi) { - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), @@ -511,8 +511,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc; int err = 0; - if (SM_I(sbi)->cmd_control_info) { - fcc = SM_I(sbi)->cmd_control_info; + if (SM_I(sbi)->fcc_info) { + fcc = SM_I(sbi)->fcc_info; goto init_thread; } @@ -522,14 +522,14 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) atomic_set(&fcc->submit_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); - SM_I(sbi)->cmd_control_info = fcc; + SM_I(sbi)->fcc_info = fcc; init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + SM_I(sbi)->fcc_info = NULL; return err; } @@ -538,7 +538,7 @@ init_thread: void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; if (fcc && fcc->f2fs_issue_flush) { struct task_struct *flush_thread = fcc->f2fs_issue_flush; @@ -548,7 +548,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) } if (free) { kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + SM_I(sbi)->fcc_info = NULL; } } @@ -628,42 +628,43 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, +static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, struct bio *bio, block_t lstart, block_t len) { - struct list_head *wait_list = &(SM_I(sbi)->wait_list); - struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); + struct discard_cmd *dc; - INIT_LIST_HEAD(&be->list); - be->bio = bio; - be->lstart = lstart; - be->len = len; - init_completion(&be->event); - list_add_tail(&be->list, wait_list); + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); + INIT_LIST_HEAD(&dc->list); + dc->bio = bio; + dc->lstart = lstart; + dc->len = len; + init_completion(&dc->wait); + list_add_tail(&dc->list, wait_list); - return be; + 
return dc; } /* This should be covered by global mutex, &sit_i->sentry_lock */ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { - struct list_head *wait_list = &(SM_I(sbi)->wait_list); - struct bio_entry *be, *tmp; + struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); + struct discard_cmd *dc, *tmp; - list_for_each_entry_safe(be, tmp, wait_list, list) { - struct bio *bio = be->bio; + list_for_each_entry_safe(dc, tmp, wait_list, list) { + struct bio *bio = dc->bio; int err; - if (!completion_done(&be->event)) { - if ((be->lstart <= blkaddr && - blkaddr < be->lstart + be->len) || + if (!completion_done(&dc->wait)) { + if ((dc->lstart <= blkaddr && + blkaddr < dc->lstart + dc->len) || blkaddr == NULL_ADDR) - wait_for_completion_io(&be->event); + wait_for_completion_io(&dc->wait); else continue; } - err = be->error; + err = bio->bi_error; if (err == -EOPNOTSUPP) err = 0; @@ -672,17 +673,16 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) "Issue discard failed, ret: %d", err); bio_put(bio); - list_del(&be->list); - kmem_cache_free(bio_entry_slab, be); + list_del(&dc->list); + kmem_cache_free(discard_cmd_slab, dc); } } -static void f2fs_submit_bio_wait_endio(struct bio *bio) +static void f2fs_submit_discard_endio(struct bio *bio) { - struct bio_entry *be = (struct bio_entry *)bio->bi_private; + struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - be->error = bio->bi_error; - complete(&be->event); + complete(&dc->wait); } /* copied from block/blk-lib.c in 4.10-rc1 */ @@ -786,11 +786,11 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, SECTOR_FROM_BLOCK(blklen), GFP_NOFS, 0, &bio); if (!err && bio) { - struct bio_entry *be = __add_bio_entry(sbi, bio, + struct discard_cmd *dc = __add_discard_cmd(sbi, bio, lblkstart, blklen); - bio->bi_private = be; - bio->bi_end_io = f2fs_submit_bio_wait_endio; + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(REQ_SYNC, bio); } return err; @@ -897,7 +897,7 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct seg_entry *se, unsigned int start, unsigned int end) { - struct list_head *head = &SM_I(sbi)->discard_list; + struct list_head *head = &SM_I(sbi)->discard_entry_list; struct discard_entry *new, *last; if (!list_empty(head)) { @@ -966,7 +966,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->discard_list); + struct list_head *head = &(SM_I(sbi)->discard_entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -992,7 +992,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->discard_list); + struct list_head *head = &(SM_I(sbi)->discard_entry_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct blk_plug plug; @@ -2783,8 +2783,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - INIT_LIST_HEAD(&sm_info->discard_list); - INIT_LIST_HEAD(&sm_info->wait_list); + INIT_LIST_HEAD(&sm_info->discard_entry_list); + INIT_LIST_HEAD(&sm_info->discard_cmd_list); sm_info->nr_discards = 0; sm_info->max_discards = 0; @@ -2934,15 +2934,15 @@ int __init create_segment_manager_caches(void) if 
(!discard_entry_slab) goto fail; - bio_entry_slab = f2fs_kmem_cache_create("bio_entry", - sizeof(struct bio_entry)); - if (!bio_entry_slab) + discard_cmd_slab = f2fs_kmem_cache_create("discard_cmd", + sizeof(struct discard_cmd)); + if (!discard_cmd_slab) goto destroy_discard_entry; sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destroy_bio_entry; + goto destroy_discard_cmd; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2952,8 +2952,8 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); -destroy_bio_entry: - kmem_cache_destroy(bio_entry_slab); +destroy_discard_cmd: + kmem_cache_destroy(discard_cmd_slab); destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: @@ -2963,7 +2963,7 @@ fail: void destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); - kmem_cache_destroy(bio_entry_slab); + kmem_cache_destroy(discard_cmd_slab); kmem_cache_destroy(discard_entry_slab); kmem_cache_destroy(inmem_entry_slab); } From d78a12988cc5d5080a2c1d03a9308b648c45a8d3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 10:21:15 -0800 Subject: [PATCH 1150/3220] f2fs: reorganize stat information commit d4adb30f25f5f2aa9b205891e395251d2a9098be upstream. This patch modifies stat information more clearly. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 883f1ea9e0b6..cd338ca24941 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -258,8 +258,6 @@ static int stat_show(struct seq_file *s, void *v) si->inline_dir); seq_printf(s, " - Orphan Inode: %u\n", si->orphans); - seq_printf(s, " - Atomic write count: %4d (Max. %4d)\n", - si->aw_cnt, si->max_aw_cnt); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -318,8 +316,10 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb_cp_data: %4d, wb_data: %4d\n", - si->inmem_pages, si->nr_wb_cp_data, si->nr_wb_data); + seq_printf(s, " - IO (CP: %4d, Data: %4d)\n", + si->nr_wb_cp_data, si->nr_wb_data); + seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d)\n", + si->inmem_pages, si->aw_cnt, si->max_aw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", From 69fbcb521aae48b51b8b0b673390ed05264bdd60 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Wed, 27 Sep 2017 17:18:48 -0700 Subject: [PATCH 1151/3220] ANDROID: add script to fetch android kernel config fragments The Android kernel config fragments now live in a separate repository. To prevent others from having to search for this location, add a script to fetch and unpack the fragments. Update .gitignore to include these fragments. 
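[Editorial aside on the discard_cmd rework in patch 1149: the structure is a standard completion handshake. The issuer queues a command with an initialized completion, the bio end_io callback completes it, and f2fs_wait_discard_bio() blocks only on commands whose range covers the block of interest. A rough userspace analogue with pthreads, illustrative names only, not the kernel API:]

    #include <pthread.h>
    #include <stdbool.h>

    /* userspace stand-in for the kernel's struct completion;
     * init with pthread_mutex_init()/pthread_cond_init(), done = false */
    struct completion {
            pthread_mutex_t lock;
            pthread_cond_t cond;
            bool done;
    };

    static void complete(struct completion *c)
    {
            pthread_mutex_lock(&c->lock);
            c->done = true;                 /* end_io side */
            pthread_cond_broadcast(&c->cond);
            pthread_mutex_unlock(&c->lock);
    }

    static void wait_for_completion(struct completion *c)
    {
            pthread_mutex_lock(&c->lock);
            while (!c->done)                /* waiter side */
                    pthread_cond_wait(&c->cond, &c->lock);
            pthread_mutex_unlock(&c->lock);
    }

    struct discard_cmd_sketch {
            struct completion wait;         /* signalled by the end_io callback */
            unsigned long lstart, len;      /* logical range being discarded */
    };

    /* block only if the in-flight discard covers @blkaddr */
    static void wait_discard(struct discard_cmd_sketch *dc, unsigned long blkaddr)
    {
            if (blkaddr >= dc->lstart && blkaddr < dc->lstart + dc->len)
                    wait_for_completion(&dc->wait);
    }
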
Change-Id: If2d4a59b86e4573b0a9b3190025dfe4191870b46 Signed-off-by: Steve Muckle --- .gitignore | 3 +++ android/configs/android-fetch-configs.sh | 4 ++++ 2 files changed, 7 insertions(+) create mode 100755 android/configs/android-fetch-configs.sh diff --git a/.gitignore b/.gitignore index fd3a35592543..fa3e5f1d0808 100644 --- a/.gitignore +++ b/.gitignore @@ -112,3 +112,6 @@ all.config # Kdevelop4 *.kdev4 + +# fetched Android config fragments +android/configs/android-*.cfg diff --git a/android/configs/android-fetch-configs.sh b/android/configs/android-fetch-configs.sh new file mode 100755 index 000000000000..9915c1356ed3 --- /dev/null +++ b/android/configs/android-fetch-configs.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +curl https://android.googlesource.com/kernel/configs/+archive/master/android-4.4.tar.gz | tar xzv + From b0fa18e1caa5390619cfb6878b1b63879908ed10 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 3 May 2017 14:30:48 +0100 Subject: [PATCH 1152/3220] UPSTREAM: cpufreq: schedutil: use now as reference when aggregating shared policy requests Currently, sugov_next_freq_shared() uses last_freq_update_time as a reference to decide when to start considering CPU contributions as stale. However, since last_freq_update_time is set by the last CPU that issued a frequency transition, this might cause problems in certain cases. In practice, the detection of stale utilization values fails whenever the CPU with such values was the last to update the policy. For example (and please note again that the SCHED_CPUFREQ_RT flag is not the problem here, but only the detection of after how much time that flag has to be considered stale), suppose a policy with 2 CPUs:

    CPU0                          | CPU1
                                  |
                                  | RT task scheduled
                                  | SCHED_CPUFREQ_RT is set
                                  | CPU1->last_update = now
                                  | freq transition to max
                                  | last_freq_update_time = now
                                  |
          more than TICK_NSEC nsecs
                                  |
    a small CFS wakes up          |
    CPU0->last_update = now1      |
    delta_ns(CPU0) < TICK_NSEC*   |
    CPU0's util is considered     |
    delta_ns(CPU1) =              |
      last_freq_update_time -     |
      CPU1->last_update = 0       |
      < TICK_NSEC                 |
    CPU1 is still considered      |
    CPU1->SCHED_CPUFREQ_RT is set |
    we stay at max (until CPU1    |
    exits from idle)              |

    * delta_ns is actually negative as now1 > last_freq_update_time

While last_freq_update_time is a sensible reference for rate limiting, it doesn't seem to be useful for working around stale CPU states. Fix the problem by always considering now (time) as the reference for deciding when CPUs have stale contributions. Signed-off-by: Juri Lelli Acked-by: Vincent Guittot Acked-by: Viresh Kumar Signed-off-by: Rafael J.
Wysocki (cherry picked from commit d86ab9cff8b936aadde444d0e263a8db5ff0349b) --- kernel/sched/cpufreq_schedutil.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index b90f7434e13b..28977799017b 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -322,11 +322,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - u64 last_freq_update_time = sg_policy->last_freq_update_time; unsigned long util = 0, max = 1; unsigned int j; @@ -342,7 +341,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) * enough, don't take the CPU into account as it probably is * idle now (and clear iowait_boost for it). */ - delta_ns = last_freq_update_time - j_sg_cpu->last_update; + delta_ns = time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; j_sg_cpu->iowait_boost_pending = false; @@ -387,7 +386,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, if (flags & SCHED_CPUFREQ_DL) next_f = sg_policy->policy->cpuinfo.max_freq; else - next_f = sugov_next_freq_shared(sg_cpu); + next_f = sugov_next_freq_shared(sg_cpu, time); sugov_update_commit(sg_policy, time, next_f); } From 13f002354db13a029aef84aa1a8bfa51bd6b8d56 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 9 Jan 2017 18:16:29 -0800 Subject: [PATCH 1153/3220] f2fs: catch up to v4.14-rc1 This is cherry-picked from upstrea-f2fs-stable-linux-4.4.y. Changes include: commit c7fd9e2b4a6876 ("f2fs: hurry up to issue discard after io interruption") commit 603dde39653d6d ("f2fs: fix to show correct discard_granularity in sysfs") ... 
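[Editorial aside on the schedutil fix in patch 1152 above: the change amounts to swapping the reference value in one comparison. A compact sketch of the before/after staleness test, in plain C with an illustrative TICK_NSEC value:]

    #include <stdint.h>
    #include <stdbool.h>

    #define TICK_NSEC_SKETCH 4000000LL      /* ~4 ms; illustrative only */

    /* before: the reference only advances on frequency transitions */
    static bool stale_before(uint64_t last_update, uint64_t last_freq_update_time)
    {
            /*
             * A CPU whose last_update coincided with the last transition
             * yields delta_ns == 0 forever, so it never looks stale.
             */
            int64_t delta_ns = (int64_t)(last_freq_update_time - last_update);

            return delta_ns > TICK_NSEC_SKETCH;
    }

    /* after: the reference is the current time, which always moves forward */
    static bool stale_after(uint64_t last_update, uint64_t now)
    {
            int64_t delta_ns = (int64_t)(now - last_update);

            return delta_ns > TICK_NSEC_SKETCH;
    }
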
commit 565f0225f95f15 ("f2fs: factor out discard command info into discard_cmd_control") commit c4cc29d19eaf01 ("f2fs: remove batched discard in f2fs_trim_fs") Change-Id: Icd8a85ac0c19a8aa25cd2591a12b4e9b85bdf1c5 Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 53 + Documentation/filesystems/f2fs.txt | 22 + fs/crypto/Makefile | 1 + fs/crypto/bio.c | 143 +++ fs/crypto/crypto.c | 278 ++--- fs/crypto/fname.c | 111 +- fs/crypto/fscrypt_private.h | 107 ++ fs/crypto/keyinfo.c | 270 +++-- fs/crypto/policy.c | 243 ++-- fs/f2fs/Makefile | 2 +- fs/f2fs/acl.c | 9 +- fs/f2fs/checkpoint.c | 217 +++- fs/f2fs/crypto_key.c | 240 ---- fs/f2fs/crypto_policy.c | 248 ----- fs/f2fs/data.c | 638 +++++++---- fs/f2fs/debug.c | 67 +- fs/f2fs/dir.c | 106 +- fs/f2fs/extent_cache.c | 368 +++--- fs/f2fs/f2fs.h | 1356 +++++++++++++++-------- fs/f2fs/f2fs_crypto.h | 150 --- fs/f2fs/file.c | 473 +++++--- fs/f2fs/gc.c | 265 +++-- fs/f2fs/gc.h | 27 +- fs/f2fs/hash.c | 7 +- fs/f2fs/inline.c | 174 +-- fs/f2fs/inode.c | 168 ++- fs/f2fs/namei.c | 182 ++- fs/f2fs/node.c | 723 ++++++++---- fs/f2fs/node.h | 65 +- fs/f2fs/recovery.c | 102 +- fs/f2fs/segment.c | 1292 ++++++++++++++++----- fs/f2fs/segment.h | 197 ++-- fs/f2fs/super.c | 1255 ++++++++++++++------- fs/f2fs/sysfs.c | 556 ++++++++++ fs/f2fs/trace.c | 4 +- fs/f2fs/xattr.c | 178 ++- fs/f2fs/xattr.h | 11 +- include/linux/f2fs_fs.h | 43 +- include/linux/fscrypt_common.h | 138 +++ include/linux/fscrypt_notsupp.h | 177 +++ include/linux/fscrypt_supp.h | 145 +++ include/linux/fscrypto.h | 12 +- include/trace/events/f2fs.h | 213 +++- include/uapi/linux/fs.h | 30 + mm/util.c | 1 + 45 files changed, 7588 insertions(+), 3479 deletions(-) create mode 100644 fs/crypto/bio.c create mode 100644 fs/crypto/fscrypt_private.h delete mode 100644 fs/f2fs/crypto_key.c delete mode 100644 fs/f2fs/crypto_policy.c delete mode 100644 fs/f2fs/f2fs_crypto.h create mode 100644 fs/f2fs/sysfs.c create mode 100644 include/linux/fscrypt_common.h create mode 100644 include/linux/fscrypt_notsupp.h create mode 100644 include/linux/fscrypt_supp.h diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 0345f2d1c727..500c60403653 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -57,6 +57,15 @@ Contact: "Jaegeuk Kim" Description: Controls the issue rate of small discard commands. +What: /sys/fs/f2fs//discard_granularity +Date: July 2017 +Contact: "Chao Yu" +Description: + Controls discard granularity of inner discard thread, inner thread + will not issue discards with size that is smaller than granularity. + The unit size is one block, now only support configuring in range + of [1, 512]. + What: /sys/fs/f2fs//max_victim_search Date: January 2014 Contact: "Jaegeuk Kim" @@ -92,3 +101,47 @@ Date: October 2015 Contact: "Chao Yu" Description: Controls the count of nid pages to be readaheaded. + +What: /sys/fs/f2fs//dirty_nats_ratio +Date: January 2016 +Contact: "Chao Yu" +Description: + Controls dirty nat entries ratio threshold, if current + ratio exceeds configured threshold, checkpoint will + be triggered for flushing dirty nat entries. + +What: /sys/fs/f2fs//lifetime_write_kbytes +Date: January 2016 +Contact: "Shuoran Liu" +Description: + Shows total written kbytes issued to disk. + +What: /sys/fs/f2fs//inject_rate +Date: May 2016 +Contact: "Sheng Yong" +Description: + Controls the injection rate. 
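[Editorial aside: the two injection knobs above work as a pair. inject_type is a bitmask selecting which fault classes may fire, and inject_rate sets how often an eligible call actually fails. A plausible sketch of such a counting scheme; the names are simplified and the real logic, which sits behind CONFIG_F2FS_FAULT_INJECTION, may differ in detail:]

    #include <stdbool.h>

    struct fault_info_sketch {
            unsigned int inject_rate;       /* fire on every Nth eligible call */
            unsigned int inject_type;       /* bitmask of enabled fault classes */
            unsigned int calls;             /* eligible calls since last injection */
    };

    static bool time_to_inject_sketch(struct fault_info_sketch *fi,
                                      unsigned int type)
    {
            if (!fi->inject_rate || !(fi->inject_type & (1u << type)))
                    return false;           /* class disabled or rate unset */

            if (++fi->calls < fi->inject_rate)
                    return false;           /* not this call's turn */

            fi->calls = 0;
            return true;                    /* caller now simulates the failure */
    }
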
+ +What: /sys/fs/f2fs//inject_type +Date: May 2016 +Contact: "Sheng Yong" +Description: + Controls the injection type. + +What: /sys/fs/f2fs//reserved_blocks +Date: June 2017 +Contact: "Chao Yu" +Description: + Controls current reserved blocks in system. + +What: /sys/fs/f2fs//gc_urgent +Date: August 2017 +Contact: "Jaegeuk Kim" +Description: + Do background GC agressively + +What: /sys/fs/f2fs//gc_urgent_sleep_time +Date: August 2017 +Contact: "Jaegeuk Kim" +Description: + Controls sleep time of GC urgent mode diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index d99faced79cb..6cf9ad12c57f 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -125,6 +125,7 @@ active_logs=%u Support configuring the number of active logs. In the disable_ext_identify Disable the extension list configured by mkfs, so f2fs does not aware of cold files such as media files. inline_xattr Enable the inline xattrs feature. +noinline_xattr Disable the inline xattrs feature. inline_data Enable the inline data feature: New created small(<~3.4k) files can be written into inode block. inline_dentry Enable the inline dir feature: data in new created @@ -159,6 +160,18 @@ mode=%s Control block allocation mode which supports "adaptive" writes towards main area. io_bits=%u Set the bit size of write IO requests. It should be set with "mode=lfs". +usrquota Enable plain user disk quota accounting. +grpquota Enable plain group disk quota accounting. +prjquota Enable plain project quota accounting. +usrjquota= Appoint specified file and type during mount, so that quota +grpjquota= information can be properly updated during recovery flow, +prjjquota= : must be in root directory; +jqfmt= : [vfsold,vfsv0,vfsv1]. +offusrjquota Turn off user journelled quota. +offgrpjquota Turn off group journelled quota. +offprjjquota Turn off project journelled quota. +quota Enable plain user disk quota accounting. +noquota Disable all plain disk quota option. ================================================================================ DEBUGFS ENTRIES @@ -204,6 +217,15 @@ Files in /sys/fs/f2fs/ gc_idle = 1 will select the Cost Benefit approach & setting gc_idle = 2 will select the greedy approach. + gc_urgent This parameter controls triggering background GCs + urgently or not. Setting gc_urgent = 0 [default] + makes back to default behavior, while if it is set + to 1, background thread starts to do GC by given + gc_urgent_sleep_time interval. + + gc_urgent_sleep_time This parameter controls sleep time for gc_urgent. + 500 ms is set by default. See above gc_urgent. + reclaim_segments This parameter controls the number of prefree segments to be reclaimed. If the number of prefree segments is larger than the number of segments diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile index f17684c48739..9f6607f17b53 100644 --- a/fs/crypto/Makefile +++ b/fs/crypto/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o fscrypto-y := crypto.o fname.o policy.o keyinfo.o +fscrypto-$(CONFIG_BLOCK) += bio.o diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c new file mode 100644 index 000000000000..a91ed46fe503 --- /dev/null +++ b/fs/crypto/bio.c @@ -0,0 +1,143 @@ +/* + * This contains encryption functions for per-file encryption. + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * Written by Michael Halcrow, 2014. 
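[Editorial aside: the gc_urgent/gc_urgent_sleep_time pair documented above amounts to a fixed-cadence override of the background GC's adaptive interval. Roughly, as a sketch rather than the exact f2fs gc-thread code:]

    #include <stdbool.h>

    /* sketch of the sleep-time decision in a background-GC loop */
    static unsigned int next_gc_sleep_ms(bool gc_urgent,
                                         unsigned int urgent_sleep_ms,
                                         unsigned int adaptive_ms)
    {
            if (gc_urgent)
                    return urgent_sleep_ms; /* fixed cadence; 500 ms by default */

            return adaptive_ms;             /* normal mode: adapts to recent idleness */
    }
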
+ * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * Add fscrypt_pullback_bio_page() + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ + +#include +#include +#include +#include +#include "fscrypt_private.h" + +/* + * Call fscrypt_decrypt_page on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ + struct fscrypt_ctx *ctx = + container_of(work, struct fscrypt_ctx, r.work); + struct bio *bio = ctx->r.bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + int ret = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0, page->index); + + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + fscrypt_release_ctx(ctx); + bio_put(bio); +} + +void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) +{ + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(fscrypt_read_workqueue, &ctx->r.work); +} +EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); + +void fscrypt_pullback_bio_page(struct page **page, bool restore) +{ + struct fscrypt_ctx *ctx; + struct page *bounce_page; + + /* The bounce data pages are unmapped. */ + if ((*page)->mapping) + return; + + /* The bounce data page is unmapped. */ + bounce_page = *page; + ctx = (struct fscrypt_ctx *)page_private(bounce_page); + + /* restore control page */ + *page = ctx->w.control_page; + + if (restore) + fscrypt_restore_control_page(bounce_page); +} +EXPORT_SYMBOL(fscrypt_pullback_bio_page); + +int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + struct bio *bio; + int ret, err = 0; + + BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); + + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ciphertext_page = fscrypt_alloc_bounce_page(ctx, GFP_NOWAIT); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; + } + + while (len--) { + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk, + ZERO_PAGE(0), ciphertext_page, + PAGE_SIZE, 0, GFP_NOFS); + if (err) + goto errout; + + bio = bio_alloc(GFP_NOWAIT, 1); + if (!bio) { + err = -ENOMEM; + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = + pblk << (inode->i_sb->s_blocksize_bits - 9); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + ret = bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0); + if (ret != inode->i_sb->s_blocksize) { + /* should never happen! 
*/ + WARN_ON(1); + bio_put(bio); + err = -EIO; + goto errout; + } + err = submit_bio_wait(0, bio); + bio_put(bio); + if (err) + goto errout; + lblk++; + pblk++; + } + err = 0; +errout: + fscrypt_release_ctx(ctx); + return err; +} +EXPORT_SYMBOL(fscrypt_zeroout_range); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2d40ab9edc9f..c7835df7e7b8 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -24,10 +24,10 @@ #include #include #include -#include #include #include -#include +#include +#include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -44,7 +44,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL; static LIST_HEAD(fscrypt_free_ctxs); static DEFINE_SPINLOCK(fscrypt_ctx_lock); -static struct workqueue_struct *fscrypt_read_workqueue; +struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); static struct kmem_cache *fscrypt_ctx_cachep; @@ -63,7 +63,7 @@ void fscrypt_release_ctx(struct fscrypt_ctx *ctx) { unsigned long flags; - if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) { + if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) { mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool); ctx->w.bounce_page = NULL; } @@ -88,7 +88,7 @@ EXPORT_SYMBOL(fscrypt_release_ctx); * Return: An allocated and initialized encryption context on success; error * value or NULL otherwise. */ -struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) +struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) { struct fscrypt_ctx *ctx = NULL; struct fscrypt_info *ci = inode->i_crypt_info; @@ -121,7 +121,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) } else { ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; } - ctx->flags &= ~FS_WRITE_PATH_FL; + ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx; } EXPORT_SYMBOL(fscrypt_get_ctx); @@ -141,20 +141,15 @@ static void page_crypt_complete(struct crypto_async_request *req, int res) complete(&ecr->completion); } -typedef enum { - FS_DECRYPT = 0, - FS_ENCRYPT, -} fscrypt_direction_t; - -static int do_page_crypto(struct inode *inode, - fscrypt_direction_t rw, pgoff_t index, - struct page *src_page, struct page *dest_page, - gfp_t gfp_flags) +int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, + u64 lblk_num, struct page *src_page, + struct page *dest_page, unsigned int len, + unsigned int offs, gfp_t gfp_flags) { struct { __le64 index; - u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)]; - } xts_tweak; + u8 padding[FS_IV_SIZE - sizeof(__le64)]; + } iv; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; @@ -162,6 +157,18 @@ static int do_page_crypto(struct inode *inode, struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; + BUG_ON(len == 0); + + BUILD_BUG_ON(sizeof(iv) != FS_IV_SIZE); + BUILD_BUG_ON(AES_BLOCK_SIZE != FS_IV_SIZE); + iv.index = cpu_to_le64(lblk_num); + memset(iv.padding, 0, sizeof(iv.padding)); + + if (ci->ci_essiv_tfm != NULL) { + crypto_cipher_encrypt_one(ci->ci_essiv_tfm, (u8 *)&iv, + (u8 *)&iv); + } + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR @@ -174,15 +181,11 @@ static int do_page_crypto(struct inode *inode, req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, page_crypt_complete, &ecr); - BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); - xts_tweak.index = cpu_to_le64(index); - 
memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); - sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_SIZE, 0); + sg_set_page(&dst, dest_page, len, offs); sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); + sg_set_page(&src, src_page, len, offs); + skcipher_request_set_crypt(req, &src, &dst, len, &iv); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); else @@ -202,53 +205,86 @@ static int do_page_crypto(struct inode *inode, return 0; } -static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) +struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, + gfp_t gfp_flags) { ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); if (ctx->w.bounce_page == NULL) return ERR_PTR(-ENOMEM); - ctx->flags |= FS_WRITE_PATH_FL; + ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx->w.bounce_page; } /** * fscypt_encrypt_page() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * @gfp_flags: The gfp flag for memory allocation + * @inode: The inode for which the encryption should take place + * @page: The page to encrypt. Must be locked for bounce-page + * encryption. + * @len: Length of data to encrypt in @page and encrypted + * data in returned page. + * @offs: Offset of data within @page and returned + * page holding encrypted data. + * @lblk_num: Logical block number. This must be unique for multiple + * calls with same inode, except when overwriting + * previously written data. + * @gfp_flags: The gfp flag for memory allocation * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. + * Encrypts @page using the ctx encryption context. Performs encryption + * either in-place or into a newly allocated bounce page. + * Called on the page write path. * - * Called on the page write path. The caller must call + * Bounce page allocation is the default. + * In this case, the contents of @page are encrypted and stored in an + * allocated bounce page. @page has to be locked and the caller must call * fscrypt_restore_control_page() on the returned ciphertext page to * release the bounce buffer and the encryption context. * - * Return: An allocated page with the encrypted content on success. Else, an + * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in + * fscrypt_operations. Here, the input-page is returned with its content + * encrypted. + * + * Return: A page with the encrypted content on success. Else, an * error value or NULL. 
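[Editorial aside: in caller terms, the two encryption modes described in the kerneldoc above look like this. A sketch using only names from the patch, with error paths trimmed and the post-I/O cleanup shown inline for brevity:]

    /* sketch: a filesystem write path using the new fscrypt_encrypt_page() */
    static int write_encrypted_page(struct inode *inode, struct page *page)
    {
            struct page *out;

            out = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0,
                                       page->index, GFP_NOFS);
            if (IS_ERR(out))
                    return PTR_ERR(out);

            /* ... submit @out for I/O; it is @page itself under
             * FS_CFLG_OWN_PAGES, or a bounce page holding the ciphertext
             * otherwise ... */

            if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES))
                    fscrypt_restore_control_page(out); /* frees bounce page + ctx */
            return 0;
    }
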
*/ -struct page *fscrypt_encrypt_page(struct inode *inode, - struct page *plaintext_page, gfp_t gfp_flags) +struct page *fscrypt_encrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) + { struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; + struct page *ciphertext_page = page; int err; - BUG_ON(!PageLocked(plaintext_page)); + BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0); + + if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) { + /* with inplace-encryption we just encrypt the page */ + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, page, + ciphertext_page, len, offs, + gfp_flags); + if (err) + return ERR_PTR(err); + + return ciphertext_page; + } + + BUG_ON(!PageLocked(page)); ctx = fscrypt_get_ctx(inode, gfp_flags); if (IS_ERR(ctx)) return (struct page *)ctx; /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx, gfp_flags); + ciphertext_page = fscrypt_alloc_bounce_page(ctx, gfp_flags); if (IS_ERR(ciphertext_page)) goto errout; - ctx->w.control_page = plaintext_page; - err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page, - gfp_flags); + ctx->w.control_page = page; + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, + page, ciphertext_page, len, offs, + gfp_flags); if (err) { ciphertext_page = ERR_PTR(err); goto errout; @@ -265,8 +301,13 @@ errout: EXPORT_SYMBOL(fscrypt_encrypt_page); /** - * f2crypt_decrypt_page() - Decrypts a page in-place - * @page: The page to decrypt. Must be locked. + * fscrypt_decrypt_page() - Decrypts a page in-place + * @inode: The corresponding inode for the page to decrypt. + * @page: The page to decrypt. Must be locked in case + * it is a writeback page (FS_CFLG_OWN_PAGES unset). + * @len: Number of bytes in @page to be decrypted. + * @offs: Start of data in @page. + * @lblk_num: Logical block number. * * Decrypts page in-place using the ctx encryption context. * @@ -274,75 +315,17 @@ EXPORT_SYMBOL(fscrypt_encrypt_page); * * Return: Zero on success, non-zero otherwise. 
*/ -int fscrypt_decrypt_page(struct page *page) +int fscrypt_decrypt_page(const struct inode *inode, struct page *page, + unsigned int len, unsigned int offs, u64 lblk_num) { - BUG_ON(!PageLocked(page)); + if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES)) + BUG_ON(!PageLocked(page)); - return do_page_crypto(page->mapping->host, - FS_DECRYPT, page->index, page, page, GFP_NOFS); + return fscrypt_do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, + len, offs, GFP_NOFS); } EXPORT_SYMBOL(fscrypt_decrypt_page); -int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, - sector_t pblk, unsigned int len) -{ - struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; - struct bio *bio; - int ret, err = 0; - - BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); - - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); - if (IS_ERR(ciphertext_page)) { - err = PTR_ERR(ciphertext_page); - goto errout; - } - - while (len--) { - err = do_page_crypto(inode, FS_ENCRYPT, lblk, - ZERO_PAGE(0), ciphertext_page, - GFP_NOFS); - if (err) - goto errout; - - bio = bio_alloc(GFP_NOWAIT, 1); - if (!bio) { - err = -ENOMEM; - goto errout; - } - bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_iter.bi_sector = - pblk << (inode->i_sb->s_blocksize_bits - 9); - ret = bio_add_page(bio, ciphertext_page, - inode->i_sb->s_blocksize, 0); - if (ret != inode->i_sb->s_blocksize) { - /* should never happen! */ - WARN_ON(1); - bio_put(bio); - err = -EIO; - goto errout; - } - err = submit_bio_wait(WRITE, bio); - if ((err == 0) && bio->bi_error) - err = -EIO; - bio_put(bio); - if (err) - goto errout; - lblk++; - pblk++; - } - err = 0; -errout: - fscrypt_release_ctx(ctx); - return err; -} -EXPORT_SYMBOL(fscrypt_zeroout_range); - /* * Validate dentries for encrypted directories to make sure we aren't * potentially caching stale data after a key has been added or @@ -351,7 +334,6 @@ EXPORT_SYMBOL(fscrypt_zeroout_range); static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *dir; - struct fscrypt_info *ci; int dir_has_key, cached_with_key; if (flags & LOOKUP_RCU) @@ -363,18 +345,11 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - ci = d_inode(dir)->i_crypt_info; - if (ci && ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD)))) - ci = NULL; - /* this should eventually be an flag in d_flags */ spin_lock(&dentry->d_lock); cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; spin_unlock(&dentry->d_lock); - dir_has_key = (ci != NULL); + dir_has_key = (d_inode(dir)->i_crypt_info != NULL); dput(dir); /* @@ -399,63 +374,6 @@ const struct dentry_operations fscrypt_d_ops = { }; EXPORT_SYMBOL(fscrypt_d_ops); -/* - * Call fscrypt_decrypt_page on every single page, reusing the encryption - * context. 
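[Editorial aside on the d_revalidate change above: the rule can be stated in one comparison. A dentry is kept only while its cached form, plaintext or ciphertext, matches the current key availability of its directory. As a simplified sketch; the real fscrypt_d_revalidate() also bails out of RCU-mode lookups:]

    /* sketch: core of the encrypted-dentry revalidation decision */
    static int name_form_still_valid(int cached_with_key, int dir_has_key)
    {
            /*
             * Cached without the key means a ciphertext name; once the key
             * appears it must be looked up again as plaintext. And vice versa.
             */
            return cached_with_key == dir_has_key;
    }
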
- */ -static void completion_pages(struct work_struct *work) -{ - struct fscrypt_ctx *ctx = - container_of(work, struct fscrypt_ctx, r.work); - struct bio *bio = ctx->r.bio; - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - int ret = fscrypt_decrypt_page(page); - - if (ret) { - WARN_ON_ONCE(1); - SetPageError(page); - } else { - SetPageUptodate(page); - } - unlock_page(page); - } - fscrypt_release_ctx(ctx); - bio_put(bio); -} - -void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) -{ - INIT_WORK(&ctx->r.work, completion_pages); - ctx->r.bio = bio; - queue_work(fscrypt_read_workqueue, &ctx->r.work); -} -EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); - -void fscrypt_pullback_bio_page(struct page **page, bool restore) -{ - struct fscrypt_ctx *ctx; - struct page *bounce_page; - - /* The bounce data pages are unmapped. */ - if ((*page)->mapping) - return; - - /* The bounce data page is unmapped. */ - bounce_page = *page; - ctx = (struct fscrypt_ctx *)page_private(bounce_page); - - /* restore control page */ - *page = ctx->w.control_page; - - if (restore) - fscrypt_restore_control_page(bounce_page); -} -EXPORT_SYMBOL(fscrypt_pullback_bio_page); - void fscrypt_restore_control_page(struct page *page) { struct fscrypt_ctx *ctx; @@ -481,17 +399,22 @@ static void fscrypt_destroy(void) /** * fscrypt_initialize() - allocate major buffers for fs encryption. + * @cop_flags: fscrypt operations flags * * We only call this when we start accessing encrypted files, since it * results in memory getting allocated that wouldn't otherwise be used. * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_initialize(void) +int fscrypt_initialize(unsigned int cop_flags) { int i, res = -ENOMEM; - if (fscrypt_bounce_page_pool) + /* + * No need to allocate a bounce page pool if there already is one or + * this FS won't use it. + */ + if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool) return 0; mutex_lock(&fscrypt_init_mutex); @@ -520,7 +443,6 @@ fail: mutex_unlock(&fscrypt_init_mutex); return res; } -EXPORT_SYMBOL(fscrypt_initialize); /** * fscrypt_init() - Set up for fs encryption. @@ -562,6 +484,8 @@ static void __exit fscrypt_exit(void) destroy_workqueue(fscrypt_read_workqueue); kmem_cache_destroy(fscrypt_ctx_cachep); kmem_cache_destroy(fscrypt_info_cachep); + + fscrypt_essiv_cleanup(); } module_exit(fscrypt_exit); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 9b774f4b50c8..ad9f814fdead 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -12,7 +12,7 @@ #include #include -#include +#include "fscrypt_private.h" /** * fname_crypt_complete() - completion callback for filename crypto @@ -159,6 +159,8 @@ static int fname_decrypt(struct inode *inode, static const char *lookup_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; +#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) + /** * digest_encode() - * @@ -209,7 +211,7 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) +u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen) { int padding = 32; struct fscrypt_info *ci = inode->i_crypt_info; @@ -227,14 +229,17 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size); * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. 
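[Editorial aside: the buffer sizing above follows from BASE64_CHARS(nbytes) = ceil(4 * nbytes / 3). Plugging in the two name forms reproduces the constants the removed code hard-coded (33 and 43), assuming FSCRYPT_FNAME_DIGEST_SIZE is 16 and the maximum undigested size is 32 bytes, consistent with the diff:]

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
    #define BASE64_CHARS(nbytes)    DIV_ROUND_UP((nbytes) * 4, 3)

    int main(void)
    {
            /* struct fscrypt_digested_name: u32 hash + u32 minor_hash + u8 digest[16] */
            unsigned int digested_bytes = 4 + 4 + 16;

            printf("%u\n", BASE64_CHARS(32));                  /* 43: longest undigested name */
            printf("%u\n", 1 + BASE64_CHARS(digested_bytes));  /* 33: '_' + encoded struct */
            return 0;
    }
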
*/ -int fscrypt_fname_alloc_buffer(struct inode *inode, +int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 ilen, struct fscrypt_str *crypto_str) { - unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); + u32 olen = fscrypt_fname_encrypted_size(inode, ilen); + const u32 max_encoded_len = + max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE), + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))); crypto_str->len = olen; - if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2) - olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + olen = max(olen, max_encoded_len); + /* * Allocated buffer can hold one more character to null-terminate the * string @@ -266,6 +271,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); * * The caller must have allocated sufficient memory for the @oname string. * + * If the key is available, we'll decrypt the disk name; otherwise, we'll encode + * it for presentation. Short names are directly base64-encoded, while long + * names are encoded in fscrypt_digested_name format. + * * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, @@ -274,7 +283,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); - char buf[24]; + struct fscrypt_digested_name digested_name; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; @@ -289,20 +298,24 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, if (inode->i_crypt_info) return fname_decrypt(inode, iname, oname); - if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { + if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) { oname->len = digest_encode(iname->name, iname->len, oname->name); return 0; } if (hash) { - memcpy(buf, &hash, 4); - memcpy(buf + 4, &minor_hash, 4); + digested_name.hash = hash; + digested_name.minor_hash = minor_hash; } else { - memset(buf, 0, 8); + digested_name.hash = 0; + digested_name.minor_hash = 0; } - memcpy(buf + 8, iname->name + iname->len - 16, 16); + memcpy(digested_name.digest, + FSCRYPT_FNAME_DIGEST(iname->name, iname->len), + FSCRYPT_FNAME_DIGEST_SIZE); oname->name[0] = '_'; - oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + oname->len = 1 + digest_encode((const char *)&digested_name, + sizeof(digested_name), oname->name + 1); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); @@ -332,14 +345,39 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, * in a directory. Consequently, a user space name cannot be mapped to * a disk-space name */ - return -EACCES; + return -ENOKEY; } EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); +/** + * fscrypt_setup_filename() - prepare to search a possibly encrypted directory + * @dir: the directory that will be searched + * @iname: the user-provided filename being searched for + * @lookup: 1 if we're allowed to proceed without the key because it's + * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot + * proceed without the key because we're going to create the dir_entry. + * @fname: the filename information to be filled in + * + * Given a user-provided filename @iname, this function sets @fname->disk_name + * to the name that would be stored in the on-disk directory entry, if possible. + * If the directory is unencrypted this is simply @iname. Else, if we have the + * directory's encryption key, then @iname is the plaintext, so we encrypt it to + * get the disk_name. 
+ * + * Else, for keyless @lookup operations, @iname is the presented ciphertext, so + * we decode it to get either the ciphertext disk_name (for short names) or the + * fscrypt_digested_name (for long names). Non-@lookup operations will be + * impossible in this case, so we fail them with ENOKEY. + * + * If successful, fscrypt_free_filename() must be called later to clean up. + * + * Return: 0 on success, -errno on failure + */ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { - int ret = 0, bigname = 0; + int ret; + int digested; memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; @@ -350,7 +388,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; return 0; } - ret = get_crypt_info(dir); + ret = fscrypt_get_encryption_info(dir); if (ret && ret != -EOPNOTSUPP) return ret; @@ -367,31 +405,43 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return 0; } if (!lookup) - return -EACCES; + return -ENOKEY; /* * We don't have the key and we are doing a lookup; decode the * user-supplied name */ - if (iname->name[0] == '_') - bigname = 1; - if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43))) - return -ENOENT; + if (iname->name[0] == '_') { + if (iname->len != + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))) + return -ENOENT; + digested = 1; + } else { + if (iname->len > + BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE)) + return -ENOENT; + digested = 0; + } - fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + fname->crypto_buf.name = + kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE, + sizeof(struct fscrypt_digested_name)), + GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = digest_decode(iname->name + bigname, iname->len - bigname, + ret = digest_decode(iname->name + digested, iname->len - digested, fname->crypto_buf.name); if (ret < 0) { ret = -ENOENT; goto errout; } fname->crypto_buf.len = ret; - if (bigname) { - memcpy(&fname->hash, fname->crypto_buf.name, 4); - memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4); + if (digested) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + fname->hash = n->hash; + fname->minor_hash = n->minor_hash; } else { fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; @@ -403,12 +453,3 @@ errout: return ret; } EXPORT_SYMBOL(fscrypt_setup_filename); - -void fscrypt_free_filename(struct fscrypt_name *fname) -{ - kfree(fname->crypto_buf.name); - fname->crypto_buf.name = NULL; - fname->usr_fname = NULL; - fname->disk_name.name = NULL; -} -EXPORT_SYMBOL(fscrypt_free_filename); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h new file mode 100644 index 000000000000..79d79755d79b --- /dev/null +++ b/fs/crypto/fscrypt_private.h @@ -0,0 +1,107 @@ +/* + * fscrypt_private.h + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions. + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. 
+ */ + +#ifndef _FSCRYPT_PRIVATE_H +#define _FSCRYPT_PRIVATE_H + +#include +#include + +/* Encryption parameters */ +#define FS_IV_SIZE 16 +#define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_128_CBC_KEY_SIZE 16 +#define FS_AES_128_CTS_KEY_SIZE 16 +#define FS_AES_256_GCM_KEY_SIZE 32 +#define FS_AES_256_CBC_KEY_SIZE 32 +#define FS_AES_256_CTS_KEY_SIZE 32 +#define FS_AES_256_XTS_KEY_SIZE 64 + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +} __packed; + +#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 + +/* + * A pointer to this structure is stored in the file system's in-core + * representation of an inode. + */ +struct fscrypt_info { + u8 ci_data_mode; + u8 ci_filename_mode; + u8 ci_flags; + struct crypto_skcipher *ci_ctfm; + struct crypto_cipher *ci_essiv_tfm; + u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; +}; + +typedef enum { + FS_DECRYPT = 0, + FS_ENCRYPT, +} fscrypt_direction_t; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002 + +struct fscrypt_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_FS_COMPLETION_RESULT(ecr) \ + struct fscrypt_completion_result ecr = { \ + COMPLETION_INITIALIZER_ONSTACK((ecr).completion), 0 } + +/* bio stuffs */ +#define REQ_OP_READ READ +#define REQ_OP_WRITE WRITE +#define bio_op(bio) ((bio)->bi_rw & 1) + +static inline void bio_set_op_attrs(struct bio *bio, unsigned op, + unsigned op_flags) +{ + bio->bi_rw = op | op_flags; +} + +/* crypto.c */ +extern int fscrypt_initialize(unsigned int cop_flags); +extern struct workqueue_struct *fscrypt_read_workqueue; +extern int fscrypt_do_page_crypto(const struct inode *inode, + fscrypt_direction_t rw, u64 lblk_num, + struct page *src_page, + struct page *dest_page, + unsigned int len, unsigned int offs, + gfp_t gfp_flags); +extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, + gfp_t gfp_flags); + +/* keyinfo.c */ +extern void __exit fscrypt_essiv_cleanup(void); + +#endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 67fb6d8876d0..66e0728e9bbe 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -10,7 +10,12 @@ #include #include -#include +#include +#include +#include +#include "fscrypt_private.h" + +static struct crypto_shash *essiv_hash_tfm; static void derive_crypt_complete(struct crypto_async_request *req, int rc) { @@ -27,13 +32,13 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc) * derive_key_aes() - Derive a key using AES-128-ECB * @deriving_key: Encryption key used for derivation. * @source_key: Source key to which to apply derivation. - * @derived_key: Derived key. + * @derived_raw_key: Derived raw key. * * Return: Zero on success; non-zero otherwise. 
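[Editorial aside: concretely, the derivation below is AES-128-ECB with the 16-byte per-inode nonce as the AES key, encrypting the master key, i.e. derived = AES-ECB(nonce, master_key), later truncated to the mode's keysize. A standalone illustration with OpenSSL, which is an assumption made for demonstration only; the kernel uses its own crypto_skcipher API:]

    #include <openssl/evp.h>

    /* derived = AES-128-ECB(key = nonce, data = master); mlen multiple of 16 */
    static int derive_key_aes_sketch(const unsigned char nonce[16],
                                     const unsigned char *master, int mlen,
                                     unsigned char *derived)
    {
            EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
            int outl = 0, tmp = 0, ok;

            if (!ctx)
                    return -1;
            ok = EVP_EncryptInit_ex(ctx, EVP_aes_128_ecb(), NULL, nonce, NULL) &&
                 EVP_CIPHER_CTX_set_padding(ctx, 0) &&  /* no padding: raw blocks */
                 EVP_EncryptUpdate(ctx, derived, &outl, master, mlen) &&
                 EVP_EncryptFinal_ex(ctx, derived + outl, &tmp);
            EVP_CIPHER_CTX_free(ctx);
            return ok ? 0 : -1;
    }
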
*/ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], - u8 source_key[FS_AES_256_XTS_KEY_SIZE], - u8 derived_key[FS_AES_256_XTS_KEY_SIZE]) + const struct fscrypt_key *source_key, + u8 derived_raw_key[FS_MAX_KEY_SIZE]) { int res = 0; struct skcipher_request *req = NULL; @@ -60,10 +65,10 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], if (res < 0) goto out; - sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE); - sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, - FS_AES_256_XTS_KEY_SIZE, NULL); + sg_init_one(&src_sg, source_key->raw, source_key->size); + sg_init_one(&dst_sg, derived_raw_key, source_key->size); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size, + NULL); res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { wait_for_completion(&ecr.completion); @@ -77,28 +82,25 @@ out: static int validate_user_key(struct fscrypt_info *crypt_info, struct fscrypt_context *ctx, u8 *raw_key, - u8 *prefix, int prefix_size) + const char *prefix, int min_keysize) { - u8 *full_key_descriptor; + char *description; struct key *keyring_key; struct fscrypt_key *master_key; const struct user_key_payload *ukp; - int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1; int res; - full_key_descriptor = kmalloc(full_key_len, GFP_NOFS); - if (!full_key_descriptor) + description = kasprintf(GFP_NOFS, "%s%*phN", prefix, + FS_KEY_DESCRIPTOR_SIZE, + ctx->master_key_descriptor); + if (!description) return -ENOMEM; - memcpy(full_key_descriptor, prefix, prefix_size); - sprintf(full_key_descriptor + prefix_size, - "%*phN", FS_KEY_DESCRIPTOR_SIZE, - ctx->master_key_descriptor); - full_key_descriptor[full_key_len - 1] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - kfree(full_key_descriptor); + keyring_key = request_key(&key_type_logon, description, NULL); + kfree(description); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); + down_read(&keyring_key->sem); if (keyring_key->type != &key_type_logon) { printk_once(KERN_WARNING @@ -106,66 +108,68 @@ static int validate_user_key(struct fscrypt_info *crypt_info, res = -ENOKEY; goto out; } - down_read(&keyring_key->sem); ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct fscrypt_key)) { res = -EINVAL; - up_read(&keyring_key->sem); goto out; } master_key = (struct fscrypt_key *)ukp->data; BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); - if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE + || master_key->size % AES_BLOCK_SIZE != 0) { printk_once(KERN_WARNING "%s: key size incorrect: %d\n", __func__, master_key->size); res = -ENOKEY; - up_read(&keyring_key->sem); goto out; } - res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); - up_read(&keyring_key->sem); - if (res) - goto out; - - crypt_info->ci_keyring_key = keyring_key; - return 0; + res = derive_key_aes(ctx->nonce, master_key, raw_key); out: + up_read(&keyring_key->sem); key_put(keyring_key); return res; } +static const struct { + const char *cipher_str; + int keysize; +} available_modes[] = { + [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)", + FS_AES_256_XTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", + FS_AES_256_CTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", + FS_AES_128_CBC_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", + 
FS_AES_128_CTS_KEY_SIZE }, +}; + static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, const char **cipher_str_ret, int *keysize_ret) { + u32 mode; + + if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) { + pr_warn_ratelimited("fscrypt: inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)\n", + inode->i_ino, + ci->ci_data_mode, ci->ci_filename_mode); + return -EINVAL; + } + if (S_ISREG(inode->i_mode)) { - if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) { - *cipher_str_ret = "xts(aes)"; - *keysize_ret = FS_AES_256_XTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported contents encryption mode " - "%d for inode %lu\n", - ci->ci_data_mode, inode->i_ino); - return -ENOKEY; + mode = ci->ci_data_mode; + } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + mode = ci->ci_filename_mode; + } else { + WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + inode->i_ino, (inode->i_mode & S_IFMT)); + return -EINVAL; } - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { - if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) { - *cipher_str_ret = "cts(cbc(aes))"; - *keysize_ret = FS_AES_256_CTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported filenames encryption mode " - "%d for inode %lu\n", - ci->ci_filename_mode, inode->i_ino); - return -ENOKEY; - } - - pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n", - (inode->i_mode & S_IFMT), inode->i_ino); - return -ENOKEY; + *cipher_str_ret = available_modes[mode].cipher_str; + *keysize_ret = available_modes[mode].keysize; + return 0; } static void put_crypt_info(struct fscrypt_info *ci) @@ -173,12 +177,78 @@ static void put_crypt_info(struct fscrypt_info *ci) if (!ci) return; - key_put(ci->ci_keyring_key); crypto_free_skcipher(ci->ci_ctfm); + crypto_free_cipher(ci->ci_essiv_tfm); kmem_cache_free(fscrypt_info_cachep, ci); } -int get_crypt_info(struct inode *inode) +static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) +{ + struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm); + + /* init hash transform on demand */ + if (unlikely(!tfm)) { + struct crypto_shash *prev_tfm; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + pr_warn_ratelimited("fscrypt: error allocating SHA-256 transform: %ld\n", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); + if (prev_tfm) { + crypto_free_shash(tfm); + tfm = prev_tfm; + } + } + + { + SHASH_DESC_ON_STACK(desc, tfm); + desc->tfm = tfm; + desc->flags = 0; + + return crypto_shash_digest(desc, key, keysize, salt); + } +} + +static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key, + int keysize) +{ + int err; + struct crypto_cipher *essiv_tfm; + u8 salt[SHA256_DIGEST_SIZE]; + + essiv_tfm = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(essiv_tfm)) + return PTR_ERR(essiv_tfm); + + ci->ci_essiv_tfm = essiv_tfm; + + err = derive_essiv_salt(raw_key, keysize, salt); + if (err) + goto out; + + /* + * Using SHA256 to derive the salt/key will result in AES-256 being + * used for IV generation. File contents encryption will still use the + * configured keysize (AES-128) nevertheless. 
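[Editorial aside: in formula form, the ESSIV scheme set up here is salt = SHA-256(contents_key), then IV = AES-256-Encrypt(salt, le64(lblk_num) zero-padded to 16 bytes). Because the salt is a 32-byte SHA-256 digest, the IV generator is AES-256 even when contents use AES-128-CBC, exactly as the comment above notes. A standalone sketch with OpenSSL's low-level primitives, for illustration only; it assumes a little-endian host:]

    #include <openssl/sha.h>
    #include <openssl/aes.h>
    #include <stdint.h>
    #include <string.h>

    /* sketch: ESSIV IV generation as wired up by init_essiv_generator() */
    static void essiv_iv_sketch(const unsigned char *contents_key, int keysize,
                                uint64_t lblk_num, unsigned char iv[16])
    {
            unsigned char salt[SHA256_DIGEST_LENGTH];   /* 32 bytes -> AES-256 */
            unsigned char block[16] = { 0 };
            AES_KEY essiv;

            SHA256(contents_key, keysize, salt);        /* salt = SHA-256(key) */
            AES_set_encrypt_key(salt, 256, &essiv);

            memcpy(block, &lblk_num, sizeof(lblk_num)); /* le64 blocknr, zero padded */
            AES_encrypt(block, iv, &essiv);             /* IV = E_salt(blocknr) */
    }
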
+ */ + err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt)); + if (err) + goto out; + +out: + memzero_explicit(salt, sizeof(salt)); + return err; +} + +void __exit fscrypt_essiv_cleanup(void) +{ + crypto_free_shash(essiv_hash_tfm); +} + +int fscrypt_get_encryption_info(struct inode *inode) { struct fscrypt_info *crypt_info; struct fscrypt_context ctx; @@ -188,30 +258,24 @@ int get_crypt_info(struct inode *inode) u8 *raw_key = NULL; int res; - res = fscrypt_initialize(); + if (inode->i_crypt_info) + return 0; + + res = fscrypt_initialize(inode->i_sb->s_cop->flags); if (res) return res; - if (!inode->i_sb->s_cop->get_context) - return -EOPNOTSUPP; -retry: - crypt_info = ACCESS_ONCE(inode->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - fscrypt_put_encryption_info(inode, crypt_info); - goto retry; - } - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { - if (!fscrypt_dummy_context_enabled(inode)) + if (!fscrypt_dummy_context_enabled(inode) || + inode->i_sb->s_cop->is_encrypted(inode)) return res; + /* Fake up a context for an unencrypted directory */ + memset(&ctx, 0, sizeof(ctx)); ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); } else if (res != sizeof(ctx)) { return -EINVAL; } @@ -230,7 +294,7 @@ retry: crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; + crypt_info->ci_essiv_tfm = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); @@ -247,20 +311,12 @@ retry: if (!raw_key) goto out; - if (fscrypt_dummy_context_enabled(inode)) { - memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); - goto got_key; - } - - res = validate_user_key(crypt_info, &ctx, raw_key, - FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE); + res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX, + keysize); if (res && inode->i_sb->s_cop->key_prefix) { - u8 *prefix = NULL; - int prefix_size, res2; - - prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix); - res2 = validate_user_key(crypt_info, &ctx, raw_key, - prefix, prefix_size); + int res2 = validate_user_key(crypt_info, &ctx, raw_key, + inode->i_sb->s_cop->key_prefix, + keysize); if (res2) { if (res2 == -ENOKEY) res = -ENOKEY; @@ -269,30 +325,35 @@ retry: } else if (res) { goto out; } -got_key: ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { res = ctfm ? 
PTR_ERR(ctfm) : -ENOMEM;
-		printk(KERN_DEBUG
-		       "%s: error %d (inode %u) allocating crypto tfm\n",
-		       __func__, res, (unsigned) inode->i_ino);
+		pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n",
+			 __func__, res, inode->i_ino);
 		goto out;
 	}
 	crypt_info->ci_ctfm = ctfm;
 	crypto_skcipher_clear_flags(ctfm, ~0);
 	crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY);
+	/*
+	 * if the provided key is longer than keysize, we use the first
+	 * keysize bytes of the derived key only
+	 */
 	res = crypto_skcipher_setkey(ctfm, raw_key, keysize);
 	if (res)
 		goto out;
 
-	kzfree(raw_key);
-	raw_key = NULL;
-	if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) {
-		put_crypt_info(crypt_info);
-		goto retry;
+	if (S_ISREG(inode->i_mode) &&
+	    crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) {
+		res = init_essiv_generator(crypt_info, raw_key, keysize);
+		if (res) {
+			pr_debug("%s: error %d (inode %lu) allocating essiv tfm\n",
+				 __func__, res, inode->i_ino);
+			goto out;
+		}
 	}
-	return 0;
-
+	if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL)
+		crypt_info = NULL;
 out:
 	if (res == -ENOKEY)
 		res = 0;
@@ -300,6 +361,7 @@ out:
 	kzfree(raw_key);
 	return res;
 }
+EXPORT_SYMBOL(fscrypt_get_encryption_info);
 
 void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci)
 {
@@ -317,17 +379,3 @@ void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci)
 	put_crypt_info(ci);
 }
 EXPORT_SYMBOL(fscrypt_put_encryption_info);
-
-int fscrypt_get_encryption_info(struct inode *inode)
-{
-	struct fscrypt_info *ci = inode->i_crypt_info;
-
-	if (!ci ||
-	    (ci->ci_keyring_key &&
-	     (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
-					   (1 << KEY_FLAG_REVOKED) |
-					   (1 << KEY_FLAG_DEAD)))))
-		return get_crypt_info(inode);
-	return 0;
-}
-EXPORT_SYMBOL(fscrypt_get_encryption_info);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 6865663aac69..9914d51dff86 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -10,76 +10,37 @@
 #include <linux/random.h>
 #include <linux/string.h>
-#include <linux/fscrypto.h>
 #include <linux/mount.h>
-
-static int inode_has_encryption_context(struct inode *inode)
-{
-	if (!inode->i_sb->s_cop->get_context)
-		return 0;
-	return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0);
-}
+#include "fscrypt_private.h"
 
 /*
- * check whether the policy is consistent with the encryption context
- * for the inode
+ * check whether an encryption policy is consistent with an encryption context
 */
-static int is_encryption_context_consistent_with_policy(struct inode *inode,
+static bool is_encryption_context_consistent_with_policy(
+				const struct fscrypt_context *ctx,
 				const struct fscrypt_policy *policy)
 {
-	struct fscrypt_context ctx;
-	int res;
-
-	if (!inode->i_sb->s_cop->get_context)
-		return 0;
-
-	res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
-	if (res != sizeof(ctx))
-		return 0;
-
-	return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
-			FS_KEY_DESCRIPTOR_SIZE) == 0 &&
-		(ctx.flags == policy->flags) &&
-		(ctx.contents_encryption_mode ==
-		 policy->contents_encryption_mode) &&
-		(ctx.filenames_encryption_mode ==
-		 policy->filenames_encryption_mode));
+	return memcmp(ctx->master_key_descriptor, policy->master_key_descriptor,
+		      FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+		(ctx->flags == policy->flags) &&
+		(ctx->contents_encryption_mode ==
+		 policy->contents_encryption_mode) &&
+		(ctx->filenames_encryption_mode ==
+		 policy->filenames_encryption_mode);
 }
 
 static int create_encryption_context_from_policy(struct inode *inode,
 				const struct fscrypt_policy *policy)
 {
 	struct
fscrypt_context ctx; - int res; - - if (!inode->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - - if (inode->i_sb->s_cop->prepare_context) { - res = inode->i_sb->s_cop->prepare_context(inode); - if (res) - return res; - } ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); - if (!fscrypt_valid_contents_enc_mode( - policy->contents_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid contents encryption mode %d\n", __func__, - policy->contents_encryption_mode); + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + policy->filenames_encryption_mode)) return -EINVAL; - } - - if (!fscrypt_valid_filenames_enc_mode( - policy->filenames_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid filenames encryption mode %d\n", __func__, - policy->filenames_encryption_mode); - return -EINVAL; - } if (policy->flags & ~FS_POLICY_FLAGS_VALID) return -EINVAL; @@ -93,16 +54,20 @@ static int create_encryption_context_from_policy(struct inode *inode, return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); } -int fscrypt_process_policy(struct file *filp, - const struct fscrypt_policy *policy) +int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int ret; + struct fscrypt_context ctx; + + if (copy_from_user(&policy, arg, sizeof(policy))) + return -EFAULT; if (!inode_owner_or_capable(inode)) return -EACCES; - if (policy->version != 0) + if (policy.version != 0) return -EINVAL; ret = mnt_want_write_file(filp); @@ -111,22 +76,23 @@ int fscrypt_process_policy(struct file *filp, inode_lock(inode); - if (!inode_has_encryption_context(inode)) { + ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (ret == -ENODATA) { if (!S_ISDIR(inode->i_mode)) - ret = -EINVAL; - else if (!inode->i_sb->s_cop->empty_dir) - ret = -EOPNOTSUPP; + ret = -ENOTDIR; else if (!inode->i_sb->s_cop->empty_dir(inode)) ret = -ENOTEMPTY; else ret = create_encryption_context_from_policy(inode, - policy); - } else if (!is_encryption_context_consistent_with_policy(inode, - policy)) { - printk(KERN_WARNING - "%s: Policy inconsistent with encryption context\n", - __func__); - ret = -EINVAL; + &policy); + } else if (ret == sizeof(ctx) && + is_encryption_context_consistent_with_policy(&ctx, + &policy)) { + /* The file already uses the same encryption policy. */ + ret = 0; + } else if (ret >= 0 || ret == -ERANGE) { + /* The file already uses a different encryption policy. 
*/ + ret = -EEXIST; } inode_unlock(inode); @@ -134,49 +100,94 @@ int fscrypt_process_policy(struct file *filp, mnt_drop_write_file(filp); return ret; } -EXPORT_SYMBOL(fscrypt_process_policy); +EXPORT_SYMBOL(fscrypt_ioctl_set_policy); -int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) +int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { + struct inode *inode = file_inode(filp); struct fscrypt_context ctx; + struct fscrypt_policy policy; int res; - if (!inode->i_sb->s_cop->get_context || - !inode->i_sb->s_cop->is_encrypted(inode)) + if (!inode->i_sb->s_cop->is_encrypted(inode)) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0 && res != -ERANGE) + return res; if (res != sizeof(ctx)) - return -ENODATA; + return -EINVAL; if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) return -EINVAL; - policy->version = 0; - policy->contents_encryption_mode = ctx.contents_encryption_mode; - policy->filenames_encryption_mode = ctx.filenames_encryption_mode; - policy->flags = ctx.flags; - memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + policy.version = 0; + policy.contents_encryption_mode = ctx.contents_encryption_mode; + policy.filenames_encryption_mode = ctx.filenames_encryption_mode; + policy.flags = ctx.flags; + memcpy(policy.master_key_descriptor, ctx.master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); + + if (copy_to_user(arg, &policy, sizeof(policy))) + return -EFAULT; return 0; } -EXPORT_SYMBOL(fscrypt_get_policy); +EXPORT_SYMBOL(fscrypt_ioctl_get_policy); +/** + * fscrypt_has_permitted_context() - is a file's encryption policy permitted + * within its directory? + * + * @parent: inode for parent directory + * @child: inode for file being looked up, opened, or linked into @parent + * + * Filesystems must call this before permitting access to an inode in a + * situation where the parent directory is encrypted (either before allowing + * ->lookup() to succeed, or for a regular file before allowing it to be opened) + * and before any operation that involves linking an inode into an encrypted + * directory, including link, rename, and cross rename. It enforces the + * constraint that within a given encrypted directory tree, all files use the + * same encryption policy. The pre-access check is needed to detect potentially + * malicious offline violations of this constraint, while the link and rename + * checks are needed to prevent online violations of this constraint. + * + * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail + * the filesystem operation with EPERM. 
+ */ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { - struct fscrypt_info *parent_ci, *child_ci; + const struct fscrypt_operations *cops = parent->i_sb->s_cop; + const struct fscrypt_info *parent_ci, *child_ci; + struct fscrypt_context parent_ctx, child_ctx; int res; - if ((parent == NULL) || (child == NULL)) { - printk(KERN_ERR "parent %p child %p\n", parent, child); - BUG_ON(1); - } - - /* no restrictions if the parent directory is not encrypted */ - if (!parent->i_sb->s_cop->is_encrypted(parent)) + /* No restrictions on file types which are never encrypted */ + if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && + !S_ISLNK(child->i_mode)) return 1; - /* if the child directory is not encrypted, this is always a problem */ - if (!parent->i_sb->s_cop->is_encrypted(child)) + + /* No restrictions if the parent directory is unencrypted */ + if (!cops->is_encrypted(parent)) + return 1; + + /* Encrypted directories must not contain unencrypted files */ + if (!cops->is_encrypted(child)) return 0; + + /* + * Both parent and child are encrypted, so verify they use the same + * encryption policy. Compare the fscrypt_info structs if the keys are + * available, otherwise retrieve and compare the fscrypt_contexts. + * + * Note that the fscrypt_context retrieval will be required frequently + * when accessing an encrypted directory tree without the key. + * Performance-wise this is not a big deal because we already don't + * really optimize for file access without the key (to the extent that + * such access is even possible), given that any attempted access + * already causes a fscrypt_context retrieval and keyring search. + * + * In any case, if an unexpected error occurs, fall back to "forbidden". + */ + res = fscrypt_get_encryption_info(parent); if (res) return 0; @@ -185,17 +196,32 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) return 0; parent_ci = parent->i_crypt_info; child_ci = child->i_crypt_info; - if (!parent_ci && !child_ci) - return 1; - if (!parent_ci || !child_ci) + + if (parent_ci && child_ci) { + return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == + child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags); + } + + res = cops->get_context(parent, &parent_ctx, sizeof(parent_ctx)); + if (res != sizeof(parent_ctx)) return 0; - return (memcmp(parent_ci->ci_master_key, - child_ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags)); + res = cops->get_context(child, &child_ctx, sizeof(child_ctx)); + if (res != sizeof(child_ctx)) + return 0; + + return memcmp(parent_ctx.master_key_descriptor, + child_ctx.master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ctx.contents_encryption_mode == + child_ctx.contents_encryption_mode) && + (parent_ctx.filenames_encryption_mode == + child_ctx.filenames_encryption_mode) && + (parent_ctx.flags == child_ctx.flags); } EXPORT_SYMBOL(fscrypt_has_permitted_context); @@ -204,9 +230,9 @@ EXPORT_SYMBOL(fscrypt_has_permitted_context); * @parent: Parent inode from which the context is inherited. * @child: Child inode that inherits the context from @parent. * @fs_data: private data given by FS. 
- * @preload: preload child i_crypt_info + * @preload: preload child i_crypt_info if true * - * Return: Zero on success, non-zero otherwise + * Return: 0 on success, -errno on failure */ int fscrypt_inherit_context(struct inode *parent, struct inode *child, void *fs_data, bool preload) @@ -215,9 +241,6 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, struct fscrypt_info *ci; int res; - if (!parent->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - res = fscrypt_get_encryption_info(parent); if (res < 0) return res; @@ -227,19 +250,11 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, return -ENOKEY; ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; - if (fscrypt_dummy_context_enabled(parent)) { - ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; - memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); - res = 0; - } else { - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE); - } + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE); get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); res = parent->i_sb->s_cop->set_context(child, &ctx, sizeof(ctx), fs_data); diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index ca949ea7c02f..a0dc559b1b47 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o -f2fs-y += shrinker.o extent_cache.o +f2fs-y += shrinker.o extent_cache.o sysfs.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index a45d1f4b7b0f..112f8e04c549 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -210,15 +210,16 @@ static int __f2fs_set_acl(struct inode *inode, int type, void *value = NULL; size_t size = 0; int error; + umode_t mode = inode->i_mode; switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (acl && !ipage) { + error = posix_acl_update_mode(inode, &mode, &acl); if (error) return error; - set_acl_inode(inode, inode->i_mode); + set_acl_inode(inode, mode); } break; @@ -236,7 +237,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); if (IS_ERR(value)) { clear_inode_flag(inode, FI_ACL_MODE); - return (int)PTR_ERR(value); + return PTR_ERR(value); } } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 45ef3b6bfb04..e86f67ac96c6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -31,7 +31,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) set_ckpt_flags(sbi, CP_ERROR_FLAG); sbi->sb->s_flags |= MS_RDONLY; if (!end_io) - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); } /* @@ -163,6 +163,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .op_flags = sync ? 
(REQ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, + .in_list = false, }; struct blk_plug plug; @@ -208,12 +209,10 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, } fio.page = page; - fio.old_blkaddr = fio.new_blkaddr; - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_bio(&fio); f2fs_put_page(page, 0); } out: - f2fs_submit_merged_bio(sbi, META, READ); blk_finish_plug(&plug); return blkno - start; } @@ -232,8 +231,9 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } -static int f2fs_write_meta_page(struct page *page, - struct writeback_control *wbc) +static int __f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc, + enum iostat_type io_type) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); @@ -246,16 +246,17 @@ static int f2fs_write_meta_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - write_meta_page(sbi, page); + write_meta_page(sbi, page, io_type); dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE); + f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, META); unlock_page(page); if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, META, WRITE); + f2fs_submit_merged_write(sbi, META); return 0; @@ -264,23 +265,33 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } +static int f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc) +{ + return __f2fs_write_meta_page(page, wbc, FS_META_IO); +} + static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff, written; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + /* collect a number of dirty meta pages and write together */ if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, META); + /* if locked failed, cp will flush dirty pages instead */ + if (!mutex_trylock(&sbi->cp_mutex)) + goto skip_write; - /* if mounting is failed, skip writing node pages */ - mutex_lock(&sbi->cp_mutex); + trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); - written = sync_meta_pages(sbi, META, wbc->nr_to_write); + written = sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -292,7 +303,7 @@ skip_write: } long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write) + long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; @@ -343,7 +354,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - if (mapping->a_ops->writepage(page, &wbc)) { + if (__f2fs_write_meta_page(page, &wbc, io_type)) { unlock_page(page); break; } @@ -357,7 +368,7 @@ continue_unlock: } stop: if (nwritten) - f2fs_submit_merged_bio(sbi, type, WRITE); + f2fs_submit_merged_write(sbi, type); blk_finish_plug(&plug); @@ -494,6 +505,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); + f2fs_show_injection_info(FAULT_ORPHAN); return -ENOSPC; } #endif @@ -566,7 +578,7 @@ static int recover_orphan_inode(struct 
f2fs_sb_info *sbi, nid_t ino)
 	if (ni.blk_addr != NULL_ADDR) {
 		set_sbi_flag(sbi, SBI_NEED_FSCK);
 		f2fs_msg(sbi->sb, KERN_WARNING,
-			"%s: orphan failed (ino=%x), run fsck to fix.",
+			"%s: orphan failed (ino=%x) by kernel, retry mount.",
 				__func__, ino);
 		return -EIO;
 	}
@@ -577,11 +589,24 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 {
 	block_t start_blk, orphan_blocks, i, j;
-	int err;
+	unsigned int s_flags = sbi->sb->s_flags;
+	int err = 0;
 
 	if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG))
 		return 0;
 
+	if (s_flags & MS_RDONLY) {
+		f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs");
+		sbi->sb->s_flags &= ~MS_RDONLY;
+	}
+
+#ifdef CONFIG_QUOTA
+	/* Needed for iput() to work correctly and not trash data */
+	sbi->sb->s_flags |= MS_ACTIVE;
+	/* Turn on quotas so that they are updated correctly */
+	f2fs_enable_quota_files(sbi);
+#endif
+
 	start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
 	orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi);
 
@@ -597,14 +622,21 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 			err = recover_orphan_inode(sbi, ino);
 			if (err) {
 				f2fs_put_page(page, 1);
-				return err;
+				goto out;
 			}
 		}
 		f2fs_put_page(page, 1);
 	}
 	/* clear Orphan Flag */
 	clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG);
-	return 0;
+out:
+#ifdef CONFIG_QUOTA
+	/* Turn quotas off */
+	f2fs_quota_off_umount(sbi->sb);
+#endif
+	sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+
+	return err;
 }
 
 static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
@@ -676,14 +708,13 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr,
 	*cp_block = (struct f2fs_checkpoint *)page_address(*cp_page);
 
 	crc_offset = le32_to_cpu((*cp_block)->checksum_offset);
-	if (crc_offset >= blk_size) {
+	if (crc_offset > (blk_size - sizeof(__le32))) {
 		f2fs_msg(sbi->sb, KERN_WARNING,
 			"invalid crc_offset: %zu", crc_offset);
 		return -EINVAL;
 	}
 
-	crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block
-						+ crc_offset)));
+	crc = cur_cp_crc(*cp_block);
 	if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) {
 		f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value");
 		return -EINVAL;
@@ -816,7 +847,9 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type)
 		return;
 
 	set_inode_flag(inode, flag);
-	list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]);
+	if (!f2fs_is_volatile_file(inode))
+		list_add_tail(&F2FS_I(inode)->dirty_list,
+						&sbi->inode_list[type]);
 	stat_inc_dirty_inode(sbi, type);
 }
 
@@ -874,6 +907,7 @@ int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type)
 	struct inode *inode;
 	struct f2fs_inode_info *fi;
 	bool is_dir = (type == DIR_INODE);
+	unsigned long ino = 0;
 
 	trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir,
 				get_pages(sbi, is_dir ?
@@ -896,14 +930,30 @@ retry:
 	inode = igrab(&fi->vfs_inode);
 	spin_unlock(&sbi->inode_lock[type]);
 	if (inode) {
+		unsigned long cur_ino = inode->i_ino;
+
+		if (is_dir)
+			F2FS_I(inode)->cp_task = current;
+
 		filemap_fdatawrite(inode->i_mapping);
+
+		if (is_dir)
+			F2FS_I(inode)->cp_task = NULL;
+
 		iput(inode);
+		/* We need to give the CPU to other writers. */
+		if (ino == cur_ino) {
+			congestion_wait(BLK_RW_ASYNC, HZ/50);
+			cond_resched();
+		} else {
+			ino = cur_ino;
+		}
 	} else {
 		/*
 		 * We should submit bio, since it exists several
 		 * writebacking dentry pages in the freeing inode.
*/ - f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_write(sbi, DATA); cond_resched(); } goto retry; @@ -941,6 +991,19 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) return 0; } +static void __prepare_cp_block(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + nid_t last_nid = nm_i->next_scan_nid; + + next_free_nid(sbi, &last_nid); + ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); + ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); + ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); + ckpt->next_free_nid = cpu_to_le32(last_nid); +} + /* * Freeze all the FS-operations for checkpoint. */ @@ -964,33 +1027,47 @@ retry_flush_dents: err = sync_dirty_inodes(sbi, DIR_INODE); if (err) goto out; - goto retry_flush_dents; - } - - if (get_pages(sbi, F2FS_DIRTY_IMETA)) { - f2fs_unlock_all(sbi); - err = f2fs_sync_inode_meta(sbi); - if (err) - goto out; + cond_resched(); goto retry_flush_dents; } /* * POR: we should ensure that there are no dirty node pages - * until finishing nat/sit flush. + * until finishing nat/sit flush. inode->i_blocks can be updated. */ + down_write(&sbi->node_change); + + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + up_write(&sbi->node_change); + f2fs_unlock_all(sbi); + err = f2fs_sync_inode_meta(sbi); + if (err) + goto out; + cond_resched(); + goto retry_flush_dents; + } + retry_flush_nodes: down_write(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, &wbc); + err = sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); if (err) { + up_write(&sbi->node_change); f2fs_unlock_all(sbi); goto out; } + cond_resched(); goto retry_flush_nodes; } + + /* + * sbi->node_change is used only for AIO write_begin path which produces + * dirty node blocks and some checkpoint values by block allocation. 
+ */ + __prepare_cp_block(sbi); + up_write(&sbi->node_change); out: blk_finish_plug(&plug); return err; @@ -999,8 +1076,6 @@ out: static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); - - build_free_nids(sbi, false); f2fs_unlock_all(sbi); } @@ -1023,15 +1098,24 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) { unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned long flags; - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); - if (cpc->reason == CP_UMOUNT) + if ((cpc->reason & CP_UMOUNT) && + le32_to_cpu(ckpt->cp_pack_total_block_count) > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) + disable_nat_bits(sbi, false); + + if (cpc->reason & CP_TRIMMED) + __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + + if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); else __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - if (cpc->reason == CP_FASTBOOT) + if (cpc->reason & CP_FASTBOOT) __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); else __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); @@ -1047,15 +1131,14 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* set this flag to activate crc|cp_ver for recovery */ __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; - nid_t last_nid = nm_i->next_scan_nid; + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num, flags; block_t start_blk; unsigned int data_sum_blocks, orphan_blocks; __u32 crc32 = 0; @@ -1067,19 +1150,16 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } - next_free_nid(sbi, &last_nid); - /* * modify checkpoint * version number is already updated */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); - ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { ckpt->cur_node_segno[i] = @@ -1098,18 +1178,14 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); } - ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); - ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); - ckpt->next_free_nid = cpu_to_le32(last_nid); - /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + @@ -1138,6 +1214,27 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_next_addr(sbi); + /* write nat bits */ + if (enabled_nat_bits(sbi, cpc)) { + __u64 cp_ver = cur_cp_version(ckpt); + 
block_t blk;
+
+		cp_ver |= ((__u64)crc32 << 32);
+		*(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver);
+
+		blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks;
+		for (i = 0; i < nm_i->nat_bits_blocks; i++)
+			update_meta_page(sbi, nm_i->nat_bits +
+					(i << F2FS_BLKSIZE_BITS), blk + i);
+
+		/* Flush all the NAT BITS pages */
+		while (get_pages(sbi, F2FS_DIRTY_META)) {
+			sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
+			if (unlikely(f2fs_cp_error(sbi)))
+				return -EIO;
+		}
+	}
+
 	/* need to wait for end_io results */
 	wait_on_all_pages_writeback(sbi);
 	if (unlikely(f2fs_cp_error(sbi)))
@@ -1187,7 +1284,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	percpu_counter_set(&sbi->alloc_valid_block_count, 0);
 
 	/* Here, we only have one bio having CP pack */
-	sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
+	sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO);
 
 	/* wait for previous submitted meta pages writeback */
 	wait_on_all_pages_writeback(sbi);
@@ -1226,8 +1323,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	mutex_lock(&sbi->cp_mutex);
 
 	if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
-		(cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC ||
-		(cpc->reason == CP_DISCARD && !sbi->discard_blks)))
+		((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
+		((cpc->reason & CP_DISCARD) && !sbi->discard_blks)))
 		goto out;
 	if (unlikely(f2fs_cp_error(sbi))) {
 		err = -EIO;
@@ -1246,10 +1343,10 @@
 
 	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
 
-	f2fs_flush_merged_bios(sbi);
+	f2fs_flush_merged_writes(sbi);
 
 	/* this is the case of multiple fstrims without any changes */
-	if (cpc->reason == CP_DISCARD) {
+	if (cpc->reason & CP_DISCARD) {
 		if (!exist_trim_candidates(sbi, cpc)) {
 			unblock_operations(sbi);
 			goto out;
@@ -1274,7 +1371,7 @@
 	ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
 
 	/* write cached NAT/SIT entries to NAT/SIT area */
-	flush_nat_entries(sbi);
+	flush_nat_entries(sbi, cpc);
 	flush_sit_entries(sbi, cpc);
 
 	/* unlock all the fs_lock[] in do_checkpoint() */
@@ -1287,7 +1384,7 @@
 	unblock_operations(sbi);
 	stat_inc_cp_count(sbi->stat_info);
 
-	if (cpc->reason == CP_RECOVERY)
+	if (cpc->reason & CP_RECOVERY)
 		f2fs_msg(sbi->sb, KERN_NOTICE,
 			"checkpoint: version = %llx", ckpt_ver);
diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c
deleted file mode 100644
index 18595d7a0efc..000000000000
--- a/fs/f2fs/crypto_key.c
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * linux/fs/f2fs/crypto_key.c
- *
- * Copied from linux/fs/ext4/crypto_key.c
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * This contains encryption key functions for f2fs
- *
- * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
- */
-#include <keys/encrypted-type.h>
-#include <keys/user-type.h>
-#include <linux/random.h>
-#include <linux/scatterlist.h>
-#include <uapi/linux/keyctl.h>
-#include <crypto/hash.h>
-#include <linux/f2fs_fs.h>
-
-#include "f2fs.h"
-#include "xattr.h"
-
-static void derive_crypt_complete(struct crypto_async_request *req, int rc)
-{
-	struct f2fs_completion_result *ecr = req->data;
-
-	if (rc == -EINPROGRESS)
-		return;
-
-	ecr->res = rc;
-	complete(&ecr->completion);
-}
-
-/**
- * f2fs_derive_key_aes() - Derive a key using AES-128-ECB
- * @deriving_key: Encryption key used for derivation.
- * @source_key:   Source key to which to apply derivation.
- * @derived_key:  Derived key.
- *
- * Return: Zero on success; non-zero otherwise.
- */ -static int f2fs_derive_key_aes(char deriving_key[F2FS_AES_128_ECB_KEY_SIZE], - char source_key[F2FS_AES_256_XTS_KEY_SIZE], - char derived_key[F2FS_AES_256_XTS_KEY_SIZE]) -{ - int res = 0; - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); - struct scatterlist src_sg, dst_sg; - struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, - 0); - - if (IS_ERR(tfm)) { - res = PTR_ERR(tfm); - tfm = NULL; - goto out; - } - crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); - req = ablkcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - res = -ENOMEM; - goto out; - } - ablkcipher_request_set_callback(req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - derive_crypt_complete, &ecr); - res = crypto_ablkcipher_setkey(tfm, deriving_key, - F2FS_AES_128_ECB_KEY_SIZE); - if (res < 0) - goto out; - - sg_init_one(&src_sg, source_key, F2FS_AES_256_XTS_KEY_SIZE); - sg_init_one(&dst_sg, derived_key, F2FS_AES_256_XTS_KEY_SIZE); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, - F2FS_AES_256_XTS_KEY_SIZE, NULL); - res = crypto_ablkcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); - wait_for_completion(&ecr.completion); - res = ecr.res; - } -out: - if (req) - ablkcipher_request_free(req); - if (tfm) - crypto_free_ablkcipher(tfm); - return res; -} - -static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci) -{ - if (!ci) - return; - - crypto_free_ablkcipher(ci->ci_ctfm); - kmem_cache_free(f2fs_crypt_info_cachep, ci); -} - -void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_crypt_info *prev; - - if (ci == NULL) - ci = ACCESS_ONCE(fi->i_crypt_info); - if (ci == NULL) - return; - prev = cmpxchg(&fi->i_crypt_info, ci, NULL); - if (prev != ci) - return; - - f2fs_free_crypt_info(ci); -} - -int f2fs_get_encryption_info(struct inode *inode) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_crypt_info *crypt_info; - char full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE + - (F2FS_KEY_DESCRIPTOR_SIZE * 2) + 1]; - struct key *keyring_key = NULL; - struct f2fs_encryption_key *master_key; - struct f2fs_encryption_context ctx; - const struct user_key_payload *ukp; - struct crypto_ablkcipher *ctfm; - const char *cipher_str; - char raw_key[F2FS_MAX_KEY_SIZE]; - char mode; - int res; - - if (fi->i_crypt_info) - return 0; - - res = f2fs_crypto_initialize(); - if (res) - return res; - - res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx), NULL); - if (res < 0) - return res; - else if (res != sizeof(ctx)) - return -EINVAL; - res = 0; - - crypt_info = kmem_cache_alloc(f2fs_crypt_info_cachep, GFP_NOFS); - if (!crypt_info) - return -ENOMEM; - - crypt_info->ci_flags = ctx.flags; - crypt_info->ci_data_mode = ctx.contents_encryption_mode; - crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; - crypt_info->ci_ctfm = NULL; - memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, - sizeof(crypt_info->ci_master_key)); - if (S_ISREG(inode->i_mode)) - mode = crypt_info->ci_data_mode; - else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - mode = crypt_info->ci_filename_mode; - else - BUG(); - - switch (mode) { - case F2FS_ENCRYPTION_MODE_AES_256_XTS: - cipher_str = "xts(aes)"; - break; - case F2FS_ENCRYPTION_MODE_AES_256_CTS: - cipher_str = "cts(cbc(aes))"; - break; - default: - printk_once(KERN_WARNING - "f2fs: unsupported key mode %d 
(ino %u)\n", - mode, (unsigned) inode->i_ino); - res = -ENOKEY; - goto out; - } - - memcpy(full_key_descriptor, F2FS_KEY_DESC_PREFIX, - F2FS_KEY_DESC_PREFIX_SIZE); - sprintf(full_key_descriptor + F2FS_KEY_DESC_PREFIX_SIZE, - "%*phN", F2FS_KEY_DESCRIPTOR_SIZE, - ctx.master_key_descriptor); - full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE + - (2 * F2FS_KEY_DESCRIPTOR_SIZE)] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - if (IS_ERR(keyring_key)) { - res = PTR_ERR(keyring_key); - keyring_key = NULL; - goto out; - } - BUG_ON(keyring_key->type != &key_type_logon); - ukp = user_key_payload(keyring_key); - if (ukp->datalen != sizeof(struct f2fs_encryption_key)) { - res = -EINVAL; - goto out; - } - master_key = (struct f2fs_encryption_key *)ukp->data; - BUILD_BUG_ON(F2FS_AES_128_ECB_KEY_SIZE != - F2FS_KEY_DERIVATION_NONCE_SIZE); - BUG_ON(master_key->size != F2FS_AES_256_XTS_KEY_SIZE); - res = f2fs_derive_key_aes(ctx.nonce, master_key->raw, - raw_key); - if (res) - goto out; - - ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); - if (!ctfm || IS_ERR(ctfm)) { - res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; - printk(KERN_DEBUG - "%s: error %d (inode %u) allocating crypto tfm\n", - __func__, res, (unsigned) inode->i_ino); - goto out; - } - crypt_info->ci_ctfm = ctfm; - crypto_ablkcipher_clear_flags(ctfm, ~0); - crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), - CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_ablkcipher_setkey(ctfm, raw_key, - f2fs_encryption_key_size(mode)); - if (res) - goto out; - - if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) == NULL) - crypt_info = NULL; -out: - if (res == -ENOKEY && !S_ISREG(inode->i_mode)) - res = 0; - key_put(keyring_key); - f2fs_free_crypt_info(crypt_info); - memzero_explicit(raw_key, sizeof(raw_key)); - return res; -} - -int f2fs_has_encryption_key(struct inode *inode) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - - return (fi->i_crypt_info != NULL); -} diff --git a/fs/f2fs/crypto_policy.c b/fs/f2fs/crypto_policy.c deleted file mode 100644 index 884f3f0fe29d..000000000000 --- a/fs/f2fs/crypto_policy.c +++ /dev/null @@ -1,248 +0,0 @@ -/* - * copied from linux/fs/ext4/crypto_policy.c - * - * Copyright (C) 2015, Google, Inc. - * Copyright (C) 2015, Motorola Mobility. - * - * This contains encryption policy functions for f2fs with some modifications - * to support f2fs-specific xattr APIs. - * - * Written by Michael Halcrow, 2015. - * Modified by Jaegeuk Kim, 2015. 
- */
-#include <linux/random.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/fs.h>
-
-#include "f2fs.h"
-#include "xattr.h"
-
-static int f2fs_inode_has_encryption_context(struct inode *inode)
-{
-	int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
-			F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0, NULL);
-	return (res > 0);
-}
-
-/*
- * check whether the policy is consistent with the encryption context
- * for the inode
- */
-static int f2fs_is_encryption_context_consistent_with_policy(
-	struct inode *inode, const struct f2fs_encryption_policy *policy)
-{
-	struct f2fs_encryption_context ctx;
-	int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
-			F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
-			sizeof(ctx), NULL);
-
-	if (res != sizeof(ctx))
-		return 0;
-
-	return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
-			F2FS_KEY_DESCRIPTOR_SIZE) == 0 &&
-			(ctx.flags == policy->flags) &&
-			(ctx.contents_encryption_mode ==
-			 policy->contents_encryption_mode) &&
-			(ctx.filenames_encryption_mode ==
-			 policy->filenames_encryption_mode));
-}
-
-static int f2fs_create_encryption_context_from_policy(
-	struct inode *inode, const struct f2fs_encryption_policy *policy)
-{
-	struct f2fs_encryption_context ctx;
-
-	ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1;
-	memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
-			F2FS_KEY_DESCRIPTOR_SIZE);
-
-	if (!f2fs_valid_contents_enc_mode(policy->contents_encryption_mode)) {
-		printk(KERN_WARNING
-			"%s: Invalid contents encryption mode %d\n", __func__,
-			policy->contents_encryption_mode);
-		return -EINVAL;
-	}
-
-	if (!f2fs_valid_filenames_enc_mode(policy->filenames_encryption_mode)) {
-		printk(KERN_WARNING
-			"%s: Invalid filenames encryption mode %d\n", __func__,
-			policy->filenames_encryption_mode);
-		return -EINVAL;
-	}
-
-	if (policy->flags & ~F2FS_POLICY_FLAGS_VALID)
-		return -EINVAL;
-
-	ctx.contents_encryption_mode = policy->contents_encryption_mode;
-	ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
-	ctx.flags = policy->flags;
-	BUILD_BUG_ON(sizeof(ctx.nonce) != F2FS_KEY_DERIVATION_NONCE_SIZE);
-	get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE);
-
-	return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
-			F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
-			sizeof(ctx), NULL, XATTR_CREATE);
-}
-
-int f2fs_process_policy(const struct f2fs_encryption_policy *policy,
-			struct inode *inode)
-{
-	if (!inode_owner_or_capable(inode))
-		return -EACCES;
-
-	if (policy->version != 0)
-		return -EINVAL;
-
-	if (!S_ISDIR(inode->i_mode))
-		return -EINVAL;
-
-	if (!f2fs_inode_has_encryption_context(inode)) {
-		if (!f2fs_empty_dir(inode))
-			return -ENOTEMPTY;
-		return f2fs_create_encryption_context_from_policy(inode,
-								policy);
-	}
-
-	if (f2fs_is_encryption_context_consistent_with_policy(inode, policy))
-		return 0;
-
-	printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
-		__func__);
-	return -EINVAL;
-}
-
-int f2fs_get_policy(struct inode *inode, struct f2fs_encryption_policy *policy)
-{
-	struct f2fs_encryption_context ctx;
-	int res;
-
-	if (!f2fs_encrypted_inode(inode))
-		return -ENODATA;
-
-	res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
-				F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
-				&ctx, sizeof(ctx), NULL);
-	if (res != sizeof(ctx))
-		return -ENODATA;
-	if (ctx.format != F2FS_ENCRYPTION_CONTEXT_FORMAT_V1)
-		return -EINVAL;
-
-	policy->version = 0;
-	policy->contents_encryption_mode = ctx.contents_encryption_mode;
-	policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
-	policy->flags = ctx.flags;
-
memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE); - return 0; -} - -int f2fs_is_child_context_consistent_with_parent(struct inode *parent, - struct inode *child) -{ - const struct f2fs_crypt_info *parent_ci, *child_ci; - struct f2fs_encryption_context parent_ctx, child_ctx; - int res; - - /* No restrictions on file types which are never encrypted */ - if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && - !S_ISLNK(child->i_mode)) - return 1; - - /* No restrictions if the parent directory is unencrypted */ - if (!f2fs_encrypted_inode(parent)) - return 1; - - /* Encrypted directories must not contain unencrypted files */ - if (!f2fs_encrypted_inode(child)) - return 0; - - /* - * Both parent and child are encrypted, so verify they use the same - * encryption policy. Compare the fscrypt_info structs if the keys are - * available, otherwise retrieve and compare the fscrypt_contexts. - * - * Note that the fscrypt_context retrieval will be required frequently - * when accessing an encrypted directory tree without the key. - * Performance-wise this is not a big deal because we already don't - * really optimize for file access without the key (to the extent that - * such access is even possible), given that any attempted access - * already causes a fscrypt_context retrieval and keyring search. - * - * In any case, if an unexpected error occurs, fall back to "forbidden". - */ - - res = f2fs_get_encryption_info(parent); - if (res) - return 0; - res = f2fs_get_encryption_info(child); - if (res) - return 0; - parent_ci = F2FS_I(parent)->i_crypt_info; - child_ci = F2FS_I(child)->i_crypt_info; - if (parent_ci && child_ci) { - return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key, - F2FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == - child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags); - } - - res = f2fs_getxattr(parent, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &parent_ctx, sizeof(parent_ctx), NULL); - if (res != sizeof(parent_ctx)) - return 0; - - res = f2fs_getxattr(child, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &child_ctx, sizeof(child_ctx), NULL); - if (res != sizeof(child_ctx)) - return 0; - - return memcmp(parent_ctx.master_key_descriptor, - child_ctx.master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ctx.contents_encryption_mode == - child_ctx.contents_encryption_mode) && - (parent_ctx.filenames_encryption_mode == - child_ctx.filenames_encryption_mode) && - (parent_ctx.flags == child_ctx.flags); -} - -/** - * f2fs_inherit_context() - Sets a child context from its parent - * @parent: Parent inode from which the context is inherited. - * @child: Child inode that inherits the context from @parent. 
- * - * Return: Zero on success, non-zero otherwise - */ -int f2fs_inherit_context(struct inode *parent, struct inode *child, - struct page *ipage) -{ - struct f2fs_encryption_context ctx; - struct f2fs_crypt_info *ci; - int res; - - res = f2fs_get_encryption_info(parent); - if (res < 0) - return res; - - ci = F2FS_I(parent)->i_crypt_info; - BUG_ON(ci == NULL); - - ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1; - - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key, - F2FS_KEY_DESCRIPTOR_SIZE); - - get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE); - return f2fs_setxattr(child, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), ipage, XATTR_CREATE); -} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4ac72a3f920a..c8583d7a1845 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -56,8 +56,10 @@ static void f2fs_read_end_io(struct bio *bio) int i; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) + if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { + f2fs_show_injection_info(FAULT_IO); bio->bi_error = -EIO; + } #endif if (f2fs_bio_encrypted(bio)) { @@ -223,7 +225,7 @@ submit_io: trace_f2fs_submit_read_bio(sbi->sb, type, bio); else trace_f2fs_submit_write_bio(sbi->sb, type, bio); - submit_bio(0, bio); + submit_bio(bio_op(bio), bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) @@ -244,8 +246,8 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) io->bio = NULL; } -static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, - struct page *page, nid_t ino) +static bool __has_merged_page(struct f2fs_bio_info *io, + struct inode *inode, nid_t ino, pgoff_t idx) { struct bio_vec *bvec; struct page *target; @@ -254,7 +256,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, if (!io->bio) return false; - if (!inode && !page && !ino) + if (!inode && !ino) return true; bio_for_each_segment_all(bvec, io->bio, i) { @@ -264,10 +266,11 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, else target = fscrypt_control_page(bvec->bv_page); + if (idx != target->index) + continue; + if (inode && inode == target->mapping->host) return true; - if (page && page == target) - return true; if (ino && ino == ino_of_node(target)) return true; } @@ -276,70 +279,88 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, } static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, - struct page *page, nid_t ino, - enum page_type type) + nid_t ino, pgoff_t idx, enum page_type type) { enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; - bool ret; + enum temp_type temp; + struct f2fs_bio_info *io; + bool ret = false; - down_read(&io->io_rwsem); - ret = __has_merged_page(io, inode, page, ino); - up_read(&io->io_rwsem); + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + io = sbi->write_io[btype] + temp; + + down_read(&io->io_rwsem); + ret = __has_merged_page(io, inode, ino, idx); + up_read(&io->io_rwsem); + + /* TODO: use HOT temp only for meta pages now. 
*/ + if (ret || btype == META) + break; + } return ret; } -static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw) +static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp) { enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io; - - io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; + struct f2fs_bio_info *io = sbi->write_io[btype] + temp; down_write(&io->io_rwsem); - if (!__has_merged_page(io, inode, page, ino)) - goto out; - /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->fio.op = REQ_OP_WRITE; - io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; + io->fio.op_flags = REQ_META | REQ_PRIO; if (!test_opt(sbi, NOBARRIER)) - io->fio.op_flags |= REQ_FUA; + io->fio.op_flags |= WRITE_FLUSH | REQ_FUA; } __submit_merged_bio(io); -out: up_write(&io->io_rwsem); } -void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, - int rw) +static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, bool force) { - __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw); + enum temp_type temp; + + if (!force && !has_merged_page(sbi, inode, ino, idx, type)) + return; + + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + + __f2fs_submit_merged_write(sbi, type, temp); + + /* TODO: use HOT temp only for meta pages now. */ + if (type >= META) + break; + } } -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw) +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) { - if (has_merged_page(sbi, inode, page, ino, type)) - __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw); + __submit_merged_write_cond(sbi, NULL, 0, 0, type, true); } -void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - f2fs_submit_merged_bio(sbi, NODE, WRITE); - f2fs_submit_merged_bio(sbi, META, WRITE); + __submit_merged_write_cond(sbi, inode, ino, idx, type, false); +} + +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) +{ + f2fs_submit_merged_write(sbi, DATA); + f2fs_submit_merged_write(sbi, NODE); + f2fs_submit_merged_write(sbi, META); } /* * Fill the locked page with data located in the block address. - * Return unlocked page. + * A caller needs to unlock the page on failure. */ int f2fs_submit_page_bio(struct f2fs_io_info *fio) { @@ -360,19 +381,35 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio_set_op_attrs(bio, fio->op, fio->op_flags); __submit_bio(fio->sbi, bio, fio->type); + + if (!is_read_io(fio->op)) + inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page)); return 0; } -int f2fs_submit_page_mbio(struct f2fs_io_info *fio) +int f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); - struct f2fs_bio_info *io; - bool is_read = is_read_io(fio->op); + struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct page *bio_page; int err = 0; - io = is_read ? 
&sbi->read_io : &sbi->write_io[btype]; + f2fs_bug_on(sbi, is_read_io(fio->op)); + + down_write(&io->io_rwsem); +next: + if (fio->in_list) { + spin_lock(&io->io_lock); + if (list_empty(&io->io_list)) { + spin_unlock(&io->io_lock); + goto out_fail; + } + fio = list_first_entry(&io->io_list, + struct f2fs_io_info, list); + list_del(&fio->list); + spin_unlock(&io->io_lock); + } if (fio->old_blkaddr != NEW_ADDR) verify_block_addr(sbi, fio->old_blkaddr); @@ -380,10 +417,10 @@ int f2fs_submit_page_mbio(struct f2fs_io_info *fio) bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; - if (!is_read) - inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + /* set submitted = 1 as a return value */ + fio->submitted = 1; - down_write(&io->io_rwsem); + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || @@ -398,32 +435,86 @@ alloc_new: goto out_fail; } io->bio = __bio_alloc(sbi, fio->new_blkaddr, - BIO_MAX_PAGES, is_read); + BIO_MAX_PAGES, false); io->fio = *fio; } - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < - PAGE_SIZE) { + if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); goto alloc_new; } io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); + + trace_f2fs_submit_page_write(fio->page, fio); + + if (fio->in_list) + goto next; out_fail: up_write(&io->io_rwsem); - trace_f2fs_submit_page_mbio(fio->page, fio); return err; } +static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fscrypt_ctx *ctx = NULL; + struct bio *bio; + + if (f2fs_encrypted_file(inode)) { + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return ERR_CAST(ctx); + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_block_writeback(sbi, blkaddr); + } + + bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + if (!bio) { + if (ctx) + fscrypt_release_ctx(ctx); + return ERR_PTR(-ENOMEM); + } + f2fs_target_device(sbi, blkaddr, bio); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + return bio; +} + +/* This can handle encryption stuffs */ +static int f2fs_submit_page_read(struct inode *inode, struct page *page, + block_t blkaddr) +{ + struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + return -EFAULT; + } + __submit_bio(F2FS_I_SB(inode), bio, DATA); + return 0; +} + static void __set_data_blkaddr(struct dnode_of_data *dn) { struct f2fs_node *rn = F2FS_NODE(dn->node_page); __le32 *addr_array; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); - addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); + addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); } /* @@ -451,14 +542,15 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + int err; if (!count) return 0; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) - return -ENOSPC; + if (unlikely((err = 
inc_valid_block_count(sbi, dn->inode, &count)))) + return err; trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, dn->ofs_in_node, count); @@ -466,8 +558,8 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) f2fs_wait_on_page_writeback(dn->node_page, NODE, true); for (; count > 0; dn->ofs_in_node++) { - block_t blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + block_t blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (blkaddr == NULL_ADDR) { dn->data_blkaddr = NEW_ADDR; __set_data_blkaddr(dn); @@ -509,7 +601,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) { - struct extent_info ei; + struct extent_info ei = {0,0,0}; struct inode *inode = dn->inode; if (f2fs_lookup_extent_cache(inode, index, &ei)) { @@ -526,18 +618,8 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei; + struct extent_info ei = {0,0,0}; int err; - struct f2fs_io_info fio = { - .sbi = F2FS_I_SB(inode), - .type = DATA, - .op = REQ_OP_READ, - .op_flags = op_flags, - .encrypted_page = NULL, - }; - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return read_mapping_page(mapping, index, NULL); page = f2fs_grab_cache_page(mapping, index, for_write); if (!page) @@ -578,9 +660,7 @@ got_it: return page; } - fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; - fio.page = page; - err = f2fs_submit_page_bio(&fio); + err = f2fs_submit_page_read(inode, page, dn.data_blkaddr); if (err) goto put_err; return page; @@ -709,23 +789,25 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct node_info ni; pgoff_t fofs; blkcnt_t count = 1; + int err; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) goto alloc; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) - return -ENOSPC; + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; alloc: get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, CURSEG_WARM_DATA); + &sum, CURSEG_WARM_DATA, NULL, false); set_data_blkaddr(dn); /* update i_size */ @@ -739,7 +821,7 @@ alloc: static inline bool __force_buffered_io(struct inode *inode, int rw) { - return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + return (f2fs_encrypted_file(inode) || (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || F2FS_I_SB(inode)->s_ndevs); } @@ -750,6 +832,9 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) struct f2fs_map_blocks map; int err = 0; + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) + return 0; + map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); if (map.m_len > map.m_lblk) @@ -768,7 +853,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO); } - if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { + if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; @@ -778,6 +863,21 @@ int f2fs_preallocate_blocks(struct kiocb 
*iocb, struct iov_iter *from) return err; } +static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +{ + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (lock) + down_read(&sbi->node_change); + else + up_read(&sbi->node_change); + } else { + if (lock) + f2fs_lock_op(sbi); + else + f2fs_unlock_op(sbi); + } +} + /* * f2fs_map_blocks() now supports readahead/bmap/rw direct_IO with * f2fs_map_blocks structure. @@ -798,7 +898,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; - struct extent_info ei; + struct extent_info ei = {0,0,0}; block_t blkaddr; if (!maxblocks) @@ -820,7 +920,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, next_dnode: if (create) - f2fs_lock_op(sbi); + __do_map_lock(sbi, flag, true); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -842,7 +942,7 @@ next_dnode: end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { @@ -862,7 +962,7 @@ next_block: } if (err) goto sync_out; - map->m_flags = F2FS_MAP_NEW; + map->m_flags |= F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } else { if (flag == F2FS_GET_BLOCK_BMAP) { @@ -930,7 +1030,7 @@ skip: f2fs_put_dnode(&dn); if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; @@ -939,7 +1039,7 @@ sync_out: f2fs_put_dnode(&dn); unlock_out: if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } out: @@ -962,7 +1062,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; - bh->b_size = map.m_len << inode->i_blkbits; + bh->b_size = (u64)map.m_len << inode->i_blkbits; } return err; } @@ -979,7 +1079,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL); + F2FS_GET_BLOCK_DEFAULT, NULL); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -1085,35 +1185,6 @@ out: return ret; } -static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct fscrypt_ctx *ctx = NULL; - struct bio *bio; - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return ERR_CAST(ctx); - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); - } - - bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); - return ERR_PTR(-ENOMEM); - } - f2fs_target_device(sbi, blkaddr, bio); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; - - return bio; -} - /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. 
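For context (illustrative sketch, not part of the diff): __do_map_lock() pairs a lock and unlock chosen by flag. F2FS_GET_BLOCK_PRE_AIO only needs to exclude concurrent node changes, so it takes the lighter sbi->node_change read lock, while every other flag still takes the checkpoint lock via f2fs_lock_op()/f2fs_unlock_op(). A minimal sketch of the intended pairing, using only names from the hunks above:

	if (create)
		__do_map_lock(sbi, flag, true);		/* PRE_AIO: down_read(&sbi->node_change) */
	/* ... look up or allocate block addresses for the mapping ... */
	if (create)
		__do_map_lock(sbi, flag, false);	/* unlock with the same flag */
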
@@ -1142,9 +1213,10 @@ static int f2fs_mpage_readpages(struct address_space *mapping, for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { - prefetchw(&page->flags); if (pages) { page = list_last_entry(pages, struct page, lru); + + prefetchw(&page->flags); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) @@ -1177,7 +1249,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = last_block - block_in_file; if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_READ)) + F2FS_GET_BLOCK_DEFAULT)) goto set_error_page; } got_it: @@ -1208,12 +1280,11 @@ submit_and_realloc: bio = NULL; } if (bio == NULL) { - bio = f2fs_grab_bio(inode, block_nr, nr_pages); + bio = f2fs_grab_read_bio(inode, block_nr, nr_pages); if (IS_ERR(bio)) { bio = NULL; goto set_error_page; } - bio_set_op_attrs(bio, REQ_OP_READ, 0); } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -1273,17 +1344,84 @@ static int f2fs_read_data_pages(struct file *file, return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages); } +static int encrypt_one_page(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + gfp_t gfp_flags = GFP_NOFS; + + if (!f2fs_encrypted_file(inode)) + return 0; + + /* wait for GCed encrypted page writeback */ + f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr); + +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + PAGE_SIZE, 0, fio->page->index, gfp_flags); + if (!IS_ERR(fio->encrypted_page)) + return 0; + + /* flush pending IOs and wait for a while in the ENOMEM case */ + if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { + f2fs_flush_merged_writes(fio->sbi); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } + return PTR_ERR(fio->encrypted_page); + } + +static inline bool need_inplace_update(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + + if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) + return false; + if (is_cold_data(fio->page)) + return false; + if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + return false; + + return need_inplace_update_policy(inode, fio); +} + +static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio) +{ + if (fio->old_blkaddr == NEW_ADDR) + return false; + if (fio->old_blkaddr == NULL_ADDR) + return false; + return true; +} + int do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; + struct extent_info ei = {0,0,0}; + bool ipu_force = false; int err = 0; set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && + f2fs_lookup_extent_cache(inode, page->index, &ei)) { + fio->old_blkaddr = ei.blk + page->index - ei.fofs; + + if (valid_ipu_blkaddr(fio)) { + ipu_force = true; + fio->need_lock = LOCK_DONE; + goto got_it; + } + } + + /* Avoid deadlock between page->lock and f2fs_lock_op */ + if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) + return -EAGAIN; + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) - return err; + goto out; fio->old_blkaddr = dn.data_blkaddr; @@ -1292,57 +1430,57 @@ int do_write_data_page(struct f2fs_io_info *fio) ClearPageUptodate(page); goto out_writepage; } - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - gfp_t gfp_flags = GFP_NOFS; - - /* wait for GCed encrypted page writeback */ - f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), - fio->old_blkaddr); -retry_encrypt: - fio->encrypted_page = 
fscrypt_encrypt_page(inode, fio->page, - gfp_flags); - if (IS_ERR(fio->encrypted_page)) { - err = PTR_ERR(fio->encrypted_page); - if (err == -ENOMEM) { - /* flush pending ios and wait for a while */ - f2fs_flush_merged_bios(F2FS_I_SB(inode)); - congestion_wait(BLK_RW_ASYNC, HZ/50); - gfp_flags |= __GFP_NOFAIL; - err = 0; - goto retry_encrypt; - } - goto out_writepage; - } - } - - set_page_writeback(page); - +got_it: /* * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (unlikely(fio->old_blkaddr != NEW_ADDR && - !is_cold_data(page) && - !IS_ATOMIC_WRITTEN_PAGE(page) && - need_inplace_update(inode))) { - rewrite_data_page(fio); + if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { + err = encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); + f2fs_put_dnode(&dn); + if (fio->need_lock == LOCK_REQ) + f2fs_unlock_op(fio->sbi); + err = rewrite_data_page(fio); + trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); - trace_f2fs_do_write_data_page(page, IPU); - } else { - write_data_page(&dn, fio); - trace_f2fs_do_write_data_page(page, OPU); - set_inode_flag(inode, FI_APPEND_WRITE); - if (page->index == 0) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + return err; } + + if (fio->need_lock == LOCK_RETRY) { + if (!f2fs_trylock_op(fio->sbi)) { + err = -EAGAIN; + goto out_writepage; + } + fio->need_lock = LOCK_REQ; + } + + err = encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); + + /* LFS mode write path */ + write_data_page(&dn, fio); + trace_f2fs_do_write_data_page(page, OPU); + set_inode_flag(inode, FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); out_writepage: f2fs_put_dnode(&dn); +out: + if (fio->need_lock == LOCK_REQ) + f2fs_unlock_op(fio->sbi); return err; } -static int f2fs_write_data_page(struct page *page, - struct writeback_control *wbc) +static int __write_data_page(struct page *page, bool *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1358,12 +1496,19 @@ static int f2fs_write_data_page(struct page *page, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, + .submitted = false, + .need_lock = LOCK_RETRY, + .io_type = io_type, }; trace_f2fs_writepage(page, DATA); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (page->index < end_index) goto write; @@ -1377,8 +1522,6 @@ static int f2fs_write_data_page(struct page *page, zero_user_segment(page, offset, PAGE_SIZE); write: - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto redirty_out; if (f2fs_is_drop_cache(inode)) goto out; /* we should not write 0'th page having journal header */ @@ -1395,6 +1538,7 @@ write: /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { + fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); goto done; } @@ -1403,16 +1547,26 @@ write: need_balance_fs = true; else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; + else + set_inode_flag(inode, FI_HOT_DATA); err = -EAGAIN; - f2fs_lock_op(sbi); - if (f2fs_has_inline_data(inode)) + if (f2fs_has_inline_data(inode)) { err = f2fs_write_inline_data(inode, page); - if (err == -EAGAIN) + if (!err) + goto out; + } + + if (err == -EAGAIN) { err = do_write_data_page(&fio); + if (err == 
-EAGAIN) { + fio.need_lock = LOCK_REQ; + err = do_write_data_page(&fio); + } + } if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; - f2fs_unlock_op(sbi); + done: if (err && err != -ENOENT) goto redirty_out; @@ -1423,15 +1577,23 @@ out: ClearPageUptodate(page); if (wbc->for_reclaim) { - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE); + f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA); + clear_inode_flag(inode, FI_HOT_DATA); remove_dirty_inode(inode); + submitted = NULL; } unlock_page(page); - f2fs_balance_fs(sbi, need_balance_fs); + if (!S_ISDIR(inode->i_mode)) + f2fs_balance_fs(sbi, need_balance_fs); - if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, DATA, WRITE); + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_submit_merged_write(sbi, DATA); + submitted = NULL; + } + + if (submitted) + *submitted = fio.submitted; return 0; @@ -1443,13 +1605,20 @@ redirty_out: return err; } +static int f2fs_write_data_page(struct page *page, + struct writeback_control *wbc) +{ + return __write_data_page(page, NULL, wbc, FS_DATA_IO); +} + /* * This function was copied from write_cache_pages in mm/page-writeback.c. * The major change is making write step of cold data page separately from * warm/hot data page. */ static int f2fs_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc) + struct writeback_control *wbc, + enum iostat_type io_type) { int ret = 0; int done = 0; @@ -1459,13 +1628,19 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; + pgoff_t last_idx = ULONG_MAX; int cycled; int range_whole = 0; int tag; - int nwritten = 0; pagevec_init(&pvec, 0); + if (get_dirty_pages(mapping->host) <= + SM_I(F2FS_M_SB(mapping))->min_hot_blocks) + set_inode_flag(mapping->host, FI_HOT_DATA); + else + clear_inode_flag(mapping->host, FI_HOT_DATA); + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -1499,6 +1674,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (page->index > end) { done = 1; @@ -1506,7 +1682,7 @@ retry: } done_index = page->index; - +retry_write: lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -1532,7 +1708,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = mapping->a_ops->writepage(page, wbc); + ret = __write_data_page(page, &submitted, wbc, io_type); if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to @@ -1542,16 +1718,27 @@ continue_unlock: unlock_page(page); ret = 0; continue; + } else if (ret == -EAGAIN) { + ret = 0; + if (wbc->sync_mode == WB_SYNC_ALL) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, + HZ/50); + goto retry_write; + } + continue; } done_index = page->index + 1; done = 1; break; - } else { - nwritten++; + } else if (submitted) { + last_idx = page->index; } - if (--wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) { + /* give priority to WB_SYNC threads */ + if ((atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) || + --wbc->nr_to_write <= 0) && + wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } @@ -1569,15 +1756,16 @@ continue_unlock: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; - if (nwritten) - f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, - NULL, 0, DATA, WRITE); + if (last_idx != ULONG_MAX) + 
f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host, + 0, last_idx, DATA); return ret; } -static int f2fs_write_data_pages(struct address_space *mapping, - struct writeback_control *wbc) +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1592,6 +1780,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; + /* during POR, we don't need to trigger writepage at all. */ + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && available_free_memory(sbi, DIRTY_DENTS)) @@ -1601,15 +1793,20 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (is_inode_flag_set(inode, FI_DO_DEFRAG)) goto skip_write; - /* during POR, we don't need to trigger writepage at all. */ - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, DATA); + /* to avoid splitting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req); + else if (atomic_read(&sbi->wb_sync_req)) + goto skip_write; + blk_start_plug(&plug); - ret = f2fs_write_cache_pages(mapping, wbc); + ret = f2fs_write_cache_pages(mapping, wbc, io_type); blk_finish_plug(&plug); + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req); /* * if some pages were truncated, we cannot guarantee that mapping->host * can detect pending bios. @@ -1624,14 +1821,26 @@ skip_write: return 0; } +static int f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + + return __f2fs_write_data_pages(mapping, wbc, + F2FS_I(inode)->cp_task == current ? + FS_CP_DATA_IO : FS_DATA_IO); +} + static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; loff_t i_size = i_size_read(inode); if (to > i_size) { + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); truncate_blocks(inode, i_size, true); + up_write(&F2FS_I(inode)->i_mmap_sem); } } @@ -1644,19 +1853,20 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, struct dnode_of_data dn; struct page *ipage; bool locked = false; - struct extent_info ei; + struct extent_info ei = {0,0,0}; int err = 0; /* * we already allocated all the blocks, so we don't need to get * the block addresses when there is no need to fill the page. 
*/ - if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE) + if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE && + !is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; if (f2fs_has_inline_data(inode) || (pos & PAGE_MASK) >= i_size_read(inode)) { - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); locked = true; } restart: @@ -1670,7 +1880,7 @@ restart: set_new_dnode(&dn, inode, ipage, ipage, 0); if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { + if (pos + len <= MAX_INLINE_DATA(inode)) { read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) @@ -1692,7 +1902,8 @@ restart: err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, + true); locked = true; goto restart; } @@ -1706,7 +1917,7 @@ out: f2fs_put_dnode(&dn); unlock_out: if (locked) - f2fs_unlock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); return err; } @@ -1745,7 +1956,11 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, goto fail; } repeat: - page = grab_cache_page_write_begin(mapping, index, flags); + /* + * Do not use grab_cache_page_write_begin() to avoid deadlock due to + * wait_for_stable_page. We will wait for it below with our IO control. + */ + page = grab_cache_page(mapping, index); if (!page) { err = -ENOMEM; goto fail; } @@ -1772,8 +1987,8 @@ repeat: f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + if (f2fs_encrypted_file(inode)) + f2fs_wait_on_block_writeback(sbi, blkaddr); if (len == PAGE_SIZE || PageUptodate(page)) return 0; @@ -1787,21 +2002,9 @@ repeat: zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); } else { - struct bio *bio; - - bio = f2fs_grab_bio(inode, blkaddr, 1); - if (IS_ERR(bio)) { - err = PTR_ERR(bio); + err = f2fs_submit_page_read(inode, page, blkaddr); + if (err) goto fail; - } - bio->bi_rw = READ_SYNC; - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - bio_put(bio); - err = -EFAULT; - goto fail; - } - - __submit_bio(sbi, bio, DATA); lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -1914,10 +2117,13 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, up_read(&F2FS_I(inode)->dio_rwsem[rw]); if (rw == WRITE) { - if (err > 0) + if (err > 0) { + f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, + err); set_inode_flag(inode, FI_UPDATE_WRITE); - else if (err < 0) + } else if (err < 0) { + f2fs_write_failed(mapping, offset + count); + } } if (trace_android_fs_dataread_start_enabled() && @@ -1955,7 +2161,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) - return; + return drop_inmem_page(inode, page); set_page_private(page, 0); ClearPagePrivate(page); @@ -2064,8 +2270,12 @@ int f2fs_migrate_page(struct address_space *mapping, BUG_ON(PageWriteback(page)); /* migrating an atomic written page is safe with the inmem_lock held */ - if (atomic_written && !mutex_trylock(&fi->inmem_lock)) - return -EAGAIN; + if (atomic_written) { + if (mode != MIGRATE_SYNC) + return -EBUSY; + if (!mutex_trylock(&fi->inmem_lock)) + return -EAGAIN; + } /* * A reference is expected if PagePrivate set when move mapping,
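For context (hedged sketch, not part of the diff): the write-path hunks earlier in this file keep merged write bios per temperature as well as per page type. The addressing below assumes, based on the loops shown above, that sbi->write_io[type] points to an array of NR_TEMP_TYPE f2fs_bio_info slots and that temp_type orders HOT before WARM and COLD:

	enum page_type btype = PAGE_TYPE_OF_BIO(type);
	struct f2fs_bio_info *io = sbi->write_io[btype] + temp;	/* one slot per (type, temp) */

	/* DATA/NODE flush every temperature; META only uses the HOT slot */
	for (temp = HOT; temp < NR_TEMP_TYPE; temp++) {
		__f2fs_submit_merged_write(sbi, type, temp);
		if (type >= META)
			break;
	}

diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 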
cd338ca24941..87f449845f5f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -51,9 +51,26 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); + si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); + if (SM_I(sbi) && SM_I(sbi)->fcc_info) { + si->nr_flushed = + atomic_read(&SM_I(sbi)->fcc_info->issued_flush); + si->nr_flushing = + atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + } + if (SM_I(sbi) && SM_I(sbi)->dcc_info) { + si->nr_discarded = + atomic_read(&SM_I(sbi)->dcc_info->issued_discard); + si->nr_discarding = + atomic_read(&SM_I(sbi)->dcc_info->issing_discard); + si->nr_discard_cmd = + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; + } si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -64,6 +81,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->append = sbi->im[APPEND_INO].ino_num; + si->update = sbi->im[UPDATE_INO].ino_num; si->orphans = sbi->im[ORPHAN_INO].ino_num; si->utilization = utilization(sbi); @@ -78,6 +97,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->avail_nids = NM_I(sbi)->available_nids; si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) @@ -91,8 +111,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); si->curseg[i] = curseg->segno; - si->cursec[i] = curseg->segno / sbi->segs_per_sec; - si->curzone[i] = si->cursec[i] / sbi->secs_per_zone; + si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); + si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); } for (i = 0; i < 2; i++) { @@ -116,10 +136,10 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; + blks_per_sec = BLKS_PER_SEC(sbi); hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); dist = abs(vblocks - hblks_per_sec); bimodal += dist * dist; @@ -148,7 +168,11 @@ static void update_mem_info(struct f2fs_sb_info *sbi) if (si->base_mem) goto get_cache; - si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; + /* build stat */ + si->base_mem = sizeof(struct f2fs_stat_info); + + /* build superblock */ + si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; @@ -185,6 +209,10 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build nm */ si->base_mem += sizeof(struct 
f2fs_nm_info); si->base_mem += __bitmap_size(sbi, NAT_BITMAP); + si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); + si->base_mem += NM_I(sbi)->nat_blocks * NAT_ENTRY_BITMAP_SIZE; + si->base_mem += NM_I(sbi)->nat_blocks / 8; + si->base_mem += NM_I(sbi)->nat_blocks * sizeof(unsigned short); get_cache: si->cache_mem = 0; @@ -196,6 +224,11 @@ get_cache: /* build merge flush thread */ if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); + if (SM_I(sbi)->dcc_info) { + si->cache_mem += sizeof(struct discard_cmd_control); + si->cache_mem += sizeof(struct discard_cmd) * + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + } /* free nids */ si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + @@ -256,8 +289,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); - seq_printf(s, " - Orphan Inode: %u\n", - si->orphans); + seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", + si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -316,10 +349,16 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d)\n", - si->nr_wb_cp_data, si->nr_wb_data); - seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d)\n", - si->inmem_pages, si->aw_cnt, si->max_aw_cnt); + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " + "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", + si->nr_wb_cp_data, si->nr_wb_data, + si->nr_flushing, si->nr_flushed, + si->nr_discarding, si->nr_discarded, + si->nr_discard_cmd, si->undiscard_blks); + seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " + "volatile IO: %4d (Max. 
%4d)\n", + si->inmem_pages, si->aw_cnt, si->max_aw_cnt, + si->vw_cnt, si->max_vw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", @@ -332,8 +371,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", - si->free_nids, si->alloc_nids); + seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", + si->free_nids, si->avail_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); @@ -419,7 +458,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inplace_count, 0); atomic_set(&sbi->aw_cnt, 0); + atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); + atomic_set(&sbi->max_vw_cnt, 0); mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4436079dbf0c..4f2a8fedb313 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -94,7 +94,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); de = find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; @@ -111,8 +111,6 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; - struct fscrypt_str de_name = FSTR_INIT(NULL, 0); - struct fscrypt_str *name = &fname->disk_name; if (max_slots) *max_slots = 0; @@ -130,17 +128,9 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, continue; } - /* encrypted case */ - de_name.name = d->filename[bit_pos]; - de_name.len = le16_to_cpu(de->name_len); - - /* show encrypted name */ - if (fname->hash) { - if (de->hash_code == cpu_to_le32(fname->hash)) - goto found; - } else if (de_name.len == name->len && - de->hash_code == namehash && - !memcmp(de_name.name, name->name, name->len)) + if (de->hash_code == namehash && + fscrypt_match_name(fname, d->filename[bit_pos], + le16_to_cpu(de->name_len))) goto found; if (max_slots && max_len > *max_slots) @@ -170,12 +160,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, struct f2fs_dir_entry *de = NULL; bool room = false; int max_slots; - f2fs_hash_t namehash; - - if(fname->hash) - namehash = cpu_to_le32(fname->hash); - else - namehash = f2fs_dentry_hash(&name); + f2fs_hash_t namehash = f2fs_dentry_hash(&name, fname); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -250,6 +235,9 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, break; } out: + /* This is to increase the speed of f2fs_create */ + if (!de) + F2FS_I(dir)->task = current; return de; } @@ -268,7 +256,10 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, err = fscrypt_setup_filename(dir, child, 1, &fname); if (err) { - *res_page = ERR_PTR(err); + if (err == -ENOENT) + *res_page = NULL; + else + *res_page = ERR_PTR(err); return NULL; } @@ -330,24 +321,6 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } -int update_dent_inode(struct inode *inode, struct inode *to, - const struct qstr *name) -{ - struct page *page; - - if (file_enc_name(to)) - return 0; - - 
page = get_node_page(F2FS_I_SB(inode), inode->i_ino); - if (IS_ERR(page)) - return PTR_ERR(page); - - init_dent_inode(name, page); - f2fs_put_page(page, 1); - - return 0; -} - void do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { @@ -377,7 +350,7 @@ static int make_empty_dir(struct inode *inode, dentry_blk = kmap_atomic(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); kunmap_atomic(dentry_blk); @@ -431,15 +404,19 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, set_cold_node(inode, page); } - if (new_name) + if (new_name) { init_dent_inode(new_name, page); + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); + } /* * This file should be checkpointed during fsync. * We lost i_pino from now on. */ if (is_inode_flag_set(inode, FI_INC_LINK)) { - file_lost_pino(inode); + if (!S_ISDIR(inode->i_mode)) + file_lost_pino(inode); /* * If link the tmpfile to alias through linkat path, * we should remove this inode from orphan list. @@ -535,7 +512,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, level = 0; slots = GET_DENTRY_SLOTS(new_name->len); - dentry_hash = f2fs_dentry_hash(new_name); + dentry_hash = f2fs_dentry_hash(new_name, NULL); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -545,8 +522,10 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, start: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { + f2fs_show_injection_info(FAULT_DIR_DEPTH); return -ENOSPC; + } #endif if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; @@ -590,11 +569,9 @@ add_dentry: err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); @@ -643,14 +620,34 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct fscrypt_name fname; + struct page *page = NULL; + struct f2fs_dir_entry *de = NULL; int err; err = fscrypt_setup_filename(dir, name, 0, &fname); if (err) return err; - err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); - + /* + * An immature stackable filesystem shows a race condition between lookup + * and create. If we have the same task when doing lookup and create, it's + * definitely fine as expected by VFS normally. Otherwise, let's just + * verify the on-disk dentry one more time, which better guarantees + * filesystem consistency. 
+ */ + if (current != F2FS_I(dir)->task) { + de = __f2fs_find_entry(dir, &fname, &page); + F2FS_I(dir)->task = NULL; + } + if (de) { + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + err = -EEXIST; + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + } else { + err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); + } fscrypt_free_filename(&fname); return err; } @@ -708,6 +705,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + struct address_space *mapping = page_mapping(page); + unsigned long flags; int i; f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); @@ -721,7 +720,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dentry_blk = page_address(page); bit_pos = dentry - dentry_blk->dentry; for (i = 0; i < slots; i++) - clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + __clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); /* Let's check and deallocate this dentry page */ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, @@ -738,6 +737,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); @@ -882,7 +886,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_blk = kmap(dentry_page); - make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(inode, &d, dentry_blk); err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 6ed6424807b6..ff2352a0ed15 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -18,6 +18,179 @@ #include "node.h" #include +static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, + unsigned int ofs) +{ + if (cached_re) { + if (cached_re->ofs <= ofs && + cached_re->ofs + cached_re->len > ofs) { + return cached_re; + } + } + return NULL; +} + +static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, + unsigned int ofs) +{ + struct rb_node *node = root->rb_node; + struct rb_entry *re; + + while (node) { + re = rb_entry(node, struct rb_entry, rb_node); + + if (ofs < re->ofs) + node = node->rb_left; + else if (ofs >= re->ofs + re->len) + node = node->rb_right; + else + return re; + } + return NULL; +} + +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs) +{ + struct rb_entry *re; + + re = __lookup_rb_tree_fast(cached_re, ofs); + if (!re) + return __lookup_rb_tree_slow(root, ofs); + + return re; +} + +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs) +{ + struct rb_node **p = &root->rb_node; + struct rb_entry *re; + + while (*p) { + *parent = *p; + re = rb_entry(*parent, struct rb_entry, rb_node); + + if (ofs < re->ofs) + p = &(*p)->rb_left; + else if (ofs >= re->ofs + re->len) + p = &(*p)->rb_right; + else + f2fs_bug_on(sbi, 1); + } + + return p; +} + +/* + * lookup rb entry in position of @ofs in rb-tree, + * if hit, return the entry, otherwise, return NULL + * @prev_ex: extent before ofs + * 
@next_ex: extent after ofs + * @insert_p: insert point for new extent at ofs + * in order to simplify the insertion after. + * tree must stay unchanged between lookup and insertion. + */ +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, + unsigned int ofs, + struct rb_entry **prev_entry, + struct rb_entry **next_entry, + struct rb_node ***insert_p, + struct rb_node **insert_parent, + bool force) +{ + struct rb_node **pnode = &root->rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct rb_entry *re = cached_re; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_entry = NULL; + *next_entry = NULL; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + if (re) { + if (re->ofs <= ofs && re->ofs + re->len > ofs) + goto lookup_neighbors; + } + + while (*pnode) { + parent = *pnode; + re = rb_entry(*pnode, struct rb_entry, rb_node); + + if (ofs < re->ofs) + pnode = &(*pnode)->rb_left; + else if (ofs >= re->ofs + re->len) + pnode = &(*pnode)->rb_right; + else + goto lookup_neighbors; + } + + *insert_p = pnode; + *insert_parent = parent; + + re = rb_entry(parent, struct rb_entry, rb_node); + tmp_node = parent; + if (parent && ofs > re->ofs) + tmp_node = rb_next(parent); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + + tmp_node = parent; + if (parent && ofs < re->ofs) + tmp_node = rb_prev(parent); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + return NULL; + +lookup_neighbors: + if (ofs == re->ofs || force) { + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&re->rb_node); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + if (ofs == re->ofs + re->len - 1 || force) { + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&re->rb_node); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + return re; +} + +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct rb_node *cur = rb_first(root), *next; + struct rb_entry *cur_re, *next_re; + + if (!cur) + return true; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_re = rb_entry(cur, struct rb_entry, rb_node); + next_re = rb_entry(next, struct rb_entry, rb_node); + + if (cur_re->ofs + cur_re->len > next_re->ofs) { + f2fs_msg(sbi->sb, KERN_INFO, "inconsistent rbtree, " + "cur(%u, %u) next(%u, %u)", + cur_re->ofs, cur_re->len, + next_re->ofs, next_re->len); + return false; + } + + cur = next; + } +#endif + return true; +} + static struct kmem_cache *extent_tree_slab; static struct kmem_cache *extent_node_slab; @@ -77,7 +250,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) struct extent_tree *et; nid_t ino = inode->i_ino; - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); et = radix_tree_lookup(&sbi->extent_tree_root, ino); if (!et) { et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); @@ -94,7 +267,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) atomic_dec(&sbi->total_zombie_tree); list_del_init(&et->list); } - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); /* never died until evict_inode */ F2FS_I(inode)->extent_tree = et; @@ -102,36 +275,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) return et; } -static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, unsigned int fofs) -{ - struct rb_node *node = et->root.rb_node; - struct 
extent_node *en = et->cached_en; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) { - stat_inc_cached_node_hit(sbi); - return en; - } - } - - while (node) { - en = rb_entry(node, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) { - node = node->rb_left; - } else if (fofs >= en->ei.fofs + en->ei.len) { - node = node->rb_right; - } else { - stat_inc_rbtree_node_hit(sbi); - return en; - } - } - return NULL; -} - static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei) { @@ -177,7 +320,7 @@ static void __drop_largest_extent(struct inode *inode, } /* return true, if inode page is changed */ -bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; @@ -215,6 +358,16 @@ out: return false; } +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +{ + bool ret = __f2fs_init_extent_tree(inode, i_ext); + + if (!F2FS_I(inode)->extent_tree) + set_inode_flag(inode, FI_NO_EXTENT); + + return ret; +} + static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, struct extent_info *ei) { @@ -237,17 +390,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } - en = __lookup_extent_tree(sbi, et, pgofs); - if (en) { - *ei = en->ei; - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); - et->cached_en = en; - } - spin_unlock(&sbi->extent_lock); - ret = true; + en = (struct extent_node *)__lookup_rb_tree(&et->root, + (struct rb_entry *)et->cached_en, pgofs); + if (!en) + goto out; + + if (en == et->cached_en) + stat_inc_cached_node_hit(sbi); + else + stat_inc_rbtree_node_hit(sbi); + + *ei = en->ei; + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &sbi->extent_list); + et->cached_en = en; } + spin_unlock(&sbi->extent_lock); + ret = true; out: stat_inc_total_hit(sbi); read_unlock(&et->lock); @@ -256,83 +416,6 @@ out: return ret; } - -/* - * lookup extent at @fofs, if hit, return the extent - * if not, return NULL and - * @prev_ex: extent before fofs - * @next_ex: extent after fofs - * @insert_p: insert point for new extent at fofs - * in order to simpfy the insertion after. - * tree must stay unchanged between lookup and insertion. 
- */ -static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, - unsigned int fofs, - struct extent_node **prev_ex, - struct extent_node **next_ex, - struct rb_node ***insert_p, - struct rb_node **insert_parent) -{ - struct rb_node **pnode = &et->root.rb_node; - struct rb_node *parent = NULL, *tmp_node; - struct extent_node *en = et->cached_en; - - *insert_p = NULL; - *insert_parent = NULL; - *prev_ex = NULL; - *next_ex = NULL; - - if (RB_EMPTY_ROOT(&et->root)) - return NULL; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) - goto lookup_neighbors; - } - - while (*pnode) { - parent = *pnode; - en = rb_entry(*pnode, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) - pnode = &(*pnode)->rb_left; - else if (fofs >= en->ei.fofs + en->ei.len) - pnode = &(*pnode)->rb_right; - else - goto lookup_neighbors; - } - - *insert_p = pnode; - *insert_parent = parent; - - en = rb_entry(parent, struct extent_node, rb_node); - tmp_node = parent; - if (parent && fofs > en->ei.fofs) - tmp_node = rb_next(parent); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - - tmp_node = parent; - if (parent && fofs < en->ei.fofs) - tmp_node = rb_prev(parent); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - return NULL; - -lookup_neighbors: - if (fofs == en->ei.fofs) { - /* lookup prev node for merging backward later */ - tmp_node = rb_prev(&en->rb_node); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - if (fofs == en->ei.fofs + en->ei.len - 1) { - /* lookup next node for merging frontward later */ - tmp_node = rb_next(&en->rb_node); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - return en; -} - static struct extent_node *__try_merge_extent_node(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct extent_node *prev_ex, @@ -387,17 +470,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, goto do_insert; } - while (*p) { - parent = *p; - en = rb_entry(parent, struct extent_node, rb_node); - - if (ei->fofs < en->ei.fofs) - p = &(*p)->rb_left; - else if (ei->fofs >= en->ei.fofs + en->ei.len) - p = &(*p)->rb_right; - else - f2fs_bug_on(sbi, 1); - } + p = __lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); do_insert: en = __attach_extent_node(sbi, et, ei, parent, p); if (!en) @@ -413,7 +486,7 @@ do_insert: return en; } -static unsigned int f2fs_update_extent_tree_range(struct inode *inode, +static void f2fs_update_extent_tree_range(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -426,7 +499,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, unsigned int pos = (unsigned int)fofs; if (!et) - return false; + return; trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); @@ -434,7 +507,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (is_inode_flag_set(inode, FI_NO_EXTENT)) { write_unlock(&et->lock); - return false; + return; } prev = et->largest; @@ -447,8 +520,11 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, __drop_largest_extent(inode, fofs, len); /* 1. 
lookup first extent node in range [fofs, fofs + len - 1] */ - en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, - &insert_p, &insert_parent); + en = (struct extent_node *)__lookup_rb_tree_ret(&et->root, + (struct rb_entry *)et->cached_en, fofs, + (struct rb_entry **)&prev_en, + (struct rb_entry **)&next_en, + &insert_p, &insert_parent, false); if (!en) en = next_en; @@ -531,8 +607,6 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, __free_extent_tree(sbi, et); write_unlock(&et->lock); - - return !__is_extent_same(&prev, &et->largest); } unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) @@ -548,7 +622,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) if (!atomic_read(&sbi->total_zombie_tree)) goto free_node; - if (!down_write_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&sbi->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ @@ -570,11 +644,11 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) goto unlock_out; cond_resched(); } - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); free_node: /* 2. remove LRU extent entries */ - if (!down_write_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&sbi->extent_tree_lock)) goto out; remained = nr_shrink - (node_cnt + tree_cnt); @@ -604,7 +678,7 @@ free_node: spin_unlock(&sbi->extent_lock); unlock_out: - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); out: trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); @@ -651,10 +725,10 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (inode->i_nlink && !is_bad_inode(inode) && atomic_read(&et->node_cnt)) { - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); list_add_tail(&et->list, &sbi->zombie_list); atomic_inc(&sbi->total_zombie_tree); - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); return; } @@ -662,12 +736,12 @@ void f2fs_destroy_extent_tree(struct inode *inode) node_cnt = f2fs_destroy_extent_node(inode); /* delete extent tree entry in radix tree */ - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); atomic_dec(&sbi->total_ext_tree); - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); F2FS_I(inode)->extent_tree = NULL; @@ -714,7 +788,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, void init_extent_cache_info(struct f2fs_sb_info *sbi) { INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - init_rwsem(&sbi->extent_tree_lock); + mutex_init(&sbi->extent_tree_lock); INIT_LIST_HEAD(&sbi->extent_list); spin_lock_init(&sbi->extent_lock); atomic_set(&sbi->total_ext_tree, 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 167c5f841b5f..ff694127243a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -22,7 +22,12 @@ #include #include #include -#include +#include +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#include +#else +#include +#endif #include #include @@ -47,6 +52,7 @@ enum { FAULT_BLOCK, FAULT_DIR_DEPTH, FAULT_EVICT_INODE, + FAULT_TRUNCATE, FAULT_IO, FAULT_CHECKPOINT, FAULT_MAX, @@ -59,7 +65,7 @@ struct f2fs_fault_info { }; extern char *fault_name[FAULT_MAX]; -#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type))) +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) #endif /* @@ -84,10 +90,14 @@ extern char 
*fault_name[FAULT_MAX]; #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define F2FS_MOUNT_ADAPTIVE 0x00020000 #define F2FS_MOUNT_LFS 0x00040000 +#define F2FS_MOUNT_USRQUOTA 0x00080000 +#define F2FS_MOUNT_GRPQUOTA 0x00100000 +#define F2FS_MOUNT_PRJQUOTA 0x00200000 +#define F2FS_MOUNT_QUOTA 0x00400000 -#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) -#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) -#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option) +#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) +#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option) #define ver_after(a, b) (typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ @@ -103,15 +113,19 @@ struct f2fs_mount_info { unsigned int opt; }; -#define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 +#define F2FS_FEATURE_EXTRA_ATTR 0x0008 +#define F2FS_FEATURE_PRJQUOTA 0x0010 +#define F2FS_FEATURE_INODE_CHKSUM 0x0020 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) #define F2FS_SET_FEATURE(sb, mask) \ - F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask) + (F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask)) #define F2FS_CLEAR_FEATURE(sb, mask) \ - F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) + (F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)) /* bio stuffs */ #define REQ_OP_READ READ @@ -187,19 +201,22 @@ enum { SIT_BITMAP }; -enum { - CP_UMOUNT, - CP_FASTBOOT, - CP_SYNC, - CP_RECOVERY, - CP_DISCARD, -}; +#define CP_UMOUNT 0x00000001 +#define CP_FASTBOOT 0x00000002 +#define CP_SYNC 0x00000004 +#define CP_RECOVERY 0x00000008 +#define CP_DISCARD 0x00000010 +#define CP_TRIMMED 0x00000020 -#define DEF_BATCHED_TRIM_SECTIONS 2 +#define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ - (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) + (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections)) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) +#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) +#define DISCARD_ISSUE_RATE 8 +#define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ +#define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -241,19 +258,73 @@ struct inode_entry { struct inode *inode; /* vfs inode pointer */ }; -/* for the list of blockaddresses to be discarded */ +/* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ - block_t blkaddr; /* block address to be discarded */ - int len; /* # of consecutive blocks of the discard */ + block_t start_blkaddr; /* start blockaddr of current segment */ + unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ +}; + +/* default discard granularity of inner discard thread, unit: block count */ +#define DEFAULT_DISCARD_GRANULARITY 16 + +/* max discard pend list number */ +#define MAX_PLIST_NUM 512 +#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? 
(MAX_PLIST_NUM - 1) : (blk_num - 1)) + +#define P_ACTIVE 0x01 +#define P_TRIM 0x02 +#define plist_issue(tag) (((tag) & P_ACTIVE) || ((tag) & P_TRIM)) + +enum { + D_PREP, + D_SUBMIT, + D_DONE, +}; + +struct discard_info { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ +}; struct discard_cmd { + struct rb_node rb_node; /* rb node located in rb-tree */ + union { + struct { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ + }; + struct discard_info di; /* discard info */ + + }; struct list_head list; /* command list */ struct completion wait; /* completion */ - block_t lstart; /* logical start address */ - block_t len; /* length */ - struct bio *bio; /* bio */ + struct block_device *bdev; /* bdev */ + unsigned short ref; /* reference count */ + unsigned char state; /* state */ + int error; /* bio error */ +}; + +struct discard_cmd_control { + struct task_struct *f2fs_issue_discard; /* discard thread */ + struct list_head entry_list; /* 4KB discard entry list */ + struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ + unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ + struct list_head wait_list; /* store on-flushing entries */ + wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ + unsigned int discard_wake; /* to wake up discard thread */ + struct mutex cmd_lock; + unsigned int nr_discards; /* # of discards in the list */ + unsigned int max_discards; /* max. discards to be issued */ + unsigned int discard_granularity; /* discard granularity */ + unsigned int undiscard_blks; /* # of undiscard blocks */ + atomic_t issued_discard; /* # of issued discards */ + atomic_t issing_discard; /* # of discards being issued */ + atomic_t discard_cmd_cnt; /* # of cached cmd count */ + struct rb_root root; /* root of discard rb-tree */ }; /* for the list of fsync inodes, used only during recovery */ @@ -264,13 +335,13 @@ struct fsync_inode_entry { block_t last_dentry; /* block address locating the last dentry */ }; -#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats)) -#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits)) +#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats)) +#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits)) -#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne) -#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid) -#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se) -#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno) +#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne) +#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid) +#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se) +#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno) #define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) #define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) @@ -278,6 +349,7 @@ struct fsync_inode_entry { static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { int before = nats_in_cursum(journal); + journal->n_nats = cpu_to_le16(before + i); return before; } @@ -285,6 +357,7 @@ static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) { int before = sits_in_cursum(journal); + journal->n_sits = cpu_to_le16(before + i); return before; } @@ -310,11 +383,17 @@ static inline bool 
__has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) -#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) +#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) -#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) +#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ + struct f2fs_defragment) #define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ struct f2fs_move_range) +#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ + struct f2fs_flush_device) +#define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ + struct f2fs_gc_range) +#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -339,6 +418,12 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +struct f2fs_gc_range { + u32 sync; + u64 start; + u64 len; +}; + struct f2fs_defragment { u64 start; u64 len; @@ -351,36 +436,68 @@ struct f2fs_move_range { u64 len; /* size to move */ }; +struct f2fs_flush_device { + u32 dev_num; /* device number to flush */ + u32 segments; /* # of segments to flush */ +}; + +/* for inline stuff */ +#define DEF_INLINE_RESERVED_SIZE 1 +static inline int get_extra_isize(struct inode *inode); +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + DEF_INLINE_RESERVED_SIZE - \ + F2FS_INLINE_XATTR_ADDRS)) + +/* for inline dir */ +#define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + BITS_PER_BYTE + 1)) +#define INLINE_DENTRY_BITMAP_SIZE(inode) ((NR_INLINE_DENTRY(inode) + \ + BITS_PER_BYTE - 1) / BITS_PER_BYTE) +#define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + NR_INLINE_DENTRY(inode) + \ + INLINE_DENTRY_BITMAP_SIZE(inode))) + /* * For INODE and NODE manager */ /* for directory operations */ struct f2fs_dentry_ptr { struct inode *inode; - const void *bitmap; + void *bitmap; struct f2fs_dir_entry *dentry; __u8 (*filename)[F2FS_SLOT_LEN]; int max; + int nr_bitmap; }; -static inline void make_dentry_ptr(struct inode *inode, - struct f2fs_dentry_ptr *d, void *src, int type) +static inline void make_dentry_ptr_block(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t) { d->inode = inode; + d->max = NR_DENTRY_IN_BLOCK; + d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; +} - if (type == 1) { - struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src; - d->max = NR_DENTRY_IN_BLOCK; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } else { - struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src; - d->max = NR_INLINE_DENTRY; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } +static inline void make_dentry_ptr_inline(struct inode *inode, + struct f2fs_dentry_ptr *d, void *t) +{ + int entry_cnt = NR_INLINE_DENTRY(inode); + int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode); + int reserved_size = INLINE_RESERVED_SIZE(inode); + + d->inode = inode; + d->max = entry_cnt; + 
d->nr_bitmap = bitmap_size; + d->bitmap = t; + d->dentry = t + bitmap_size + reserved_size; + d->filename = t + bitmap_size + reserved_size + + SIZE_OF_DIR_ENTRY * entry_cnt; } /* @@ -412,16 +529,30 @@ enum { /* number of extent info in extent cache we try to shrink */ #define EXTENT_CACHE_SHRINK_NUMBER 128 +struct rb_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ +}; + struct extent_info { unsigned int fofs; /* start offset in a file */ - u32 blk; /* start block address of the extent */ unsigned int len; /* length of the extent */ + u32 blk; /* start block address of the extent */ }; struct extent_node { - struct rb_node rb_node; /* rb node located in rb-tree */ + struct rb_node rb_node; + union { + struct { + unsigned int fofs; + unsigned int len; + u32 blk; + }; + struct extent_info ei; /* extent info */ + + }; struct list_head list; /* node in global extent list of sbi */ - struct extent_info ei; /* extent info */ struct extent_tree *et; /* extent tree pointer */ }; @@ -455,12 +586,13 @@ struct f2fs_map_blocks { }; /* for flag in get_data_block */ -#define F2FS_GET_BLOCK_READ 0 -#define F2FS_GET_BLOCK_DIO 1 -#define F2FS_GET_BLOCK_FIEMAP 2 -#define F2FS_GET_BLOCK_BMAP 3 -#define F2FS_GET_BLOCK_PRE_DIO 4 -#define F2FS_GET_BLOCK_PRE_AIO 5 +enum { + F2FS_GET_BLOCK_DEFAULT, + F2FS_GET_BLOCK_FIEMAP, + F2FS_GET_BLOCK_BMAP, + F2FS_GET_BLOCK_PRE_DIO, + F2FS_GET_BLOCK_PRE_AIO, +}; /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. @@ -502,16 +634,29 @@ struct f2fs_inode_info { atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ + struct task_struct *task; /* lookup and create consistency */ + struct task_struct *cp_task; /* separate cp/wb IO stats*/ nid_t i_xattr_nid; /* node id that contains xattrs */ - unsigned long long xattr_ver; /* cp version of xattr modification */ loff_t last_disk_size; /* lastly written file size */ +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; + + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ + struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ + struct rw_semaphore i_mmap_sem; + struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ + + int i_extra_isize; /* size of extra space located in i_addr */ + kprojid_t i_projid; /* id for project quota */ }; static inline void get_extent_info(struct extent_info *ext, @@ -538,11 +683,22 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, ei->len = len; } -static inline bool __is_extent_same(struct extent_info *ei1, - struct extent_info *ei2) +static inline bool __is_discard_mergeable(struct discard_info *back, + struct discard_info *front) { - return (ei1->fofs == ei2->fofs && ei1->blk == ei2->blk && - ei1->len == ei2->len); + return back->lstart + back->len == front->lstart; +} + +static inline bool __is_discard_back_mergeable(struct discard_info *cur, + struct discard_info 
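/*
 * Reviewer's aside, not part of the diff: struct discard_cmd and struct
 * extent_node above both wrap an anonymous struct and a named mirror
 * (discard_info / extent_info) in a union, so one allocation can be read
 * field-by-field or handed around as a unit without copying. A standalone
 * C11 sketch of the same aliasing trick (range_info/range_node are made-up
 * names):
 *
 *	#include <stdio.h>
 *
 *	struct range_info {
 *		unsigned int ofs;
 *		unsigned int len;
 *	};
 *
 *	struct range_node {
 *		union {
 *			struct {		// direct, per-field view
 *				unsigned int ofs;
 *				unsigned int len;
 *			};
 *			struct range_info ri;	// aggregate view
 *		};
 *	};
 *
 *	int main(void)
 *	{
 *		struct range_node n;
 *
 *		n.ri = (struct range_info){ .ofs = 8, .len = 4 };
 *		printf("%u %u\n", n.ofs, n.len);	// prints: 8 4
 *		return 0;
 *	}
 */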
*back) +{ + return __is_discard_mergeable(back, cur); +} + +static inline bool __is_discard_front_mergeable(struct discard_info *cur, + struct discard_info *front) +{ + return __is_discard_mergeable(cur, front); } static inline bool __is_extent_mergeable(struct extent_info *back, @@ -564,7 +720,7 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -extern void f2fs_mark_inode_dirty_sync(struct inode *, bool); +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline void __try_update_largest_extent(struct inode *inode, struct extent_tree *et, struct extent_node *en) { @@ -596,6 +752,7 @@ struct f2fs_nm_info { struct list_head nat_entries; /* cached nat entry list (clean) */ unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ + unsigned int nat_blocks; /* # of nat blocks */ /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ @@ -603,9 +760,17 @@ struct f2fs_nm_info { unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ + unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; + unsigned char *nat_block_bitmap; + unsigned short *free_nid_count; /* free nid count of NAT block */ /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ + + unsigned int nat_bits_blocks; /* # of nat bits blocks */ + unsigned char *nat_bits; /* NAT bits blocks */ + unsigned char *full_nat_bits; /* full NAT pages */ + unsigned char *empty_nat_bits; /* empty NAT pages */ #ifdef CONFIG_F2FS_CHECK_FS char *nat_bitmap_mir; /* NAT bitmap mirror */ #endif @@ -676,7 +841,8 @@ struct flush_cmd { struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ - atomic_t submit_flush; /* # of issued flushes */ + atomic_t issued_flush; /* # of issued flushes */ + atomic_t issing_flush; /* # of issing flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -699,12 +865,6 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; - /* for small discard management */ - struct list_head discard_entry_list; /* 4KB discard entry list */ - struct list_head discard_cmd_list; /* discard cmd list */ - int nr_discards; /* # of discards in the list */ - int max_discards; /* max. discards to be issued */ - /* for batched trimming */ unsigned int trim_sections; /* # of sections to trim */ @@ -713,9 +873,13 @@ struct f2fs_sm_info { unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ + unsigned int min_hot_blocks; /* threshold for hot block allocation */ /* for flush command control */ struct flush_cmd_control *fcc_info; + + /* for discard command control */ + struct discard_cmd_control *dcc_info; }; /* @@ -760,29 +924,68 @@ enum page_type { META_FLUSH, INMEM, /* the below types are used by tracepoints only. 
*/ INMEM_DROP, + INMEM_INVALIDATE, INMEM_REVOKE, IPU, OPU, }; +enum temp_type { + HOT = 0, /* must be zero for meta bio */ + WARM, + COLD, + NR_TEMP_TYPE, +}; + +enum need_lock_type { + LOCK_REQ = 0, + LOCK_DONE, + LOCK_RETRY, +}; + +enum iostat_type { + APP_DIRECT_IO, /* app direct IOs */ + APP_BUFFERED_IO, /* app buffered IOs */ + APP_WRITE_IO, /* app write IOs */ + APP_MAPPED_IO, /* app mapped IOs */ + FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ + FS_META_IO, /* meta IOs from kworker/reclaimer */ + FS_GC_DATA_IO, /* data IOs from foreground gc */ + FS_GC_NODE_IO, /* node IOs from foreground gc */ + FS_CP_DATA_IO, /* data IOs from checkpoint */ + FS_CP_NODE_IO, /* node IOs from checkpoint */ + FS_CP_META_IO, /* meta IOs from checkpoint */ + FS_DISCARD, /* discard */ + NR_IO_TYPE, +}; + struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + enum temp_type temp; /* contains HOT/WARM/COLD */ int op; /* contains REQ_OP_ */ int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ + struct list_head list; /* serialize IOs */ + bool submitted; /* indicate IO submission */ + int need_lock; /* indicate we need to lock cp_rwsem */ + bool in_list; /* indicate fio is in io_list */ + enum iostat_type io_type; /* io type */ }; -#define is_read_io(rw) (rw == READ) +#define is_read_io(rw) ((rw) == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info.
*/ struct rw_semaphore io_rwsem; /* blocking op for bio */ + spinlock_t io_lock; /* serialize DATA/NODE IOs */ + struct list_head io_list; /* track fios */ }; #define FDEV(i) (sbi->devs[i]) @@ -830,10 +1033,6 @@ enum { MAX_TIME, }; -#ifdef CONFIG_F2FS_FS_ENCRYPTION -#define F2FS_KEY_DESC_PREFIX "f2fs:" -#define F2FS_KEY_DESC_PREFIX_SIZE 5 -#endif struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -841,11 +1040,6 @@ struct f2fs_sb_info { int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ -#ifdef CONFIG_F2FS_FS_ENCRYPTION - u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; - u8 key_prefix_size; -#endif - #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ @@ -859,9 +1053,9 @@ struct f2fs_sb_info { struct f2fs_sm_info *sm_info; /* segment manager */ /* for bio operations */ - struct f2fs_bio_info read_io; /* for read bios */ - struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ - struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ + struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ + struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; + /* bio ordering for NODE/DATA */ int write_io_size_bits; /* Write IO size bits */ mempool_t *write_io_dummy; /* Dummy pages */ @@ -873,6 +1067,7 @@ struct f2fs_sb_info { struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ + struct rw_semaphore node_change; /* locking node change */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ @@ -888,7 +1083,7 @@ struct f2fs_sb_info { /* for extent tree cache */ struct radix_tree_root extent_tree_root;/* cache extent cache entries */ - struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ + struct mutex extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ atomic_t total_ext_tree; /* extent tree count */ @@ -918,6 +1113,8 @@ struct f2fs_sb_info { block_t total_valid_block_count; /* # of valid blocks */ block_t discard_blks; /* discard command candidates */ block_t last_valid_block_count; /* for recovery */ + block_t reserved_blocks; /* configurable reserved blocks */ + u32 s_next_generation; /* for NFS support */ /* # of pages, see count_type */ @@ -925,6 +1122,9 @@ struct f2fs_sb_info { /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; + /* writeback control */ + atomic_t wb_sync_req; /* count # of WB_SYNC threads */ + /* valid inode count */ struct percpu_counter total_valid_inode_count; @@ -935,6 +1135,9 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ + /* threshold for converting bg victims for fg */ + u64 fggc_threshold; + /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; @@ -955,13 +1158,19 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t aw_cnt; /* # of atomic writes */ + atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ + atomic_t
max_vw_cnt; /* max # of volatile writes */ int bg_gc; /* background gc calls */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif - unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ + /* For app/fs IO statistics */ + spinlock_t iostat_lock; + unsigned long long write_iostat[NR_IO_TYPE]; + bool iostat_enable; + /* For sysfs support */ struct kobject s_kobj; struct completion s_kobj_unregister; @@ -980,13 +1189,26 @@ struct f2fs_sb_info { /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_chksum_seed; + /* For fault injection */ #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info fault_info; #endif + +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char *s_qf_names[MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif }; #ifdef CONFIG_F2FS_FAULT_INJECTION +#define f2fs_show_injection_info(type) \ + printk("%sF2FS-fs : inject %s in %s of %pF\n", \ + KERN_INFO, fault_name[type], \ + __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { struct f2fs_fault_info *ffi = &sbi->fault_info; @@ -1000,10 +1222,6 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) atomic_inc(&ffi->inject_ops); if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { atomic_set(&ffi->inject_ops, 0); - printk("%sF2FS-fs : inject %s in %pF\n", - KERN_INFO, - fault_name[type], - __builtin_return_address(0)); return true; } return false; @@ -1014,8 +1232,8 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) * and the return value is in kbytes. s is of struct f2fs_sb_info.
*/ #define BD_PART_WRITTEN(s) \ -(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \ - s->sectors_written_start) >> 1) +(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) - \ + (s)->sectors_written_start) >> 1) static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) { @@ -1068,6 +1286,27 @@ static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, return f2fs_crc32(sbi, buf, buf_size) == blk_crc; } +static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + int err; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -1168,6 +1407,12 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } +static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) +{ + size_t crc_offset = le32_to_cpu(cp->checksum_offset); + return le32_to_cpu(*((__le32 *)((unsigned char *)cp + crc_offset))); +} + static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); @@ -1191,9 +1436,11 @@ static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - spin_lock(&sbi->cp_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); __set_ckpt_flags(F2FS_CKPT(sbi), f); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) @@ -1207,9 +1454,34 @@ static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - spin_lock(&sbi->cp_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), f); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) +{ + unsigned long flags; + + set_sbi_flag(sbi, SBI_NEED_FSCK); + + if (lock) + spin_lock_irqsave(&sbi->cp_lock, flags); + __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); + kfree(NM_I(sbi)->nat_bits); + NM_I(sbi)->nat_bits = NULL; + if (lock) + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + + return (cpc) ? 
(cpc->reason & CP_UMOUNT) && set : set; } static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) @@ -1217,6 +1489,11 @@ static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) down_read(&sbi->cp_rwsem); } +static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) +{ + return down_read_trylock(&sbi->cp_rwsem); +} + static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { up_read(&sbi->cp_rwsem); @@ -1245,7 +1522,7 @@ static inline int __get_cp_reason(struct f2fs_sb_info *sbi) static inline bool __remain_node_summaries(int reason) { - return (reason == CP_UMOUNT || reason == CP_FASTBOOT); + return (reason & (CP_UMOUNT | CP_FASTBOOT)); } static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) @@ -1266,17 +1543,14 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) return 0; } -#define F2FS_DEFAULT_ALLOCATED_BLOCKS 1 - /* * Check whether the inode has blocks or not */ static inline int F2FS_HAS_BLOCKS(struct inode *inode) { - if (F2FS_I(inode)->i_xattr_nid) - return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1; - else - return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; + block_t xattr_block = F2FS_I(inode)->i_xattr_nid ? 1 : 0; + + return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > xattr_block; } static inline bool f2fs_has_xattr_block(unsigned int ofs) @@ -1284,15 +1558,24 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } -static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool); -static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, +static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); +static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count) { - blkcnt_t diff; + blkcnt_t diff = 0, release = 0; + block_t avail_user_block_count; + int ret; + + ret = dquot_reserve_block(inode, *count); + if (ret) + return ret; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_BLOCK)) - return false; + if (time_to_inject(sbi, FAULT_BLOCK)) { + f2fs_show_injection_info(FAULT_BLOCK); + release = *count; + goto enospc; + } #endif /* * let's increase this in prior to actual block count change in order @@ -1302,32 +1585,42 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); sbi->total_valid_block_count += (block_t)(*count); - if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) { - diff = sbi->total_valid_block_count - sbi->user_block_count; + avail_user_block_count = sbi->user_block_count - sbi->reserved_blocks; + if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { + diff = sbi->total_valid_block_count - avail_user_block_count; *count -= diff; - sbi->total_valid_block_count = sbi->user_block_count; + release = diff; + sbi->total_valid_block_count = avail_user_block_count; if (!*count) { spin_unlock(&sbi->stat_lock); percpu_counter_sub(&sbi->alloc_valid_block_count, diff); - return false; + goto enospc; } } spin_unlock(&sbi->stat_lock); - f2fs_i_blocks_write(inode, *count, true); - return true; + if (release) + dquot_release_reservation_block(inode, release); + f2fs_i_blocks_write(inode, *count, true, true); + return 0; + +enospc: + dquot_release_reservation_block(inode, release); + return -ENOSPC; } static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, - blkcnt_t count) + block_t count) { + blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK; + spin_lock(&sbi->stat_lock); 
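/*
 * Reviewer's aside, not part of the diff: with the quota hooks added above,
 * inc_valid_block_count() follows the usual reserve-then-claim discipline:
 * dquot_reserve_block() reserves the whole request up front, the grant is
 * trimmed to what fits under (user_block_count - reserved_blocks), the
 * reservation for the ungranted remainder is released, and only the blocks
 * actually granted are claimed. A standalone sketch of that flow, with
 * plain counters standing in for the dquot_* calls and the sb totals:
 *
 *	#include <stdio.h>
 *
 *	static unsigned long long reserved, claimed;
 *
 *	static int alloc_blocks(unsigned long long *count,
 *				unsigned long long avail)
 *	{
 *		unsigned long long release = 0;
 *
 *		reserved += *count;		// dquot_reserve_block()
 *		if (*count > avail) {		// partial grant under pressure
 *			release = *count - avail;
 *			*count = avail;
 *		}
 *		reserved -= release;	// dquot_release_reservation_block()
 *		if (!*count)
 *			return -1;		// nothing granted: -ENOSPC
 *		reserved -= *count;		// dquot_claim_block() turns the
 *		claimed += *count;		// reservation into real usage
 *		return 0;
 *	}
 *
 *	int main(void)
 *	{
 *		unsigned long long count = 8;
 *
 *		alloc_blocks(&count, 5);
 *		printf("%llu %llu %llu\n", count, reserved, claimed);
 *		return 0;			// prints: 5 0 5
 *	}
 */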
f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); - f2fs_bug_on(sbi, inode->i_blocks < count); + f2fs_bug_on(sbi, inode->i_blocks < sectors); sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); - f2fs_i_blocks_write(inode, count, false); + f2fs_i_blocks_write(inode, count, false, true); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -1456,51 +1749,70 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } -static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode) +static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, + struct inode *inode, bool is_inode) { block_t valid_block_count; unsigned int valid_node_count; + bool quota = inode && !is_inode; + + if (quota) { + int ret = dquot_reserve_block(inode, 1); + if (ret) + return ret; + } spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - if (unlikely(valid_block_count > sbi->user_block_count)) { + if (unlikely(valid_block_count + sbi->reserved_blocks > + sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); - return false; + goto enospc; } valid_node_count = sbi->total_valid_node_count + 1; if (unlikely(valid_node_count > sbi->total_node_count)) { spin_unlock(&sbi->stat_lock); - return false; + goto enospc; } - if (inode) - f2fs_i_blocks_write(inode, 1, true); - sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); + if (inode) { + if (is_inode) + f2fs_mark_inode_dirty_sync(inode, true); + else + f2fs_i_blocks_write(inode, 1, true, true); + } + percpu_counter_inc(&sbi->alloc_valid_block_count); - return true; + return 0; + +enospc: + if (quota) + dquot_release_reservation_block(inode, 1); + return -ENOSPC; } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode) + struct inode *inode, bool is_inode) { spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, !sbi->total_valid_block_count); f2fs_bug_on(sbi, !sbi->total_valid_node_count); - f2fs_bug_on(sbi, !inode->i_blocks); + f2fs_bug_on(sbi, !is_inode && !inode->i_blocks); - f2fs_i_blocks_write(inode, 1, false); sbi->total_valid_node_count--; sbi->total_valid_block_count--; spin_unlock(&sbi->stat_lock); + + if (!is_inode) + f2fs_i_blocks_write(inode, 1, false, true); } static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) @@ -1528,11 +1840,14 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, { #ifdef CONFIG_F2FS_FAULT_INJECTION struct page *page = find_lock_page(mapping, index); + if (page) return page; - if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { + f2fs_show_injection_info(FAULT_PAGE_ALLOC); return NULL; + } #endif if (!for_write) return grab_cache_page(mapping, index); @@ -1611,22 +1926,42 @@ static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, static inline bool IS_INODE(struct page *page) { struct f2fs_node *p = F2FS_NODE(page); + return RAW_IS_INODE(p); } +static inline int offset_in_addr(struct f2fs_inode *i) +{ + return (i->i_inline & F2FS_EXTRA_ATTR) ? + (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0; +} + static inline __le32 *blkaddr_in_node(struct f2fs_node *node) { return RAW_IS_INODE(node) ? 
node->i.i_addr : node->dn.addr; } -static inline block_t datablock_addr(struct page *node_page, - unsigned int offset) +static inline int f2fs_has_extra_attr(struct inode *inode); +static inline block_t datablock_addr(struct inode *inode, + struct page *node_page, unsigned int offset) { struct f2fs_node *raw_node; __le32 *addr_array; + int base = 0; + bool is_inode = IS_INODE(node_page); + raw_node = F2FS_NODE(node_page); + + /* from GC path only */ + if (!inode) { + if (is_inode) + base = offset_in_addr(&raw_node->i); + } else if (f2fs_has_extra_attr(inode) && is_inode) { + base = get_extra_isize(inode); + } + addr_array = blkaddr_in_node(raw_node); - return le32_to_cpu(addr_array[offset]); + return le32_to_cpu(addr_array[base + offset]); } static inline int f2fs_test_bit(unsigned int nr, char *addr) @@ -1689,6 +2024,20 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) *addr ^= mask; } +#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) +#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) +#define F2FS_FL_INHERITED (FS_PROJINHERIT_FL) + +static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & F2FS_REG_FLMASK; + else + return flags & F2FS_OTHER_FLMASK; +} + /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ @@ -1715,6 +2064,10 @@ enum { FI_INLINE_DOTS, /* indicate inline dot dentries */ FI_DO_DEFRAG, /* indicate defragment is running */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ + FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_HOT_DATA, /* indicate file is hot */ + FI_EXTRA_ATTR, /* indicate file has extra attribute */ + FI_PROJ_INHERIT, /* indicate file inherits projectid */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -1768,13 +2121,21 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) } static inline void f2fs_i_blocks_write(struct inode *inode, - blkcnt_t diff, bool add) + block_t diff, bool add, bool claim) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); - inode->i_blocks = add ? 
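/*
 * Reviewer's aside, not part of the diff: datablock_addr() above now takes
 * the inode so it can skip the extra-attribute words that occupy the front
 * of i_addr[] when F2FS_EXTRA_ATTR is set: the base index is simply
 * i_extra_isize divided by the __le32 slot width. A toy, self-contained
 * sketch (the array contents are made up):
 *
 *	#include <stdio.h>
 *
 *	static unsigned int i_addr[8] = { 111, 222, 333, 444 };
 *
 *	static unsigned int datablock_addr(int extra_isize, unsigned int ofs)
 *	{
 *		int base = extra_isize / 4;	// __le32 slots to skip
 *
 *		return i_addr[base + ofs];
 *	}
 *
 *	int main(void)
 *	{
 *		// Without extra attributes block 0 lives at i_addr[0];
 *		// with 8 bytes of extra attributes it moves to i_addr[2].
 *		printf("%u %u\n", datablock_addr(0, 0), datablock_addr(8, 0));
 *		return 0;			// prints: 111 333
 *	}
 */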
inode->i_blocks + diff : - inode->i_blocks - diff; + /* add = 1, claim = 1 should be dquot_reserve_block in pair */ + if (add) { + if (claim) + dquot_claim_block(inode, diff); + else + dquot_alloc_block_nofail(inode, diff); + } else { + dquot_free_block(inode, diff); + } + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); @@ -1826,6 +2187,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) set_bit(FI_INLINE_DOTS, &fi->flags); + if (ri->i_inline & F2FS_EXTRA_ATTR) + set_bit(FI_EXTRA_ATTR, &fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -1842,6 +2205,13 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_DATA_EXIST; if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; + if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) + ri->i_inline |= F2FS_EXTRA_ATTR; +} + +static inline int f2fs_has_extra_attr(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_EXTRA_ATTR); } static inline int f2fs_has_inline_xattr(struct inode *inode) @@ -1852,13 +2222,14 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { if (f2fs_has_inline_xattr(inode)) - return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; - return DEF_ADDRS_PER_INODE; + return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS; + return CUR_ADDRS_PER_INODE(inode); } static inline void *inline_xattr_addr(struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); + return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS]); } @@ -1876,12 +2247,6 @@ static inline int f2fs_has_inline_data(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DATA); } -static inline void f2fs_clear_inline_inode(struct inode *inode) -{ - clear_inode_flag(inode, FI_INLINE_DATA); - clear_inode_flag(inode, FI_DATA_EXIST); -} - static inline int f2fs_exist_data(struct inode *inode) { return is_inode_flag_set(inode, FI_DATA_EXIST); @@ -1917,10 +2282,12 @@ static inline bool f2fs_is_drop_cache(struct inode *inode) return is_inode_flag_set(inode, FI_DROP_CACHE); } -static inline void *inline_data_addr(struct page *page) +static inline void *inline_data_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); - return (void *)&(ri->i_addr[1]); + int extra_size = get_extra_isize(inode); + + return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]); } static inline int f2fs_has_inline_dentry(struct inode *inode) @@ -2003,13 +2370,15 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_KMALLOC)) + if (time_to_inject(sbi, FAULT_KMALLOC)) { + f2fs_show_injection_info(FAULT_KMALLOC); return NULL; + } #endif return kmalloc(size, flags); } -static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) +static inline void *kvmalloc(size_t size, gfp_t flags) { void *ret; @@ -2019,7 +2388,7 @@ static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) return ret; } -static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) +static inline void *kvzalloc(size_t size, gfp_t flags) { void *ret; @@ -2029,42 +2398,79 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) return ret; } +static inline int get_extra_isize(struct inode *inode) +{ + return 
F2FS_I(inode)->i_extra_isize / sizeof(__le32); +} + #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) -/* get offset of first page in next direct node */ -#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \ - ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \ - (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \ - ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode)) +#define F2FS_TOTAL_EXTRA_ATTR_SIZE \ + (offsetof(struct f2fs_inode, i_extra_end) - \ + offsetof(struct f2fs_inode, i_extra_isize)) \ + +#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) +#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ + ((offsetof(typeof(*f2fs_inode), field) + \ + sizeof((f2fs_inode)->field)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + +static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) +{ + int i; + + spin_lock(&sbi->iostat_lock); + for (i = 0; i < NR_IO_TYPE; i++) + sbi->write_iostat[i] = 0; + spin_unlock(&sbi->iostat_lock); +} + +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) +{ + if (!sbi->iostat_enable) + return; + spin_lock(&sbi->iostat_lock); + sbi->write_iostat[type] += io_bytes; + + if (type == APP_WRITE_IO || type == APP_DIRECT_IO) + sbi->write_iostat[APP_BUFFERED_IO] = + sbi->write_iostat[APP_WRITE_IO] - + sbi->write_iostat[APP_DIRECT_IO]; + spin_unlock(&sbi->iostat_lock); +} /* * file.c */ -int f2fs_sync_file(struct file *, loff_t, loff_t, int); -void truncate_data_blocks(struct dnode_of_data *); -int truncate_blocks(struct inode *, u64, bool); -int f2fs_truncate(struct inode *); -int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); -int f2fs_setattr(struct dentry *, struct iattr *); -int truncate_hole(struct inode *, pgoff_t, pgoff_t); -int truncate_data_blocks_range(struct dnode_of_data *, int); -long f2fs_ioctl(struct file *, unsigned int, unsigned long); -long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +void truncate_data_blocks(struct dnode_of_data *dn); +int truncate_blocks(struct inode *inode, u64 from, bool lock); +int f2fs_truncate(struct inode *inode); +int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +int f2fs_setattr(struct dentry *dentry, struct iattr *attr); +int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); +int truncate_data_blocks_range(struct dnode_of_data *dn, int count); +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* * inode.c */ -void f2fs_set_inode_flags(struct inode *); -struct inode *f2fs_iget(struct super_block *, unsigned long); -struct inode *f2fs_iget_retry(struct super_block *, unsigned long); -int try_to_free_nats(struct f2fs_sb_info *, int); -int update_inode(struct inode *, struct page *); -int update_inode_page(struct inode *); -int f2fs_write_inode(struct inode *, struct writeback_control *); -void f2fs_evict_inode(struct inode *); -void handle_failed_inode(struct inode *); +void f2fs_set_inode_flags(struct inode *inode); +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); +struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); +struct inode *f2fs_iget_retry(struct super_block 
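/*
 * Reviewer's aside, not part of the diff: F2FS_FITS_IN_INODE() above
 * decides, with nothing but offsetof()/sizeof() arithmetic, whether an
 * on-disk field ends inside the space this inode really carries (the fixed
 * pre-i_addr area plus its recorded extra size). The same test against a
 * toy layout; struct toy_inode and its fields are invented for
 * illustration:
 *
 *	#include <stddef.h>
 *	#include <stdio.h>
 *
 *	struct toy_inode {
 *		int i_mode;		// fixed, always-present attributes...
 *		int i_extra_isize;	// ...end here
 *		int i_projid;		// lives in the optional extra area
 *	};
 *
 *	#define OLD_SIZE	offsetof(struct toy_inode, i_projid)
 *	#define FITS(field, extra)				\
 *		(offsetof(struct toy_inode, field) +		\
 *		 sizeof(((struct toy_inode *)0)->field) <=	\
 *		 OLD_SIZE + (extra))
 *
 *	int main(void)
 *	{
 *		// with 4 extra bytes i_projid fits, with none it does not
 *		printf("%d %d\n", FITS(i_projid, 4), FITS(i_projid, 0));
 *		return 0;			// prints: 1 0
 *	}
 */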
*sb, unsigned long ino); +int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); +int update_inode(struct inode *inode, struct page *node_page); +int update_inode_page(struct inode *inode); +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); +void f2fs_evict_inode(struct inode *inode); +void handle_failed_inode(struct inode *inode); /* * namei.c @@ -2074,40 +2480,45 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -void set_de_type(struct f2fs_dir_entry *, umode_t); -unsigned char get_de_type(struct f2fs_dir_entry *); -struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, - f2fs_hash_t, int *, struct f2fs_dentry_ptr *); -int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, - unsigned int, struct fscrypt_str *); -void do_make_empty_dir(struct inode *, struct inode *, - struct f2fs_dentry_ptr *); -struct page *init_inode_metadata(struct inode *, struct inode *, - const struct qstr *, const struct qstr *, struct page *); -void update_parent_metadata(struct inode *, struct inode *, unsigned int); -int room_for_filename(const void *, int, int); -void f2fs_drop_nlink(struct inode *, struct inode *); -struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *, - struct page **); -struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *, - struct page **); -struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); -ino_t f2fs_inode_by_name(struct inode *, const struct qstr *, struct page **); -void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, - struct page *, struct inode *); -int update_dent_inode(struct inode *, struct inode *, const struct qstr *); -void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, - const struct qstr *, f2fs_hash_t , unsigned int); -int f2fs_add_regular_entry(struct inode *, const struct qstr *, - const struct qstr *, struct inode *, nid_t, umode_t); -int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *, - nid_t, umode_t); -int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, - umode_t); -void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, - struct inode *); -int f2fs_do_tmpfile(struct inode *, struct inode *); -bool f2fs_empty_dir(struct inode *); +void set_de_type(struct f2fs_dir_entry *de, umode_t mode); +unsigned char get_de_type(struct f2fs_dir_entry *de); +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, + f2fs_hash_t namehash, int *max_slots, + struct f2fs_dentry_ptr *d); +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, + unsigned int start_pos, struct fscrypt_str *fstr); +void do_make_empty_dir(struct inode *inode, struct inode *parent, + struct f2fs_dentry_ptr *d); +struct page *init_inode_metadata(struct inode *inode, struct inode *dir, + const struct qstr *new_name, + const struct qstr *orig_name, struct page *dpage); +void update_parent_metadata(struct inode *dir, struct inode *inode, + unsigned int current_depth); +int room_for_filename(const void *bitmap, int slots, int max_slots); +void f2fs_drop_nlink(struct inode *dir, struct inode *inode); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + const struct qstr *child, struct page **res_page); +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p); +ino_t f2fs_inode_by_name(struct 
inode *dir, const struct qstr *qstr, + struct page **page); +void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, + struct page *page, struct inode *inode); +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, + const struct qstr *name, f2fs_hash_t name_hash, + unsigned int bit_pos); +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); +int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, + struct inode *inode, nid_t ino, umode_t mode); +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir); +bool f2fs_empty_dir(struct inode *dir); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { @@ -2118,18 +2529,21 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ -int f2fs_inode_dirtied(struct inode *, bool); -void f2fs_inode_synced(struct inode *); -int f2fs_commit_super(struct f2fs_sb_info *, bool); -int f2fs_sync_fs(struct super_block *, int); +int f2fs_inode_dirtied(struct inode *inode, bool sync); +void f2fs_inode_synced(struct inode *inode); +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi); +void f2fs_quota_off_umount(struct super_block *sb); +int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); +int f2fs_sync_fs(struct super_block *sb, int sync); extern __printf(3, 4) -void f2fs_msg(struct super_block *, const char *, const char *, ...); +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname); /* * node.c @@ -2137,164 +2551,190 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *); struct dnode_of_data; struct node_info; -bool available_free_memory(struct f2fs_sb_info *, int); -int need_dentry_mark(struct f2fs_sb_info *, nid_t); -bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); -bool need_inode_block_update(struct f2fs_sb_info *, nid_t); -void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); -pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t); -int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); -int truncate_inode_blocks(struct inode *, pgoff_t); -int truncate_xattr_node(struct inode *, struct page *); -int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); -int remove_inode_page(struct inode *); -struct page *new_inode_page(struct inode *); -struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); -void ra_node_page(struct f2fs_sb_info *, nid_t); -struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_node_page_ra(struct page *, int); -void move_node_page(struct page *, int); -int fsync_node_pages(struct f2fs_sb_info *, struct inode *, - struct writeback_control *, bool); -int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); -void build_free_nids(struct f2fs_sb_info *, bool); -bool alloc_nid(struct f2fs_sb_info *, nid_t *); -void alloc_nid_done(struct f2fs_sb_info *, nid_t); -void alloc_nid_failed(struct f2fs_sb_info *, nid_t); -int try_to_free_nids(struct f2fs_sb_info *, 
int); -void recover_inline_xattr(struct inode *, struct page *); -void recover_xattr_data(struct inode *, struct page *, block_t); -int recover_inode_page(struct f2fs_sb_info *, struct page *); -int restore_node_summary(struct f2fs_sb_info *, unsigned int, - struct f2fs_summary_block *); -void flush_nat_entries(struct f2fs_sb_info *); -int build_node_manager(struct f2fs_sb_info *); -void destroy_node_manager(struct f2fs_sb_info *); +bool available_free_memory(struct f2fs_sb_info *sbi, int type); +int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); +bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); +bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); +void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); +pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); +int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); +int truncate_inode_blocks(struct inode *inode, pgoff_t from); +int truncate_xattr_node(struct inode *inode, struct page *page); +int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); +int remove_inode_page(struct inode *inode); +struct page *new_inode_page(struct inode *inode); +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs); +void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct page *get_node_page_ra(struct page *parent, int start); +void move_node_page(struct page *node_page, int gc_type); +int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic); +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type); +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); +bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); +void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); +void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); +int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); +void recover_inline_xattr(struct inode *inode, struct page *page); +int recover_xattr_data(struct inode *inode, struct page *page, + block_t blkaddr); +int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); +int restore_node_summary(struct f2fs_sb_info *sbi, + unsigned int segno, struct f2fs_summary_block *sum); +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int build_node_manager(struct f2fs_sb_info *sbi); +void destroy_node_manager(struct f2fs_sb_info *sbi); int __init create_node_manager_caches(void); void destroy_node_manager_caches(void); /* * segment.c */ -void register_inmem_page(struct inode *, struct page *); -void drop_inmem_pages(struct inode *); -int commit_inmem_pages(struct inode *); -void f2fs_balance_fs(struct f2fs_sb_info *, bool); -void f2fs_balance_fs_bg(struct f2fs_sb_info *); -int f2fs_issue_flush(struct f2fs_sb_info *); -int create_flush_cmd_control(struct f2fs_sb_info *); -void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); -void invalidate_blocks(struct f2fs_sb_info *, block_t); -bool is_checkpointed_data(struct f2fs_sb_info *, block_t); -void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); -void f2fs_wait_discard_bio(struct f2fs_sb_info *, block_t); -void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); -void release_discard_addrs(struct f2fs_sb_info *); -int npages_for_summary_flush(struct f2fs_sb_info *, bool); -void 
allocate_new_segments(struct f2fs_sb_info *); -int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); -bool exist_trim_candidates(struct f2fs_sb_info *, struct cp_control *); -struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); -void update_meta_page(struct f2fs_sb_info *, void *, block_t); -void write_meta_page(struct f2fs_sb_info *, struct page *); -void write_node_page(unsigned int, struct f2fs_io_info *); -void write_data_page(struct dnode_of_data *, struct f2fs_io_info *); -void rewrite_data_page(struct f2fs_io_info *); -void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *, - block_t, block_t, bool, bool); -void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, - block_t, block_t, unsigned char, bool, bool); -void allocate_data_block(struct f2fs_sb_info *, struct page *, - block_t, block_t *, struct f2fs_summary *, int); -void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); -void write_data_summaries(struct f2fs_sb_info *, block_t); -void write_node_summaries(struct f2fs_sb_info *, block_t); -int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int); -void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); -int build_segment_manager(struct f2fs_sb_info *); -void destroy_segment_manager(struct f2fs_sb_info *); +bool need_SSR(struct f2fs_sb_info *sbi); +void register_inmem_page(struct inode *inode, struct page *page); +void drop_inmem_pages(struct inode *inode); +void drop_inmem_page(struct inode *inode, struct page *page); +int commit_inmem_pages(struct inode *inode); +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); +int f2fs_issue_flush(struct f2fs_sb_info *sbi); +int create_flush_cmd_control(struct f2fs_sb_info *sbi); +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); +void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); +bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); +void stop_discard_thread(struct f2fs_sb_info *sbi); +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void release_discard_addrs(struct f2fs_sb_info *sbi); +int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +void allocate_new_segments(struct f2fs_sb_info *sbi); +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); +bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); +struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); +void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type); +void write_node_page(unsigned int nid, struct f2fs_io_info *fio); +void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); +int rewrite_data_page(struct f2fs_io_info *fio); +void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr, + bool recover_curseg, bool recover_newaddr); +void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, + block_t old_addr, block_t new_addr, + unsigned char version, bool recover_curseg, + bool recover_newaddr); +void allocate_data_block(struct f2fs_sb_info *sbi, struct 
page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list); +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type, bool ordered); +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr); +void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, + unsigned int val, int alloc); +void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int build_segment_manager(struct f2fs_sb_info *sbi); +void destroy_segment_manager(struct f2fs_sb_info *sbi); int __init create_segment_manager_caches(void); void destroy_segment_manager_caches(void); /* * checkpoint.c */ -void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool); -struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); -bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); -int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); -void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); -long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void release_ino_entry(struct f2fs_sb_info *, bool); -bool exist_written_data(struct f2fs_sb_info *, nid_t, int); -int f2fs_sync_inode_meta(struct f2fs_sb_info *); -int acquire_orphan_inode(struct f2fs_sb_info *); -void release_orphan_inode(struct f2fs_sb_info *); -void add_orphan_inode(struct inode *); -void remove_orphan_inode(struct f2fs_sb_info *, nid_t); -int recover_orphan_inodes(struct f2fs_sb_info *); -int get_valid_checkpoint(struct f2fs_sb_info *); -void update_dirty_page(struct inode *, struct page *); -void remove_dirty_inode(struct inode *); -int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); -int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); -void init_ino_entry_info(struct f2fs_sb_info *); +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); +bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); +int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync); +void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); +long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, + long nr_to_write, enum iostat_type io_type); +void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void release_ino_entry(struct f2fs_sb_info *sbi, bool all); +bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); +int acquire_orphan_inode(struct f2fs_sb_info *sbi); +void release_orphan_inode(struct f2fs_sb_info *sbi); +void add_orphan_inode(struct inode *inode); +void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); +int recover_orphan_inodes(struct f2fs_sb_info *sbi); +int get_valid_checkpoint(struct f2fs_sb_info *sbi); +void update_dirty_page(struct inode *inode, struct 
page *page); +void remove_dirty_inode(struct inode *inode); +int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void init_ino_entry_info(struct f2fs_sb_info *sbi); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); /* * data.c */ -void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, - struct page *, nid_t, enum page_type, int); -void f2fs_flush_merged_bios(struct f2fs_sb_info *); -int f2fs_submit_page_bio(struct f2fs_io_info *); -int f2fs_submit_page_mbio(struct f2fs_io_info *); -struct block_device *f2fs_target_device(struct f2fs_sb_info *, - block_t, struct bio *); -int f2fs_target_device_index(struct f2fs_sb_info *, block_t); -void set_data_blkaddr(struct dnode_of_data *); -void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); -int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); -int reserve_new_block(struct dnode_of_data *); -int f2fs_get_block(struct dnode_of_data *, pgoff_t); -int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); -int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); -struct page *find_data_page(struct inode *, pgoff_t); -struct page *get_lock_data_page(struct inode *, pgoff_t, bool); -struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); -int do_write_data_page(struct f2fs_io_info *); -int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); -int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); -void f2fs_set_page_dirty_nobuffers(struct page *); -void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); -int f2fs_release_page(struct page *, gfp_t); +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type); +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); +int f2fs_submit_page_bio(struct f2fs_io_info *fio); +int f2fs_submit_page_write(struct f2fs_io_info *fio); +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio); +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); +void set_data_blkaddr(struct dnode_of_data *dn); +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); +int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); +int reserve_new_block(struct dnode_of_data *dn); +int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); +struct page *get_read_data_page(struct inode *inode, pgoff_t index, + int op_flags, bool for_write); +struct page *find_data_page(struct inode *inode, pgoff_t index); +struct page *get_lock_data_page(struct inode *inode, pgoff_t index, + bool for_write); +struct page *get_new_data_page(struct inode *inode, + struct page *ipage, pgoff_t index, bool new_i_size); +int do_write_data_page(struct f2fs_io_info *fio); +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, + int create, int flag); +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); +void f2fs_set_page_dirty_nobuffers(struct page *page); 
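/*
 * Reviewer's aside, not part of the diff: f2fs_update_iostat(), added
 * earlier in this patch, never lets APP_BUFFERED_IO drift: it is recomputed
 * as total app writes minus direct writes whenever either of those counters
 * moves. A minimal model of that bookkeeping (the enum mirrors only the
 * three counters involved):
 *
 *	#include <stdio.h>
 *
 *	enum { APP_DIRECT_IO, APP_BUFFERED_IO, APP_WRITE_IO, NR_IO_TYPE };
 *	static unsigned long long write_iostat[NR_IO_TYPE];
 *
 *	static void update_iostat(int type, unsigned long long bytes)
 *	{
 *		write_iostat[type] += bytes;
 *		if (type == APP_WRITE_IO || type == APP_DIRECT_IO)
 *			write_iostat[APP_BUFFERED_IO] =
 *				write_iostat[APP_WRITE_IO] -
 *				write_iostat[APP_DIRECT_IO];
 *	}
 *
 *	int main(void)
 *	{
 *		update_iostat(APP_WRITE_IO, 4096);	// buffered write
 *		update_iostat(APP_WRITE_IO, 8192);	// a direct write counts
 *		update_iostat(APP_DIRECT_IO, 8192);	// toward both totals
 *		printf("%llu\n", write_iostat[APP_BUFFERED_IO]);
 *		return 0;			// prints: 4096
 *	}
 */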
+int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type); +void f2fs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length); +int f2fs_release_page(struct page *page, gfp_t wait); #ifdef CONFIG_MIGRATION -int f2fs_migrate_page(struct address_space *, struct page *, struct page *, - enum migrate_mode); +int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode); #endif /* * gc.c */ -int start_gc_thread(struct f2fs_sb_info *); -void stop_gc_thread(struct f2fs_sb_info *); -block_t start_bidx_of_node(unsigned int, struct inode *); -int f2fs_gc(struct f2fs_sb_info *, bool, bool); -void build_gc_manager(struct f2fs_sb_info *); +int start_gc_thread(struct f2fs_sb_info *sbi); +void stop_gc_thread(struct f2fs_sb_info *sbi); +block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode); +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, + unsigned int segno); +void build_gc_manager(struct f2fs_sb_info *sbi); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *, bool); -bool space_for_roll_forward(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); +bool space_for_roll_forward(struct f2fs_sb_info *sbi); /* * debug.c @@ -2311,11 +2751,15 @@ struct f2fs_stat_info { int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; + int nats, dirty_nats, sits, dirty_sits; + int free_nids, avail_nids, alloc_nids; int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; - int inline_xattr, inline_inode, inline_dir, orphans; - int aw_cnt, max_aw_cnt; + int nr_flushing, nr_flushed, nr_discarding, nr_discarded; + int nr_discard_cmd; + unsigned int undiscard_blks; + int inline_xattr, inline_inode, inline_dir, append, update, orphans; + int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -2388,9 +2832,9 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) #define stat_inc_atomic_write(inode) \ - (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)); + (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)) #define stat_dec_atomic_write(inode) \ - (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)); + (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)) #define stat_update_max_atomic_write(inode) \ do { \ int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \ @@ -2398,11 +2842,22 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) +#define stat_inc_volatile_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_dec_volatile_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_update_max_volatile_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ + } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ - (si)->tot_segs++; \ - if (type == SUM_TYPE_DATA) { \ + si->tot_segs++; \ + if ((type) == SUM_TYPE_DATA) { \ 
si->data_segs++; \ si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \ } else { \ @@ -2412,14 +2867,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } while (0) #define stat_inc_tot_blk_count(si, blks) \ - (si->tot_blks += (blks)) + ((si)->tot_blks += (blks)) #define stat_inc_data_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ - si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) #define stat_inc_node_blk_count(sbi, blks, gc_type) \ @@ -2427,40 +2882,43 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ - si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_node_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) -int f2fs_build_stats(struct f2fs_sb_info *); -void f2fs_destroy_stats(struct f2fs_sb_info *); +int f2fs_build_stats(struct f2fs_sb_info *sbi); +void f2fs_destroy_stats(struct f2fs_sb_info *sbi); int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else -#define stat_inc_cp_count(si) -#define stat_inc_bg_cp_count(si) -#define stat_inc_call_count(si) -#define stat_inc_bggc_count(si) -#define stat_inc_dirty_inode(sbi, type) -#define stat_dec_dirty_inode(sbi, type) -#define stat_inc_total_hit(sb) -#define stat_inc_rbtree_node_hit(sb) -#define stat_inc_largest_node_hit(sbi) -#define stat_inc_cached_node_hit(sbi) -#define stat_inc_inline_xattr(inode) -#define stat_dec_inline_xattr(inode) -#define stat_inc_inline_inode(inode) -#define stat_dec_inline_inode(inode) -#define stat_inc_inline_dir(inode) -#define stat_dec_inline_dir(inode) -#define stat_inc_atomic_write(inode) -#define stat_dec_atomic_write(inode) -#define stat_update_max_atomic_write(inode) -#define stat_inc_seg_type(sbi, curseg) -#define stat_inc_block_count(sbi, curseg) -#define stat_inc_inplace_blocks(sbi) -#define stat_inc_seg_count(sbi, type, gc_type) -#define stat_inc_tot_blk_count(si, blks) -#define stat_inc_data_blk_count(sbi, blks, gc_type) -#define stat_inc_node_blk_count(sbi, blks, gc_type) +#define stat_inc_cp_count(si) do { } while (0) +#define stat_inc_bg_cp_count(si) do { } while (0) +#define stat_inc_call_count(si) do { } while (0) +#define stat_inc_bggc_count(si) do { } while (0) +#define stat_inc_dirty_inode(sbi, type) do { } while (0) +#define stat_dec_dirty_inode(sbi, type) do { } while (0) +#define stat_inc_total_hit(sb) do { } while (0) +#define stat_inc_rbtree_node_hit(sb) do { } while (0) +#define stat_inc_largest_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_inline_xattr(inode) do { } while (0) +#define stat_dec_inline_xattr(inode) do { } while (0) +#define stat_inc_inline_inode(inode) do { } while (0) +#define stat_dec_inline_inode(inode) do { } while (0) +#define stat_inc_inline_dir(inode) do { } while (0) +#define stat_dec_inline_dir(inode) do { } while (0) +#define stat_inc_atomic_write(inode) do { } while (0) +#define stat_dec_atomic_write(inode) do { } while (0) +#define stat_update_max_atomic_write(inode) do { } while (0) +#define stat_inc_volatile_write(inode) do { } while (0) +#define stat_dec_volatile_write(inode) do { } while (0) +#define stat_update_max_volatile_write(inode) do { } while (0) +#define stat_inc_seg_type(sbi, curseg) do { } while (0) +#define 
stat_inc_block_count(sbi, curseg) do { } while (0) +#define stat_inc_inplace_blocks(sbi) do { } while (0) +#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_tot_blk_count(si, blks) do { } while (0) +#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0) +#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0) static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } @@ -2483,52 +2941,78 @@ extern struct kmem_cache *inode_entry_slab; /* * inline.c */ -bool f2fs_may_inline_data(struct inode *); -bool f2fs_may_inline_dentry(struct inode *); -void read_inline_data(struct page *, struct page *); -bool truncate_inline_inode(struct page *, u64); -int f2fs_read_inline_data(struct inode *, struct page *); -int f2fs_convert_inline_page(struct dnode_of_data *, struct page *); -int f2fs_convert_inline_inode(struct inode *); -int f2fs_write_inline_data(struct inode *, struct page *); -bool recover_inline_data(struct inode *, struct page *); -struct f2fs_dir_entry *find_in_inline_dir(struct inode *, - struct fscrypt_name *, struct page **); -int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); -int f2fs_add_inline_entry(struct inode *, const struct qstr *, - const struct qstr *, struct inode *, nid_t, umode_t); -void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, - struct inode *, struct inode *); -bool f2fs_empty_inline_dir(struct inode *); -int f2fs_read_inline_dir(struct file *, struct dir_context *, - struct fscrypt_str *); -int f2fs_inline_data_fiemap(struct inode *, - struct fiemap_extent_info *, __u64, __u64); +bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_may_inline_dentry(struct inode *inode); +void read_inline_data(struct page *page, struct page *ipage); +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from); +int f2fs_read_inline_data(struct inode *inode, struct page *page); +int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); +int f2fs_convert_inline_inode(struct inode *inode); +int f2fs_write_inline_data(struct inode *inode, struct page *page); +bool recover_inline_data(struct inode *inode, struct page *npage); +struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +int make_empty_inline_dir(struct inode *inode, struct inode *parent, + struct page *ipage); +int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +bool f2fs_empty_inline_dir(struct inode *dir); +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, + struct fscrypt_str *fstr); +int f2fs_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); /* * shrinker.c */ -unsigned long f2fs_shrink_count(struct shrinker *, struct shrink_control *); -unsigned long f2fs_shrink_scan(struct shrinker *, struct shrink_control *); -void f2fs_join_shrinker(struct f2fs_sb_info *); -void f2fs_leave_shrinker(struct f2fs_sb_info *); +unsigned long f2fs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); +unsigned long f2fs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); +void f2fs_join_shrinker(struct f2fs_sb_info *sbi); +void 
f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); -void f2fs_drop_extent_tree(struct inode *); -unsigned int f2fs_destroy_extent_node(struct inode *); -void f2fs_destroy_extent_tree(struct inode *); -bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); -void f2fs_update_extent_cache(struct dnode_of_data *); +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs); +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs); +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs, + struct rb_entry **prev_entry, struct rb_entry **next_entry, + struct rb_node ***insert_p, struct rb_node **insert_parent, + bool force); +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root); +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); +void f2fs_drop_extent_tree(struct inode *inode); +unsigned int f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_tree(struct inode *inode); +bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_extent_cache(struct dnode_of_data *dn); void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t, block_t, unsigned int); -void init_extent_cache_info(struct f2fs_sb_info *); + pgoff_t fofs, block_t blkaddr, unsigned int len); +void init_extent_cache_info(struct f2fs_sb_info *sbi); int __init create_extent_cache(void); void destroy_extent_cache(void); +/* + * sysfs.c + */ +int __init f2fs_init_sysfs(void); +void f2fs_exit_sysfs(void); +int f2fs_register_sysfs(struct f2fs_sb_info *sbi); +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); + /* * crypto support */ @@ -2537,6 +3021,11 @@ static inline bool f2fs_encrypted_inode(struct inode *inode) return file_is_encrypt(inode); } +static inline bool f2fs_encrypted_file(struct inode *inode) +{ + return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode); +} + static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION @@ -2559,6 +3048,21 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +static inline int f2fs_sb_has_extra_attr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); +} + +static inline int f2fs_sb_has_project_quota(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); +} + +static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) @@ -2606,28 +3110,4 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } -#ifndef CONFIG_F2FS_FS_ENCRYPTION -#define fscrypt_set_d_op(i) -#define fscrypt_get_ctx fscrypt_notsupp_get_ctx -#define fscrypt_release_ctx fscrypt_notsupp_release_ctx -#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page -#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page -#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages 
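
The three f2fs_sb_has_*() helpers added above all reduce to the same pattern: test one bit of the little-endian feature word in the on-disk superblock, the same word that f2fs_ioc_get_features() later in this series hands to userspace. A minimal sketch of that pattern, as an illustration only (the real helpers go through the F2FS_HAS_FEATURE() macro, and sb_has_feature() is a made-up name):

	/* illustrative decomposition of an f2fs feature-bit check */
	static inline bool sb_has_feature(struct f2fs_sb_info *sbi, u32 mask)
	{
		/* ->feature is __le32 on disk; convert before masking */
		u32 features = le32_to_cpu(sbi->raw_super->feature);

		return (features & mask) != 0;
	}

With that, a helper such as f2fs_sb_has_inode_chksum() is just this check against F2FS_FEATURE_INODE_CHKSUM.
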
-#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page -#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page -#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_process_policy fscrypt_notsupp_process_policy -#define fscrypt_get_policy fscrypt_notsupp_get_policy -#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context -#define fscrypt_inherit_context fscrypt_notsupp_inherit_context -#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info -#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info -#define fscrypt_setup_filename fscrypt_notsupp_setup_filename -#define fscrypt_free_filename fscrypt_notsupp_free_filename -#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size -#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer -#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer -#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr -#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk -#endif #endif diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h deleted file mode 100644 index f113f1a1e8c1..000000000000 --- a/fs/f2fs/f2fs_crypto.h +++ /dev/null @@ -1,150 +0,0 @@ -/* - * linux/fs/f2fs/f2fs_crypto.h - * - * Copied from linux/fs/ext4/ext4_crypto.h - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption header content for f2fs - * - * Written by Michael Halcrow, 2015. - * Modified by Jaegeuk Kim, 2015. - */ -#ifndef _F2FS_CRYPTO_H -#define _F2FS_CRYPTO_H - -#include - -#define F2FS_KEY_DESCRIPTOR_SIZE 8 - -/* Policy provided via an ioctl on the topmost directory */ -struct f2fs_encryption_policy { - char version; - char contents_encryption_mode; - char filenames_encryption_mode; - char flags; - char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE]; -} __attribute__((__packed__)); - -#define F2FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 -#define F2FS_KEY_DERIVATION_NONCE_SIZE 16 - -#define F2FS_POLICY_FLAGS_PAD_4 0x00 -#define F2FS_POLICY_FLAGS_PAD_8 0x01 -#define F2FS_POLICY_FLAGS_PAD_16 0x02 -#define F2FS_POLICY_FLAGS_PAD_32 0x03 -#define F2FS_POLICY_FLAGS_PAD_MASK 0x03 -#define F2FS_POLICY_FLAGS_VALID 0x03 - -/** - * Encryption context for inode - * - * Protector format: - * 1 byte: Protector format (1 = this version) - * 1 byte: File contents encryption mode - * 1 byte: File names encryption mode - * 1 byte: Flags - * 8 bytes: Master Key descriptor - * 16 bytes: Encryption Key derivation nonce - */ -struct f2fs_encryption_context { - char format; - char contents_encryption_mode; - char filenames_encryption_mode; - char flags; - char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE]; - char nonce[F2FS_KEY_DERIVATION_NONCE_SIZE]; -} __attribute__((__packed__)); - -/* Encryption parameters */ -#define F2FS_XTS_TWEAK_SIZE 16 -#define F2FS_AES_128_ECB_KEY_SIZE 16 -#define F2FS_AES_256_GCM_KEY_SIZE 32 -#define F2FS_AES_256_CBC_KEY_SIZE 32 -#define F2FS_AES_256_CTS_KEY_SIZE 32 -#define F2FS_AES_256_XTS_KEY_SIZE 64 -#define F2FS_MAX_KEY_SIZE 64 - -#define F2FS_KEY_DESC_PREFIX "f2fs:" -#define F2FS_KEY_DESC_PREFIX_SIZE 5 - -struct f2fs_encryption_key { - __u32 mode; - char raw[F2FS_MAX_KEY_SIZE]; - __u32 size; -} __attribute__((__packed__)); - -struct f2fs_crypt_info { - char ci_data_mode; - char ci_filename_mode; - char ci_flags; - struct crypto_ablkcipher *ci_ctfm; - char ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE]; -}; - -#define F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 -#define 
F2FS_WRITE_PATH_FL 0x00000002 - -struct f2fs_crypto_ctx { - union { - struct { - struct page *bounce_page; /* Ciphertext page */ - struct page *control_page; /* Original page */ - } w; - struct { - struct bio *bio; - struct work_struct work; - } r; - struct list_head free_list; /* Free list */ - }; - char flags; /* Flags */ -}; - -struct f2fs_completion_result { - struct completion completion; - int res; -}; - -#define DECLARE_F2FS_COMPLETION_RESULT(ecr) \ - struct f2fs_completion_result ecr = { \ - COMPLETION_INITIALIZER((ecr).completion), 0 } - -static inline int f2fs_encryption_key_size(int mode) -{ - switch (mode) { - case F2FS_ENCRYPTION_MODE_AES_256_XTS: - return F2FS_AES_256_XTS_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_GCM: - return F2FS_AES_256_GCM_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_CBC: - return F2FS_AES_256_CBC_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_CTS: - return F2FS_AES_256_CTS_KEY_SIZE; - default: - BUG(); - } - return 0; -} - -#define F2FS_FNAME_NUM_SCATTER_ENTRIES 4 -#define F2FS_CRYPTO_BLOCK_SIZE 16 -#define F2FS_FNAME_CRYPTO_DIGEST_SIZE 32 - -/** - * For encrypted symlinks, the ciphertext length is stored at the beginning - * of the string in little-endian format. - */ -struct f2fs_encrypted_symlink_data { - __le16 len; - char encrypted_path[1]; -} __attribute__((__packed__)); - -/** - * This function is used to calculate the disk space required to - * store a filename of length l in encrypted symlink format. - */ -static inline u32 encrypted_symlink_data_len(u32 l) -{ - return (l + sizeof(struct f2fs_encrypted_symlink_data) - 1); -} -#endif /* _F2FS_CRYPTO_H */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e4e5d76d80b0..531379f513fa 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -33,6 +34,19 @@ #include "trace.h" #include +static int f2fs_filemap_fault(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int err; + + down_read(&F2FS_I(inode)->i_mmap_sem); + err = filemap_fault(vma, vmf); + up_read(&F2FS_I(inode)->i_mmap_sem); + + return err; +} + static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -60,13 +74,14 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_balance_fs(sbi, dn.node_changed); file_update_time(vma->vm_file); + down_read(&F2FS_I(inode)->i_mmap_sem); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || !PageUptodate(page))) { unlock_page(page); err = -EFAULT; - goto out; + goto out_sem; } /* @@ -86,15 +101,19 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, if (!PageUptodate(page)) SetPageUptodate(page); + f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + if (f2fs_encrypted_file(inode)) + f2fs_wait_on_block_writeback(sbi, dn.data_blkaddr); +out_sem: + up_read(&F2FS_I(inode)->i_mmap_sem); out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); @@ -102,7 +121,7 @@ out: } static const struct vm_operations_struct f2fs_file_vm_ops = { - .fault = filemap_fault, + .fault = f2fs_filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, }; @@ -117,11 +136,6 @@ 
static int get_parent_ino(struct inode *inode, nid_t *pino)
 	if (!dentry)
 		return 0;

-	if (update_dent_inode(inode, inode, &dentry->d_name)) {
-		dput(dentry);
-		return 0;
-	}
-
 	*pino = parent_ino(dentry);
 	dput(dentry);
 	return 1;
@@ -142,8 +156,6 @@ static inline bool need_do_checkpoint(struct inode *inode)
 		need_cp = true;
 	else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
 		need_cp = true;
-	else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
-		need_cp = true;
 	else if (test_opt(sbi, FASTBOOT))
 		need_cp = true;
 	else if (sbi->active_logs == 2)
@@ -169,7 +181,6 @@ static void try_to_fix_pino(struct inode *inode)
 	nid_t pino;

 	down_write(&fi->i_sem);
-	fi->xattr_ver = 0;
 	if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
 			get_parent_ino(inode, &pino)) {
 		f2fs_i_pino_write(inode, pino);
@@ -268,9 +279,19 @@ sync_nodes:
 		goto sync_nodes;
 	}

-	ret = wait_on_node_pages_writeback(sbi, ino);
-	if (ret)
-		goto out;
+	/*
+	 * If it's an atomic write, it is fine to keep the write ordering; we
+	 * don't need to wait for node write completion here, since the node
+	 * chain we use already serializes the node blocks. If one of the node
+	 * writes is reordered, we simply see a broken chain and roll-forward
+	 * recovery stops there, which means we recover either all or none of
+	 * the node blocks covered by the fsync mark.
+	 */
+	if (!atomic) {
+		ret = wait_on_node_pages_writeback(sbi, ino);
+		if (ret)
+			goto out;
+	}

 	/* once recovery info is written, don't need to track this */
 	remove_ino_entry(sbi, ino, APPEND_INO);
@@ -278,7 +299,8 @@ sync_nodes:
flush_out:
 	remove_ino_entry(sbi, ino, UPDATE_INO);
 	clear_inode_flag(inode, FI_UPDATE_WRITE);
-	ret = f2fs_issue_flush(sbi);
+	if (!atomic)
+		ret = f2fs_issue_flush(sbi);
 	f2fs_update_time(sbi, REQ_TIME);
out:
 	trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
@@ -375,7 +397,8 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
 				dn.ofs_in_node++, pgofs++,
 				data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
 			block_t blkaddr;
-			blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+			blkaddr = datablock_addr(dn.inode,
+					dn.node_page, dn.ofs_in_node);

 			if (__found_offset(blkaddr, dirty, pgofs, whence)) {
 				f2fs_put_dnode(&dn);
@@ -423,14 +446,6 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	struct inode *inode = file_inode(file);
 	int err;

-	if (f2fs_encrypted_inode(inode)) {
-		err = fscrypt_get_encryption_info(inode);
-		if (err)
-			return 0;
-		if (!f2fs_encrypted_inode(inode))
-			return -ENOKEY;
-	}
-
 	/* we don't need to use inline_data strictly */
 	err = f2fs_convert_inline_inode(inode);
 	if (err)
@@ -443,11 +458,10 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)

 static int f2fs_file_open(struct inode *inode, struct file *filp)
 {
-	int ret = generic_file_open(inode, filp);
 	struct dentry *dir;

-	if (!ret && f2fs_encrypted_inode(inode)) {
-		ret = fscrypt_get_encryption_info(inode);
+	if (f2fs_encrypted_inode(inode)) {
+		int ret = fscrypt_get_encryption_info(inode);
 		if (ret)
 			return -EACCES;
 		if (!fscrypt_has_encryption_key(inode))
@@ -460,7 +474,7 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
 			return -EPERM;
 	}
 	dput(dir);
-	return ret;
+	return dquot_file_open(inode, filp);
 }

 int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
@@ -469,9 +483,13 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 	struct f2fs_node *raw_node;
 	int nr_free = 0, ofs = dn->ofs_in_node, len = count;
 	__le32 *addr;
+	int base = 0;
+
+	if (IS_INODE(dn->node_page) &&
f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); raw_node = F2FS_NODE(dn->node_page); - addr = blkaddr_in_node(raw_node) + ofs; + addr = blkaddr_in_node(raw_node) + base + ofs; for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); @@ -531,12 +549,14 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - return 0; + return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); truncate_out: f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, offset, PAGE_SIZE - offset); - if (!cache_only || !f2fs_encrypted_inode(inode) || - !S_ISREG(inode->i_mode)) + + /* An encrypted inode should have a key and truncate the last page. */ + f2fs_bug_on(F2FS_I_SB(inode), cache_only && f2fs_encrypted_inode(inode)); + if (!cache_only) set_page_dirty(page); f2fs_put_page(page, 1); return 0; @@ -569,10 +589,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } if (f2fs_has_inline_data(inode)) { - if (truncate_inline_inode(ipage, from)) - set_page_dirty(ipage); - if (from == 0) - clear_inode_flag(inode, FI_DATA_EXIST); + truncate_inline_inode(inode, ipage, from); f2fs_put_page(ipage, 1); truncate_page = true; goto out; @@ -621,6 +638,12 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { + f2fs_show_injection_info(FAULT_TRUNCATE); + return -EIO; + } +#endif /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); @@ -642,7 +665,6 @@ int f2fs_getattr(struct vfsmount *mnt, { struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); - stat->blocks <<= 3; return 0; } @@ -686,14 +708,34 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; + if (is_quota_modification(inode, attr)) { + err = dquot_initialize(inode); + if (err) + return err; + } + if ((attr->ia_valid & ATTR_UID && + !uid_eq(attr->ia_uid, inode->i_uid)) || + (attr->ia_valid & ATTR_GID && + !gid_eq(attr->ia_gid, inode->i_gid))) { + err = dquot_transfer(inode, attr); + if (err) + return err; + } + if (attr->ia_valid & ATTR_SIZE) { - if (f2fs_encrypted_inode(inode) && - fscrypt_get_encryption_info(inode)) - return -EACCES; + if (f2fs_encrypted_inode(inode)) { + err = fscrypt_get_encryption_info(inode); + if (err) + return err; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } if (attr->ia_size <= i_size_read(inode)) { + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); err = f2fs_truncate(inode); + up_write(&F2FS_I(inode)->i_mmap_sem); if (err) return err; } else { @@ -701,7 +743,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) * do not trim all blocks after i_size if target size is * larger than i_size. 
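	 * (The down_write() of F2FS_I(inode)->i_mmap_sem added around
	 * truncate_setsize() here pairs with the down_read() taken in
	 * f2fs_filemap_fault() earlier in this patch: while the truncation
	 * is in flight, a concurrent page fault cannot map pages beyond
	 * the size being set.)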
*/ + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); + up_write(&F2FS_I(inode)->i_mmap_sem); /* should convert inline inode here */ if (!f2fs_may_inline_data(inode)) { @@ -847,12 +891,14 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_inode_pages_range(mapping, blk_start, blk_end - 1); f2fs_lock_op(sbi); ret = truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); } } @@ -883,7 +929,8 @@ next_dnode: done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { - *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + *blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (!is_checkpointed_data(sbi, *blkaddr)) { if (test_opt(sbi, LFS)) { @@ -959,15 +1006,15 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); do { - dn.data_blkaddr = datablock_addr(dn.node_page, - dn.ofs_in_node); + dn.data_blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); truncate_data_blocks_range(&dn, 1); if (do_replace[i]) { f2fs_i_blocks_write(src_inode, - 1, false); + 1, false, false); f2fs_i_blocks_write(dst_inode, - 1, true); + 1, true, false); f2fs_replace_block(sbi, &dn, dn.data_blkaddr, blkaddr[i], ni.version, true, false); @@ -1019,11 +1066,11 @@ static int __exchange_data_block(struct inode *src_inode, while (len) { olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); - src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + src_blkaddr = kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); if (!src_blkaddr) return -ENOMEM; - do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + do_replace = kvzalloc(sizeof(int) * olen, GFP_KERNEL); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; @@ -1091,16 +1138,17 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - return ret; + goto out; truncate_pagecache(inode, offset); ret = f2fs_do_collapse(inode, pg_start, pg_end); if (ret) - return ret; + goto out; /* write out all moved pages, if possible */ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -1113,6 +1161,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); +out: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1126,7 +1176,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, int ret; for (; index < end; index++, dn->ofs_in_node++) { - if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + if (datablock_addr(dn->inode, dn->node_page, + dn->ofs_in_node) == NULL_ADDR) count++; } @@ -1137,8 +1188,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, dn->ofs_in_node = ofs_in_node; for (index = start; index < end; index++, dn->ofs_in_node++) { - dn->data_blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); /* * reserve_new_blocks will not 
guarantee entire block
 		 * allocation.
@@ -1177,9 +1228,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
 	if (ret)
 		return ret;

+	down_write(&F2FS_I(inode)->i_mmap_sem);
 	ret = filemap_write_and_wait_range(mapping, offset,
 						offset + len - 1);
 	if (ret)
-		return ret;
+		goto out_sem;

 	truncate_pagecache_range(inode, offset, offset + len - 1);
@@ -1193,17 +1245,15 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
 		ret = fill_zero(inode, pg_start, off_start,
 						off_end - off_start);
 		if (ret)
-			return ret;
+			goto out_sem;

-		if (offset + len > new_size)
-			new_size = offset + len;
 		new_size = max_t(loff_t, new_size, offset + len);
 	} else {
 		if (off_start) {
 			ret = fill_zero(inode, pg_start++, off_start,
 						PAGE_SIZE - off_start);
 			if (ret)
-				return ret;
+				goto out_sem;

 			new_size = max_t(loff_t, new_size,
 					(loff_t)pg_start << PAGE_SHIFT);
@@ -1252,6 +1302,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
out:
 	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 			i_size_read(inode) < new_size)
 		f2fs_i_size_write(inode, new_size);
+out_sem:
+	up_write(&F2FS_I(inode)->i_mmap_sem);
 	return ret;
 }
@@ -1264,8 +1316,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	int ret = 0;

 	new_size = i_size_read(inode) + len;
-	if (new_size > inode->i_sb->s_maxbytes)
-		return -EFBIG;
+	ret = inode_newsize_ok(inode, new_size);
+	if (ret)
+		return ret;

 	if (offset >= i_size_read(inode))
 		return -EINVAL;
@@ -1280,14 +1333,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	f2fs_balance_fs(sbi, true);

+	down_write(&F2FS_I(inode)->i_mmap_sem);
 	ret = truncate_blocks(inode, i_size_read(inode), true);
 	if (ret)
-		return ret;
+		goto out;

 	/* write out all dirty pages from offset */
 	ret = filemap_write_and_wait_range(inode->i_mapping,
 						offset, LLONG_MAX);
 	if (ret)
-		return ret;
+		goto out;

 	truncate_pagecache(inode, offset);
@@ -1316,6 +1370,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	if (!ret)
 		f2fs_i_size_write(inode, new_size);

+out:
+	up_write(&F2FS_I(inode)->i_mmap_sem);
 	return ret;
 }
@@ -1435,6 +1491,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp)
 		drop_inmem_pages(inode);
 	if (f2fs_is_volatile_file(inode)) {
 		clear_inode_flag(inode, FI_VOLATILE_FILE);
+		stat_dec_volatile_write(inode);
 		set_inode_flag(inode, FI_DROP_CACHE);
 		filemap_fdatawrite(inode->i_mapping);
 		clear_inode_flag(inode, FI_DROP_CACHE);
@@ -1442,17 +1499,20 @@
 	return 0;
 }

-#define F2FS_REG_FLMASK		(~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
-#define F2FS_OTHER_FLMASK	(FS_NODUMP_FL | FS_NOATIME_FL)
-
-static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
+static int f2fs_file_flush(struct file *file, fl_owner_t id)
 {
-	if (S_ISDIR(mode))
-		return flags;
-	else if (S_ISREG(mode))
-		return flags & F2FS_REG_FLMASK;
-	else
-		return flags & F2FS_OTHER_FLMASK;
+	struct inode *inode = file_inode(file);
+
+	/*
+	 * If a process crashes in the middle of a transaction, we should
+	 * roll back; otherwise other readers/writers could see a corrupted
+	 * database until every writer has closed the file. Since this must
+	 * happen before the file lock is dropped, it has to be done in ->flush.
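+	 * Only the task that opened the transaction performs the roll-back
+	 * below (note the F2FS_I(inode)->inmem_task == current check): any
+	 * other process sharing the struct file and reaching ->flush must
+	 * not drop the writer's in-memory pages.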
+ */ + if (f2fs_is_atomic_file(inode) && + F2FS_I(inode)->inmem_task == current) + drop_inmem_pages(inode); + return 0; } static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) @@ -1481,28 +1541,34 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) if (ret) return ret; - flags = f2fs_mask_flags(inode->i_mode, flags); - inode_lock(inode); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + ret = -EPERM; + goto unlock_out; + } + + flags = f2fs_mask_flags(inode->i_mode, flags); + oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - inode_unlock(inode); ret = -EPERM; - goto out; + goto unlock_out; } } flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - inode_unlock(inode); inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); -out: + f2fs_mark_inode_dirty_sync(inode, false); +unlock_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } @@ -1522,6 +1588,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -1536,20 +1605,27 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; set_inode_flag(inode, FI_ATOMIC_FILE); + set_inode_flag(inode, FI_HOT_DATA); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (!get_dirty_pages(inode)) - goto out; + goto inc_stat; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) + if (ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); -out: + clear_inode_flag(inode, FI_HOT_DATA); + goto out; + } + +inc_stat: + F2FS_I(inode)->inmem_task = current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); +out: inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1580,10 +1656,11 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } } else { - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: inode_unlock(inode); @@ -1599,6 +1676,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -1612,6 +1692,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (ret) goto out; + stat_inc_volatile_write(inode); + stat_update_max_volatile_write(inode); + set_inode_flag(inode, FI_VOLATILE_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); out: @@ -1667,6 +1750,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } @@ -1712,7 +1796,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); 
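/*
 * Illustration only (not part of the patch): the userspace side of the
 * atomic-write ioctls handled above, in the SQLite-style sequence they
 * are written for.  The ioctl numbers come from the f2fs uapi header.
 *
 *	int fd = open("file.db", O_RDWR);
 *
 *	ioctl(fd, F2FS_IOC_START_ATOMIC_WRITE);
 *	write(fd, buf, len);	   (data is staged as in-memory pages)
 *	ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE);   (all-or-nothing, durable)
 *
 * If the writer dies between start and commit, f2fs_file_flush() above
 * drops the staged pages as the dying task's files are torn down,
 * rolling the transaction back.
 */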
f2fs_stop_checkpoint(sbi, false); break; default: @@ -1773,31 +1857,16 @@ static bool uuid_is_nonzero(__u8 u[16]) static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, - sizeof(policy))) - return -EFAULT; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return fscrypt_process_policy(filp, &policy); + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; - struct inode *inode = file_inode(filp); - int err; - - err = fscrypt_get_policy(inode, &policy); - if (err) - return err; - - if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy))) - return -EFAULT; - return 0; + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) @@ -1863,7 +1932,51 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync, true); + ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); +out: + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_range range; + u64 end; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg, + sizeof(range))) + return -EFAULT; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + end = range.start + range.len; + if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) + return -EINVAL; +do_more: + if (!range.sync) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + } else { + mutex_lock(&sbi->gc_mutex); + } + + ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start)); + range.start += sbi->blocks_per_seg; + if (range.start <= end) + goto do_more; out: mnt_drop_write_file(filp); return ret; @@ -1897,17 +2010,16 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, { struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; - struct extent_info ei; + struct extent_info ei = {0,0,0}; pgoff_t pg_start, pg_end; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; - unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; block_t blk_end = 0; bool fragmented = false; int err; /* if in-place-update policy is enabled, don't waste time here */ - if (need_inplace_update(inode)) + if (need_inplace_update_policy(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; @@ -1941,7 +2053,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, */ while (map.m_lblk < pg_end) { map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto out; @@ -1965,7 +2077,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, map.m_lblk = pg_start; map.m_len = pg_end - pg_start; - sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + sec_num = (map.m_len + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi); /* * make sure there are enough free section for LFS allocation, this can @@ -1983,7 +2095,7 @@ static int 
f2fs_defragment_range(struct f2fs_sb_info *sbi, do_map: map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto clear_out; @@ -2042,42 +2154,40 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode)) + return -EINVAL; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) + return -EFAULT; + + /* verify alignment of offset & size */ + if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + if (unlikely((range.start + range.len) >> PAGE_SHIFT > + sbi->max_file_blocks)) return -EINVAL; err = mnt_want_write_file(filp); if (err) return err; - if (f2fs_readonly(sbi->sb)) { - err = -EROFS; - goto out; - } - - if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, - sizeof(range))) { - err = -EFAULT; - goto out; - } - - /* verify alignment of offset & size */ - if (range.start & (F2FS_BLKSIZE - 1) || - range.len & (F2FS_BLKSIZE - 1)) { - err = -EINVAL; - goto out; - } - err = f2fs_defragment_range(sbi, filp, &range); + mnt_drop_write_file(filp); + f2fs_update_time(sbi, REQ_TIME); if (err < 0) - goto out; + return err; if (copy_to_user((struct f2fs_defragment __user *)arg, &range, sizeof(range))) - err = -EFAULT; -out: - mnt_drop_write_file(filp); - return err; + return -EFAULT; + + return 0; } static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, @@ -2211,6 +2321,8 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) range.pos_out, range.len); mnt_drop_write_file(filp); + if (err) + goto err_out; if (copy_to_user((struct f2fs_move_range __user *)arg, &range, sizeof(range))) @@ -2220,6 +2332,79 @@ err_out: return err; } +static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct sit_info *sm = SIT_I(sbi); + unsigned int start_segno = 0, end_segno = 0; + unsigned int dev_start_segno = 0, dev_end_segno = 0; + struct f2fs_flush_device range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg, + sizeof(range))) + return -EFAULT; + + if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num || + sbi->segs_per_sec != 1) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Can't flush %u in %d for segs_per_sec %u != 1\n", + range.dev_num, sbi->s_ndevs, + sbi->segs_per_sec); + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (range.dev_num != 0) + dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk); + dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk); + + start_segno = sm->last_victim[FLUSH_DEVICE]; + if (start_segno < dev_start_segno || start_segno >= dev_end_segno) + start_segno = dev_start_segno; + end_segno = min(start_segno + range.segments, dev_end_segno); + + while (start_segno < end_segno) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + sm->last_victim[GC_CB] = end_segno + 1; + sm->last_victim[GC_GREEDY] = end_segno + 1; + sm->last_victim[ALLOC_NEXT] = end_segno + 1; + ret = f2fs_gc(sbi, true, true, start_segno); + if (ret == -EAGAIN) + ret = 0; + else if (ret 
< 0) + break; + start_segno++; + } +out: + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + u32 sb_feature = le32_to_cpu(F2FS_I_SB(inode)->raw_super->feature); + + /* Must validate to set it with SQLite behavior in Android. */ + sb_feature |= F2FS_FEATURE_ATOMIC_WRITE; + + return put_user(sb_feature, (u32 __user *)arg); +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -2251,12 +2436,18 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_encryption_pwsalt(filp, arg); case F2FS_IOC_GARBAGE_COLLECT: return f2fs_ioc_gc(filp, arg); + case F2FS_IOC_GARBAGE_COLLECT_RANGE: + return f2fs_ioc_gc_range(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: return f2fs_ioc_write_checkpoint(filp, arg); case F2FS_IOC_DEFRAGMENT: return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: return f2fs_ioc_move_range(filp, arg); + case F2FS_IOC_FLUSH_DEVICE: + return f2fs_ioc_flush_device(filp, arg); + case F2FS_IOC_GET_FEATURES: + return f2fs_ioc_get_features(filp, arg); default: return -ENOTTY; } @@ -2269,16 +2460,15 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct blk_plug plug; ssize_t ret; - if (f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode) && - fscrypt_get_encryption_info(inode)) - return -EACCES; - inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { - int err = f2fs_preallocate_blocks(iocb, from); + int err; + if (iov_iter_fault_in_readable(from, iov_iter_count(from))) + set_inode_flag(inode, FI_NO_PREALLOC); + + err = f2fs_preallocate_blocks(iocb, from); if (err) { inode_unlock(inode); return err; @@ -2286,6 +2476,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); blk_finish_plug(&plug); + clear_inode_flag(inode, FI_NO_PREALLOC); + + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } inode_unlock(inode); @@ -2322,10 +2516,12 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GET_ENCRYPTION_PWSALT: case F2FS_IOC_GET_ENCRYPTION_POLICY: case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_GARBAGE_COLLECT_RANGE: case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: - break; case F2FS_IOC_MOVE_RANGE: + case F2FS_IOC_FLUSH_DEVICE: + case F2FS_IOC_GET_FEATURES: break; default: return -ENOIOCTLCMD; @@ -2341,6 +2537,7 @@ const struct file_operations f2fs_file_operations = { .open = f2fs_file_open, .release = f2fs_release_file, .mmap = f2fs_file_mmap, + .flush = f2fs_file_flush, .fsync = f2fs_sync_file, .fallocate = f2fs_fallocate, .unlocked_ioctl = f2fs_ioctl, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 7f0c3e02408c..bd16e6631cf3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -28,17 +28,23 @@ static int gc_thread_func(void *data) struct f2fs_sb_info *sbi = data; struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; - long wait_ms; + unsigned int wait_ms; wait_ms = gc_th->min_sleep_time; + set_freezable(); do { + wait_event_interruptible_timeout(*wq, + kthread_should_stop() || freezing(current) || + gc_th->gc_wake, + msecs_to_jiffies(wait_ms)); + + /* give it a try one time */ + if (gc_th->gc_wake) + gc_th->gc_wake = 0; + if (try_to_freeze()) continue; - else - wait_event_interruptible_timeout(*wq, - 
kthread_should_stop(), - msecs_to_jiffies(wait_ms)); if (kthread_should_stop()) break; @@ -48,10 +54,15 @@ static int gc_thread_func(void *data) } #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_CHECKPOINT)) + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); + } #endif + if (!sb_start_write_trylock(sbi->sb)) + continue; + /* * [GC triggering condition] * 0. GC is not conducted currently. @@ -66,23 +77,28 @@ static int gc_thread_func(void *data) * So, I'd like to wait some time to collect dirty segments. */ if (!mutex_trylock(&sbi->gc_mutex)) - continue; + goto next; + + if (gc_th->gc_urgent) { + wait_ms = gc_th->urgent_sleep_time; + goto do_gc; + } if (!is_idle(sbi)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); - continue; + goto next; } if (has_enough_invalid_blocks(sbi)) decrease_sleep_time(gc_th, &wait_ms); else increase_sleep_time(gc_th, &wait_ms); - +do_gc: stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -90,6 +106,8 @@ static int gc_thread_func(void *data) /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi); +next: + sb_end_write(sbi->sb); } while (!kthread_should_stop()); return 0; @@ -107,11 +125,14 @@ int start_gc_thread(struct f2fs_sb_info *sbi) goto out; } + gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; gc_th->gc_idle = 0; + gc_th->gc_urgent = 0; + gc_th->gc_wake= 0; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); @@ -166,10 +187,15 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->ofs_unit = sbi->segs_per_sec; } - if (p->max_search > sbi->max_victim_search) + /* we need to check every dirty segments in the FG_GC case */ + if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; - p->offset = sbi->last_victim[p->gc_mode]; + /* let's select beginning hot/small space first */ + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + p->offset = 0; + else + p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; } static unsigned int get_max_cost(struct f2fs_sb_info *sbi, @@ -179,7 +205,7 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, if (p->alloc_mode == SSR) return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return sbi->blocks_per_seg * p->ofs_unit; + return 2 * sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ @@ -199,8 +225,12 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { if (sec_usage_check(sbi, secno)) continue; + + if (no_fggc_candidate(sbi, secno)) + continue; + clear_bit(secno, dirty_i->victim_secmap); - return secno * sbi->segs_per_sec; + return GET_SEG_FROM_SEC(sbi, secno); } return NULL_SEGNO; } @@ -208,8 +238,8 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int secno = GET_SECNO(sbi, segno); - unsigned int start = secno * 
sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start = GET_SEG_FROM_SEC(sbi, secno); unsigned long long mtime = 0; unsigned int vblocks; unsigned char age = 0; @@ -218,7 +248,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) for (i = 0; i < sbi->segs_per_sec; i++) mtime += get_seg_entry(sbi, start + i)->mtime; - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); mtime = div_u64(mtime, sbi->segs_per_sec); vblocks = div_u64(vblocks, sbi->segs_per_sec); @@ -237,6 +267,16 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); } +static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + unsigned int valid_blocks = + get_valid_blocks(sbi, segno, true); + + return IS_DATASEG(get_seg_entry(sbi, segno)->type) ? + valid_blocks * 2 : valid_blocks; +} + static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { @@ -245,7 +285,7 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) - return get_valid_blocks(sbi, segno, sbi->segs_per_sec); + return get_greedy_cost(sbi, segno); else return get_cb_cost(sbi, segno); } @@ -274,6 +314,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned int *result, int gc_type, int type, char alloc_mode) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct sit_info *sm = SIT_I(sbi); struct victim_sel_policy p; unsigned int secno, last_victim; unsigned int last_segment = MAIN_SEGS(sbi); @@ -287,10 +328,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_segno = NULL_SEGNO; p.min_cost = get_max_cost(sbi, &p); + if (*result != NULL_SEGNO) { + if (IS_DATASEG(get_seg_entry(sbi, *result)->type) && + get_valid_blocks(sbi, *result, false) && + !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) + p.min_segno = *result; + goto out; + } + if (p.max_search == 0) goto out; - last_victim = sbi->last_victim[p.gc_mode]; + last_victim = sm->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -303,9 +352,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, segno = find_next_bit(p.dirty_segmap, last_segment, p.offset); if (segno >= last_segment) { - if (sbi->last_victim[p.gc_mode]) { - last_segment = sbi->last_victim[p.gc_mode]; - sbi->last_victim[p.gc_mode] = 0; + if (sm->last_victim[p.gc_mode]) { + last_segment = + sm->last_victim[p.gc_mode]; + sm->last_victim[p.gc_mode] = 0; p.offset = 0; continue; } @@ -322,13 +372,15 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, nsearched++; } - - secno = GET_SECNO(sbi, segno); + secno = GET_SEC_FROM_SEG(sbi, segno); if (sec_usage_check(sbi, secno)) goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (gc_type == FG_GC && p.alloc_mode == LFS && + no_fggc_candidate(sbi, secno)) + goto next; cost = get_gc_cost(sbi, segno, &p); @@ -338,17 +390,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } next: if (nsearched >= p.max_search) { - if (!sbi->last_victim[p.gc_mode] && segno <= last_victim) - sbi->last_victim[p.gc_mode] = last_victim + 1; + if (!sm->last_victim[p.gc_mode] && segno <= last_victim) + sm->last_victim[p.gc_mode] = last_victim + 1; else - 
sbi->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi); break; } } if (p.min_segno != NULL_SEGNO) { got_it: if (p.alloc_mode == LFS) { - secno = GET_SECNO(sbi, p.min_segno); + secno = GET_SEC_FROM_SEG(sbi, p.min_segno); if (gc_type == FG_GC) sbi->cur_victim_sec = secno; else @@ -531,12 +584,14 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, get_node_info(sbi, nid, dni); if (sum->version != dni->version) { - f2fs_put_page(node_page, 1); - return false; + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: valid data with mismatched node version.", + __func__); + set_sbi_flag(sbi, SBI_NEED_FSCK); } *nofs = ofs_of_node(node_page); - source_blkaddr = datablock_addr(node_page, ofs_in_node); + source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) @@ -544,15 +599,21 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx, - unsigned int segno, int off) +/* + * Move data block via META_MAPPING while keeping locked data page. + * This can be used to move blocks, aka LBAs, directly on disk. + */ +static void move_data_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, + .temp = COLD, .op = REQ_OP_READ, .op_flags = REQ_SYNC, .encrypted_page = NULL, + .in_list = false, }; struct dnode_of_data dn; struct f2fs_summary sum; @@ -596,7 +657,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, - &sum, CURSEG_COLD_DATA); + &sum, CURSEG_COLD_DATA, NULL, false); fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); @@ -634,7 +695,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC | REQ_NOIDLE; fio.new_blkaddr = newaddr; - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_write(&fio); + + f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); @@ -676,10 +739,14 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, + .temp = COLD, .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_NOIDLE, + .op_flags = REQ_SYNC, + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, + .need_lock = LOCK_REQ, + .io_type = FS_GC_DATA_IO, }; bool is_dirty = PageDirty(page); int err; @@ -768,8 +835,7 @@ next_step: continue; /* if encrypted inode, let's go phase 3 */ - if (f2fs_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { + if (f2fs_encrypted_file(inode)) { add_gc_inode(gc_list, inode); continue; } @@ -803,14 +869,18 @@ next_step: continue; } locked = true; + + /* wait for all inflight aio data */ + inode_dio_wait(inode); } start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - move_encrypted_block(inode, start_bidx, segno, off); + if (f2fs_encrypted_file(inode)) + move_data_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type, segno, off); + move_data_page(inode, start_bidx, gc_type, + segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); @@ 
-847,7 +917,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct blk_plug plug; unsigned int segno = start_segno; unsigned int end_segno = start_segno + sbi->segs_per_sec; - int sec_freed = 0; + int seg_freed = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; @@ -871,7 +941,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, GET_SUM_BLOCK(sbi, segno)); f2fs_put_page(sum_page, 0); - if (get_valid_blocks(sbi, segno, 1) == 0 || + if (get_valid_blocks(sbi, segno, false) == 0 || !PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) goto next; @@ -886,7 +956,6 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, * - mutex_lock(sentry_lock) - change_curseg() * - lock_page(sum_page) */ - if (type == SUM_TYPE_NODE) gc_node_segment(sbi, sum->entries, segno, gc_type); else @@ -894,90 +963,113 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); stat_inc_seg_count(sbi, type, gc_type); + + if (gc_type == FG_GC && + get_valid_blocks(sbi, segno, false) == 0) + seg_freed++; next: f2fs_put_page(sum_page, 0); } if (gc_type == FG_GC) - f2fs_submit_merged_bio(sbi, - (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE); + f2fs_submit_merged_write(sbi, + (type == SUM_TYPE_NODE) ? NODE : DATA); blk_finish_plug(&plug); - if (gc_type == FG_GC && - get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0) - sec_freed = 1; - stat_inc_call_count(sbi->stat_info); - return sec_freed; + return seg_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, + bool background, unsigned int segno) { - unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; - int sec_freed = 0; - int ret = -EINVAL; + int sec_freed = 0, seg_freed = 0, total_freed = 0; + int ret = 0; struct cp_control cpc; + unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(GFP_NOFS), }; + trace_f2fs_gc_begin(sbi->sb, sync, background, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + cpc.reason = __get_cp_reason(sbi); gc_more: - segno = NULL_SEGNO; - - if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { + ret = -EINVAL; goto stop; + } if (unlikely(f2fs_cp_error(sbi))) { ret = -EIO; goto stop; } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) { - gc_type = FG_GC; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) { /* - * If there is no victim and no prefree segment but still not - * enough free sections, we should flush dent/node blocks and do - * garbage collections. + * For example, if there are many prefree_segments below the given + * threshold, we can make them free by checkpoint. Then, we + * secure free segments which don't need fggc any more. */ - if (__get_victim(sbi, &segno, gc_type) || - prefree_segments(sbi)) { - ret = write_checkpoint(sbi, &cpc); - if (ret) - goto stop; - segno = NULL_SEGNO; - } else if (has_not_enough_free_secs(sbi, 0, 0)) { + if (prefree_segments(sbi)) { ret = write_checkpoint(sbi, &cpc); if (ret) goto stop; } - } else if (gc_type == BG_GC && !background) { - /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + if (has_not_enough_free_secs(sbi, 0, 0)) + gc_type = FG_GC; + } + + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. 
*/ + if (gc_type == BG_GC && !background) { + ret = -EINVAL; + goto stop; + } + if (!__get_victim(sbi, &segno, gc_type)) { + ret = -ENODATA; goto stop; } - if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) - goto stop; - ret = 0; - - if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && - gc_type == FG_GC) + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); + if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) sec_freed++; + total_freed += seg_freed; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed, 0)) + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + segno = NULL_SEGNO; goto gc_more; + } if (gc_type == FG_GC) ret = write_checkpoint(sbi, &cpc); } stop: + SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + + trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); @@ -989,5 +1081,20 @@ stop: void build_gc_manager(struct f2fs_sb_info *sbi) { + u64 main_count, resv_count, ovp_count; + DIRTY_I(sbi)->v_ops = &default_v_ops; + + /* threshold of # of valid blocks in a section for victims of FG_GC */ + main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg; + resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg; + ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; + + sbi->fggc_threshold = div64_u64((main_count - ovp_count) * + BLKS_PER_SEC(sbi), (main_count - resv_count)); + + /* give warm/cold data area from slower device */ + if (sbi->s_ndevs && sbi->segs_per_sec == 1) + SIT_I(sbi)->last_victim[ALLOC_NEXT] = + GET_SEGNO(sbi, FDEV(0).end_blk) + 1; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index a993967dcdb9..9325191fab2d 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -13,6 +13,7 @@ * whether IO subsystem is idle * or not */ +#define DEF_GC_THREAD_URGENT_SLEEP_TIME 500 /* 500 ms */ #define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ #define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 #define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ @@ -27,12 +28,15 @@ struct f2fs_gc_kthread { wait_queue_head_t gc_wait_queue_head; /* for gc sleep time */ + unsigned int urgent_sleep_time; unsigned int min_sleep_time; unsigned int max_sleep_time; unsigned int no_gc_sleep_time; /* for changing gc mode */ unsigned int gc_idle; + unsigned int gc_urgent; + unsigned int gc_wake; }; struct gc_inode_list { @@ -65,25 +69,32 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) } static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + unsigned int max_time = gc_th->max_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) return; - *wait += gc_th->min_sleep_time; - if (*wait > gc_th->max_sleep_time) - *wait = gc_th->max_sleep_time; + if ((long long)*wait + (long long)min_time > (long long)max_time) + *wait = max_time; + else + *wait += min_time; } static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) *wait = gc_th->max_sleep_time; - *wait -= gc_th->min_sleep_time; - if (*wait <= gc_th->min_sleep_time) - *wait 
= gc_th->min_sleep_time; + if ((long long)*wait - (long long)min_time < (long long)min_time) + *wait = min_time; + else + *wait -= min_time; } static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 71b7206c431e..eb2e031ea887 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -70,7 +70,8 @@ static void str2hashbuf(const unsigned char *msg, size_t len, *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname) { __u32 hash; f2fs_hash_t f2fs_hash; @@ -79,6 +80,10 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) const unsigned char *name = name_info->name; size_t len = name_info->len; + /* encrypted bigname case */ + if (fname && !fname->disk_name.name) + return cpu_to_le32(fname->hash); + if (is_dot_dotdot(name_info)) return 0; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index fb5d7d1f34aa..fbf22b0f667f 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -23,10 +23,10 @@ bool f2fs_may_inline_data(struct inode *inode) if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA) + if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) return false; return true; @@ -45,6 +45,7 @@ bool f2fs_may_inline_dentry(struct inode *inode) void read_inline_data(struct page *page, struct page *ipage) { + struct inode *inode = page->mapping->host; void *src_addr, *dst_addr; if (PageUptodate(page)) @@ -52,31 +53,33 @@ void read_inline_data(struct page *page, struct page *ipage) f2fs_bug_on(F2FS_P_SB(page), page->index); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(ipage); + src_addr = inline_data_addr(inode, ipage); dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); flush_dcache_page(page); kunmap_atomic(dst_addr); if (!PageUptodate(page)) SetPageUptodate(page); } -bool truncate_inline_inode(struct page *ipage, u64 from) +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) { void *addr; - if (from >= MAX_INLINE_DATA) - return false; + if (from >= MAX_INLINE_DATA(inode)) + return; - addr = inline_data_addr(ipage); + addr = inline_data_addr(inode, ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); - memset(addr + from, 0, MAX_INLINE_DATA - from); + memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); set_page_dirty(ipage); - return true; + + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); } int f2fs_read_inline_data(struct inode *inode, struct page *page) @@ -132,6 +135,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .page = page, .encrypted_page = NULL, + .io_type = FS_DATA_IO, }; int dirty, err; @@ -153,6 +157,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* write data page to try to make data consistent */ set_page_writeback(page); fio.old_blkaddr = dn->data_blkaddr; + set_inode_flag(dn->inode, FI_HOT_DATA); write_data_page(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); if (dirty) { @@ -164,11 +169,11 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) 
set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ - truncate_inline_inode(dn->inode_page, 0); + truncate_inline_inode(dn->inode, dn->inode_page, 0); clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); - f2fs_clear_inline_inode(dn->inode); + clear_inode_flag(dn->inode, FI_INLINE_DATA); f2fs_put_dnode(dn); return 0; } @@ -215,6 +220,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; struct dnode_of_data dn; + struct address_space *mapping = page_mapping(page); + unsigned long flags; int err; set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -231,11 +238,16 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); src_addr = kmap_atomic(page); - dst_addr = inline_data_addr(dn.inode_page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + dst_addr = inline_data_addr(inode, dn.inode_page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); @@ -270,9 +282,9 @@ process_inline: f2fs_wait_on_page_writeback(ipage, NODE, true); - src_addr = inline_data_addr(npage); - dst_addr = inline_data_addr(ipage); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + src_addr = inline_data_addr(inode, npage); + dst_addr = inline_data_addr(inode, ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); set_inode_flag(inode, FI_INLINE_DATA); set_inode_flag(inode, FI_DATA_EXIST); @@ -285,9 +297,8 @@ process_inline: if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - if (!truncate_inline_inode(ipage, 0)) - return false; - f2fs_clear_inline_inode(inode); + truncate_inline_inode(inode, ipage, 0); + clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { if (truncate_blocks(inode, 0, false)) @@ -301,11 +312,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - struct f2fs_inline_dentry *inline_dentry; struct qstr name = FSTR_TO_QSTR(&fname->disk_name); struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; struct page *ipage; + void *inline_dentry; f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); @@ -314,11 +325,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, return NULL; } - namehash = f2fs_dentry_hash(&name); + namehash = f2fs_dentry_hash(&name, fname); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(dir, ipage); - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(dir, &d, inline_dentry); de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) @@ -332,19 +343,19 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, int make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) { - struct f2fs_inline_dentry *dentry_blk; struct f2fs_dentry_ptr d; + void *inline_dentry; - dentry_blk = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + 
make_dentry_ptr_inline(inode, &d, inline_dentry); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if (i_size_read(inode) < MAX_INLINE_DATA) - f2fs_i_size_write(inode, MAX_INLINE_DATA); + if (i_size_read(inode) < MAX_INLINE_DATA(inode)) + f2fs_i_size_write(inode, MAX_INLINE_DATA(inode)); return 0; } @@ -353,11 +364,12 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * release ipage in this function. */ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { struct page *page; struct dnode_of_data dn; struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr src, dst; int err; page = f2fs_grab_cache_page(dir->i_mapping, 0, false); @@ -372,25 +384,24 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA, true); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); dentry_blk = kmap_atomic(page); + make_dentry_ptr_inline(dir, &src, inline_dentry); + make_dentry_ptr_block(dir, &dst, dentry_blk); + /* copy data from inline dentry block to new dentry block */ - memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap, - INLINE_DENTRY_BITMAP_SIZE); - memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0, - SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE); + memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); + memset(dst.bitmap + src.nr_bitmap, 0, dst.nr_bitmap - src.nr_bitmap); /* * we do not need to zero out remainder part of dentry and filename * field, since we have used bitmap for marking the usage status of * them, besides, we can also ignore copying/zeroing reserved space * of dentry block, because they haven't been used so far. 
*/ - memcpy(dentry_blk->dentry, inline_dentry->dentry, - sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY); - memcpy(dentry_blk->filename, inline_dentry->filename, - NR_INLINE_DENTRY * F2FS_SLOT_LEN); + memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); + memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); kunmap_atomic(dentry_blk); if (!PageUptodate(page)) @@ -398,7 +409,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_page_dirty(page); /* clear inline dir and flag after data writeback */ - truncate_inline_inode(ipage, 0); + truncate_inline_inode(dir, ipage, 0); stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); @@ -411,14 +422,13 @@ out: return err; } -static int f2fs_add_inline_entries(struct inode *dir, - struct f2fs_inline_dentry *inline_dentry) +static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) { struct f2fs_dentry_ptr d; unsigned long bit_pos = 0; int err = 0; - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(dir, &d, inline_dentry); while (bit_pos < d.max) { struct f2fs_dir_entry *de; @@ -460,20 +470,20 @@ punch_dentry_pages: } static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { - struct f2fs_inline_dentry *backup_dentry; + void *backup_dentry; int err; backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), - sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); + MAX_INLINE_DATA(dir), GFP_F2FS_ZERO); if (!backup_dentry) { f2fs_put_page(ipage, 1); return -ENOMEM; } - memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); - truncate_inline_inode(ipage, 0); + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); + truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); @@ -489,9 +499,9 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, return 0; recover: lock_page(ipage); - memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); f2fs_i_depth_write(dir, 0); - f2fs_i_size_write(dir, MAX_INLINE_DATA); + f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); set_page_dirty(ipage); f2fs_put_page(ipage, 1); @@ -500,7 +510,7 @@ recover: } static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { if (!F2FS_I(dir)->i_dir_level) return f2fs_move_inline_dirents(dir, ipage, inline_dentry); @@ -516,7 +526,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - struct f2fs_inline_dentry *dentry_blk = NULL; + void *inline_dentry = NULL; struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; @@ -526,11 +536,12 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, if (IS_ERR(ipage)) return PTR_ERR(ipage); - dentry_blk = inline_data_addr(ipage); - bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, - slots, NR_INLINE_DENTRY); - if (bit_pos >= NR_INLINE_DENTRY) { - err = f2fs_convert_inline_dir(dir, ipage, dentry_blk); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = room_for_filename(d.bitmap, slots, d.max); + if (bit_pos >= d.max) { + err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); if (err) return err; err = -EAGAIN; @@ -545,14 +556,11 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr 
*new_name, err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } f2fs_wait_on_page_writeback(ipage, NODE, true); - name_hash = f2fs_dentry_hash(new_name); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + name_hash = f2fs_dentry_hash(new_name, NULL); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -575,7 +583,8 @@ out: void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode) { - struct f2fs_inline_dentry *inline_dentry; + struct f2fs_dentry_ptr d; + void *inline_dentry; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); unsigned int bit_pos; int i; @@ -583,11 +592,12 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, lock_page(page); f2fs_wait_on_page_writeback(page, NODE, true); - inline_dentry = inline_data_addr(page); - bit_pos = dentry - inline_dentry->dentry; + inline_dentry = inline_data_addr(dir, page); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = dentry - d.dentry; for (i = 0; i < slots; i++) - __clear_bit_le(bit_pos + i, - &inline_dentry->dentry_bitmap); + __clear_bit_le(bit_pos + i, d.bitmap); set_page_dirty(page); f2fs_put_page(page, 1); @@ -604,20 +614,21 @@ bool f2fs_empty_inline_dir(struct inode *dir) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos = 2; - struct f2fs_inline_dentry *dentry_blk; + void *inline_dentry; + struct f2fs_dentry_ptr d; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; - dentry_blk = inline_data_addr(ipage); - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_INLINE_DENTRY, - bit_pos); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos); f2fs_put_page(ipage, 1); - if (bit_pos < NR_INLINE_DENTRY) + if (bit_pos < d.max) return false; return true; @@ -627,25 +638,27 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr) { struct inode *inode = file_inode(file); - struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + void *inline_dentry = NULL; int err; - if (ctx->pos == NR_INLINE_DENTRY) + make_dentry_ptr_inline(inode, &d, inline_dentry); + + if (ctx->pos == d.max) return 0; ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); - make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(inode, &d, inline_dentry); err = f2fs_fill_dentries(ctx, &d, 0, fstr); if (!err) - ctx->pos = NR_INLINE_DENTRY; + ctx->pos = d.max; f2fs_put_page(ipage, 1); return err < 0 ? 
err : 0; @@ -670,7 +683,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, goto out; } - ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode)); + ilen = min_t(size_t, MAX_INLINE_DATA(inode), i_size_read(inode)); if (start >= ilen) goto out; if (start + len < ilen) @@ -679,7 +692,8 @@ int f2fs_inline_data_fiemap(struct inode *inode, get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; - byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage); + byteaddr += (char *)inline_data_addr(inode, ipage) - + (char *)F2FS_INODE(ipage); err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); out: f2fs_put_page(ipage, 1); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index af06bda51a54..50c88e37ed66 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -16,6 +16,7 @@ #include "f2fs.h" #include "node.h" +#include "segment.h" #include <trace/events/f2fs.h> @@ -44,25 +45,26 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - f2fs_mark_inode_dirty_sync(inode, false); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { - if (ri->i_addr[0]) - inode->i_rdev = - old_decode_dev(le32_to_cpu(ri->i_addr[0])); + if (ri->i_addr[extra_size]) + inode->i_rdev = old_decode_dev( + le32_to_cpu(ri->i_addr[extra_size])); else - inode->i_rdev = - new_decode_dev(le32_to_cpu(ri->i_addr[1])); + inode->i_rdev = new_decode_dev( + le32_to_cpu(ri->i_addr[extra_size + 1])); } } static bool __written_first_block(struct f2fs_inode *ri) { - block_t addr = le32_to_cpu(ri->i_addr[0]); + block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); if (addr != NEW_ADDR && addr != NULL_ADDR) return true; @@ -71,25 +73,27 @@ static bool __written_first_block(struct f2fs_inode *ri) static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = + ri->i_addr[extra_size] = cpu_to_le32(old_encode_dev(inode->i_rdev)); - ri->i_addr[1] = 0; + ri->i_addr[extra_size + 1] = 0; } else { - ri->i_addr[0] = 0; - ri->i_addr[1] = + ri->i_addr[extra_size] = 0; + ri->i_addr[extra_size + 1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); - ri->i_addr[2] = 0; + ri->i_addr[extra_size + 2] = 0; } } } static void __recover_inline_status(struct inode *inode, struct page *ipage) { - void *inline_data = inline_data_addr(ipage); + void *inline_data = inline_data_addr(inode, ipage); __le32 *start = inline_data; - __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32); + __le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32); while (start < end) { if (*start++) { @@ -104,12 +108,84 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) return; } +static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + int extra_isize = le32_to_cpu(ri->i_extra_isize); + + if (!f2fs_sb_has_inode_chksum(sbi->sb)) + return false; + + if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + return false; + + if (!F2FS_FITS_IN_INODE(ri, extra_isize, i_inode_checksum)) + return false; + + return true; +} + +static __u32 f2fs_inode_chksum(struct 
f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_node *node = F2FS_NODE(page); + struct f2fs_inode *ri = &node->i; + __le32 ino = node->footer.ino; + __le32 gen = ri->i_generation; + __u32 chksum, chksum_seed; + __u32 dummy_cs = 0; + unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum); + unsigned int cs_size = sizeof(dummy_cs); + + chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino, + sizeof(ino)); + chksum_seed = f2fs_chksum(sbi, chksum, (__u8 *)&gen, sizeof(gen)); + + chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset); + chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size); + offset += cs_size; + chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset, + F2FS_BLKSIZE - offset); + return chksum; +} + +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri; + __u32 provided, calculated; + + if (!f2fs_enable_inode_chksum(sbi, page) || + PageDirty(page) || PageWriteback(page)) + return true; + + ri = &F2FS_NODE(page)->i; + provided = le32_to_cpu(ri->i_inode_checksum); + calculated = f2fs_inode_chksum(sbi, page); + + if (provided != calculated) + f2fs_msg(sbi->sb, KERN_WARNING, + "checksum invalid, ino = %x, %x vs. %x", + ino_of_node(page), provided, calculated); + + return provided == calculated; +} + +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + + if (!f2fs_enable_inode_chksum(sbi, page)) + return; + + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; struct f2fs_inode *ri; + projid_t i_projid; /* Check if ino is within scope */ if (check_nid_range(sbi, inode->i_ino)) { @@ -130,7 +206,7 @@ static int do_read_inode(struct inode *inode) i_gid_write(inode, le32_to_cpu(ri->i_gid)); set_nlink(inode, le32_to_cpu(ri->i_links)); inode->i_size = le64_to_cpu(ri->i_size); - inode->i_blocks = le64_to_cpu(ri->i_blocks); + inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1); inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); @@ -153,6 +229,9 @@ static int do_read_inode(struct inode *inode) get_inline_info(inode, ri); + fi->i_extra_isize = f2fs_has_extra_attr(inode) ? 
+ le16_to_cpu(ri->i_extra_isize) : 0; + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -166,6 +245,16 @@ static int do_read_inode(struct inode *inode) if (!need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; + if (fi->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + i_projid = (projid_t)le32_to_cpu(ri->i_projid); + else + i_projid = F2FS_DEF_PROJID; + fi->i_projid = make_kprojid(&init_user_ns, i_projid); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -226,6 +315,7 @@ make_now: ret = -EIO; goto bad_inode; } + f2fs_set_inode_flags(inode); unlock_new_inode(inode); trace_f2fs_iget(inode); return inode; @@ -267,7 +357,7 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); ri->i_size = cpu_to_le64(i_size_read(inode)); - ri->i_blocks = cpu_to_le64(inode->i_blocks); + ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); if (et) { read_lock(&et->lock); @@ -291,6 +381,20 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_generation = cpu_to_le32(inode->i_generation); ri->i_dir_level = F2FS_I(inode)->i_dir_level; + if (f2fs_has_extra_attr(inode)) { + ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_projid)) { + projid_t i_projid; + + i_projid = from_kprojid(&init_user_ns, + F2FS_I(inode)->i_projid); + ri->i_projid = cpu_to_le32(i_projid); + } + } + __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); @@ -316,7 +420,6 @@ retry: } else if (err != -ENOENT) { f2fs_stop_checkpoint(sbi, false); } - f2fs_inode_synced(inode); return 0; } ret = update_inode(inode, node_page); @@ -339,7 +442,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent producing dirty node pages * during the urgent cleaning time when running out of free sections. 
*/ - if (update_inode_page(inode) && wbc && wbc->nr_to_write) + update_inode_page(inode); + if (wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; } @@ -372,10 +476,7 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_EVICT_INODE)) - goto no_delete; -#endif + dquot_initialize(inode); remove_ino_entry(sbi, inode->i_ino, APPEND_INO); remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); @@ -387,6 +488,12 @@ retry: if (F2FS_HAS_BLOCKS(inode)) err = f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_EVICT_INODE)) { + f2fs_show_injection_info(FAULT_EVICT_INODE); + err = -EIO; + } +#endif if (!err) { f2fs_lock_op(sbi); err = remove_inode_page(inode); @@ -403,13 +510,22 @@ retry: if (err) update_inode_page(inode); + dquot_free_inode(inode); sb_end_intwrite(inode->i_sb); no_delete: + dquot_drop(inode); + stat_dec_inline_xattr(inode); stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); + if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) + f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + + /* ino == 0, if f2fs_new_inode() failed */ + if (inode->i_ino) + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, + inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); if (inode->i_nlink) { @@ -421,9 +537,10 @@ no_delete: if (is_inode_flag_set(inode, FI_FREE_NID)) { alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); + } else { + f2fs_bug_on(sbi, err && + !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); } - f2fs_bug_on(sbi, err && - !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); out_clear: fscrypt_put_encryption_info(inode, NULL); clear_inode(inode); @@ -446,6 +563,7 @@ void handle_failed_inode(struct inode *inode) * in a panic when flushing dirty inodes in gdirty_list. */ update_inode_page(inode); + f2fs_inode_synced(inode); /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index db3079cd665d..d92b8e9064cb 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -15,6 +15,7 @@ #include <linux/ctype.h> #include <linux/dcache.h> #include <linux/namei.h> +#include <linux/quotaops.h> #include "f2fs.h" #include "node.h" @@ -42,6 +43,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) } f2fs_unlock_op(sbi); + nid_free = true; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; @@ -52,16 +55,35 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) err = insert_inode_locked(inode); if (err) { err = -EINVAL; - nid_free = true; goto fail; } + if (f2fs_sb_has_project_quota(sbi->sb) && + (F2FS_I(dir)->i_flags & FS_PROJINHERIT_FL)) + F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; + else + F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, + F2FS_DEF_PROJID); + + err = dquot_initialize(inode); + if (err) + goto fail_drop; + + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + /* If the directory is encrypted, then we should encrypt the inode. 
*/ if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); set_inode_flag(inode, FI_NEW_INODE); + if (f2fs_sb_has_extra_attr(sbi->sb)) { + set_inode_flag(inode, FI_EXTRA_ATTR); + F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; + } + if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) @@ -75,6 +97,15 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); + F2FS_I(inode)->i_flags = + f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_flags |= FS_INDEX_FL; + + if (F2FS_I(inode)->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + trace_f2fs_new_inode(inode, 0); return inode; @@ -85,6 +116,16 @@ fail: set_inode_flag(inode, FI_FREE_NID); iput(inode); return ERR_PTR(err); +fail_drop: + trace_f2fs_new_inode(inode, err); + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(err); } static int is_multimedia_file(const unsigned char *s, const char *sub) @@ -136,6 +177,10 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -148,8 +193,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -163,6 +206,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -180,6 +225,15 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !fscrypt_has_permitted_context(dir, inode)) return -EPERM; + if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); inode->i_ctime = current_time(inode); @@ -233,6 +287,10 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); @@ -324,9 +382,10 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (f2fs_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { - bool nokey = f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode); - err = nokey ? 
-ENOKEY : -EPERM; + f2fs_msg(inode->i_sb, KERN_WARNING, + "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); + err = -EPERM; goto err_out; } return d_splice_alias(inode, dentry); @@ -346,6 +405,10 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) trace_f2fs_unlink_enter(dir, dentry); + err = dquot_initialize(dir); + if (err) + return err; + de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) { if (IS_ERR(page)) @@ -400,7 +463,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, return err; if (!fscrypt_has_encryption_key(dir)) - return -EPERM; + return -ENOKEY; disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + sizeof(struct fscrypt_symlink_data)); @@ -409,6 +472,10 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (disk_link.len > dir->i_sb->s_blocksize) return -ENAMETOOLONG; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -420,8 +487,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -444,7 +509,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, goto err_out; if (!fscrypt_has_encryption_key(inode)) { - err = -EPERM; + err = -ENOKEY; goto err_out; } @@ -484,6 +549,8 @@ err_out: } kfree(sd); + + f2fs_balance_fs(sbi, true); return err; out: handle_failed_inode(inode); @@ -496,6 +563,10 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -505,8 +576,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); - f2fs_balance_fs(sbi, true); - set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); @@ -521,6 +590,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out_fail: @@ -544,6 +615,10 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -551,8 +626,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -566,6 +639,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -579,6 +654,10 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -592,8 +671,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } - f2fs_balance_fs(sbi, true); - 
f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -619,6 +696,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, /* link_count was changed by d_tmpfile as well. */ f2fs_unlock_op(sbi); unlock_new_inode(inode); + + f2fs_balance_fs(sbi, true); return 0; release_out: @@ -672,6 +751,19 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } + if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -717,13 +809,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto put_out_dir; - err = update_dent_inode(old_inode, new_inode, - &new_dentry->d_name); - if (err) { - release_orphan_inode(sbi); - goto put_out_dir; - } - f2fs_set_link(new_dir, new_entry, new_page, old_inode); new_inode->i_ctime = current_time(new_inode); @@ -775,9 +860,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } down_write(&F2FS_I(old_inode)->i_sem); - file_lost_pino(old_inode); - if (new_inode && file_enc_name(new_inode)) - file_set_enc_name(old_inode); + if (!old_dir_entry || whiteout) + file_lost_pino(old_inode); + else + F2FS_I(old_inode)->i_pino = new_dir->i_ino; up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); @@ -858,6 +944,22 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; + if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid)) || + (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(old_dir)->i_projid, + F2FS_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -905,8 +1007,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, old_nlink = old_dir_entry ? -1 : 1; new_nlink = -old_nlink; err = -EMLINK; - if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) || - (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX)) + if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX)) goto out_new_dir; } @@ -914,18 +1016,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_lock_op(sbi); - err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); - if (err) - goto out_unlock; - if (file_enc_name(new_inode)) - file_set_enc_name(old_inode); - - err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name); - if (err) - goto out_undo; - if (file_enc_name(old_inode)) - file_set_enc_name(new_inode); - /* update ".." 
directory entry info of old dentry */ if (old_dir_entry) f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); @@ -941,7 +1031,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - old_dir->i_ctime = CURRENT_TIME; + old_dir->i_ctime = current_time(old_dir); if (old_nlink) { down_write(&F2FS_I(old_dir)->i_sem); f2fs_i_links_write(old_dir, old_nlink > 0); @@ -956,7 +1046,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(new_inode); up_write(&F2FS_I(new_inode)->i_sem); - new_dir->i_ctime = CURRENT_TIME; + new_dir->i_ctime = current_time(new_dir); if (new_nlink) { down_write(&F2FS_I(new_dir)->i_sem); f2fs_i_links_write(new_dir, new_nlink > 0); @@ -969,14 +1059,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); return 0; -out_undo: - /* - * Still we may fail to recover name info of f2fs_inode here - * Drop it, once its name is set as encrypted - */ - update_dent_inode(old_inode, old_inode, &old_dentry->d_name); -out_unlock: - f2fs_unlock_op(sbi); out_new_dir: if (new_dir_entry) { f2fs_dentry_kunmap(new_inode, new_dir_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bc67dc323f7e..32474db18ad9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -19,10 +19,11 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "xattr.h" #include "trace.h" #include <trace/events/f2fs.h> -#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) +#define on_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; @@ -63,8 +64,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) int i; for (i = 0; i <= UPDATE_INO; i++) - mem_size += (sbi->im[i].ino_num * - sizeof(struct ino_entry)) >> PAGE_SHIFT; + mem_size += sbi->im[i].ino_num * + sizeof(struct ino_entry); + mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { mem_size = (atomic_read(&sbi->total_ext_tree) * @@ -157,9 +159,6 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); struct nat_entry_set *head; - if (get_nat_flag(ne, IS_DIRTY)) - return; - head = radix_tree_lookup(&nm_i->nat_set_root, set); if (!head) { head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS); @@ -170,25 +169,27 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, head->entry_cnt = 0; f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); } - list_move_tail(&ne->list, &head->entry_list); + + if (get_nat_flag(ne, IS_DIRTY)) + goto refresh_list; + nm_i->dirty_nat_cnt++; head->entry_cnt++; set_nat_flag(ne, IS_DIRTY, true); +refresh_list: + if (nat_get_blkaddr(ne) == NEW_ADDR) + list_del_init(&ne->list); + else + list_move_tail(&ne->list, &head->entry_list); } static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, - struct nat_entry *ne) + struct nat_entry_set *set, struct nat_entry *ne) { - nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); - struct nat_entry_set *head; - - head = radix_tree_lookup(&nm_i->nat_set_root, set); - if (head) { - list_move_tail(&ne->list, &nm_i->nat_entries); - set_nat_flag(ne, IS_DIRTY, false); - head->entry_cnt--; - nm_i->dirty_nat_cnt--; - } + list_move_tail(&ne->list, &nm_i->nat_entries); + set_nat_flag(ne, IS_DIRTY, false); + set->entry_cnt--; + 
nm_i->dirty_nat_cnt--; } static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, @@ -381,6 +382,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) struct page *page = NULL; struct f2fs_nat_entry ne; struct nat_entry *e; + pgoff_t index; int i; ni->nid = nid; @@ -406,17 +408,21 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); } up_read(&curseg->journal_rwsem); - if (i >= 0) + if (i >= 0) { + up_read(&nm_i->nat_tree_lock); goto cache; + } /* Fill node_info from nat page */ - page = get_current_nat_page(sbi, start_nid); + index = current_nat_addr(sbi, nid); + up_read(&nm_i->nat_tree_lock); + + page = get_meta_page(sbi, index); nat_blk = (struct f2fs_nat_block *)page_address(page); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: - up_read(&nm_i->nat_tree_lock); /* cache nat entry */ down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); @@ -549,7 +555,7 @@ static int get_node_path(struct inode *inode, long block, level = 3; goto got; } else { - BUG(); + return -E2BIG; } got: return level; @@ -573,6 +579,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) int err = 0; level = get_node_path(dn->inode, index, offset, noffset); + if (level < 0) + return level; nids[0] = dn->inode->i_ino; npage[0] = dn->inode_page; @@ -608,7 +616,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } dn->nid = nids[i]; - npage[i] = new_node_page(dn, noffset[i], NULL); + npage[i] = new_node_page(dn, noffset[i]); if (IS_ERR(npage[i])) { alloc_nid_failed(sbi, nids[i]); err = PTR_ERR(npage[i]); @@ -649,7 +657,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->nid = nids[level]; dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); return 0; release_pages: @@ -673,15 +682,11 @@ static void truncate_node(struct dnode_of_data *dn) struct node_info ni; get_node_info(sbi, dn->nid, &ni); - if (dn->inode->i_blocks == 0) { - f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR); - goto invalidate; - } f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, dn->inode); + dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); set_node_addr(sbi, &ni, NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { @@ -689,7 +694,7 @@ static void truncate_node(struct dnode_of_data *dn) dec_valid_inode_count(sbi); f2fs_inode_synced(dn->inode); } -invalidate: + clear_node_page_dirty(dn->node_page); set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -875,6 +880,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); + if (level < 0) + return level; page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -971,9 +978,6 @@ int truncate_xattr_node(struct inode *inode, struct page *page) f2fs_i_xnid_write(inode, 0); - /* need to do checkpoint during fsync */ - F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); - set_new_dnode(&dn, inode, page, npage, nid); if (page) @@ -1009,7 +1013,7 @@ int remove_inode_page(struct inode *inode) /* 0 is possible, after f2fs_new_inode() has failed */ f2fs_bug_on(F2FS_I_SB(inode), - inode->i_blocks != 0 && 
inode->i_blocks != 1); + inode->i_blocks != 0 && inode->i_blocks != 8); /* will put inode & node pages */ truncate_node(&dn); @@ -1024,14 +1028,13 @@ struct page *new_inode_page(struct inode *inode) set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); /* caller should f2fs_put_page(page, 1); */ - return new_node_page(&dn, 0, NULL); + return new_node_page(&dn, 0); } -struct page *new_node_page(struct dnode_of_data *dn, - unsigned int ofs, struct page *ipage) +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - struct node_info old_ni, new_ni; + struct node_info new_ni; struct page *page; int err; @@ -1042,17 +1045,18 @@ struct page *new_node_page(struct dnode_of_data *dn, if (!page) return ERR_PTR(-ENOMEM); - if (unlikely(!inc_valid_node_count(sbi, dn->inode))) { - err = -ENOSPC; + if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs)))) goto fail; - } - get_node_info(sbi, dn->nid, &old_ni); - - /* Reinitialize old_ni with new node page */ - f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR); - new_ni = old_ni; +#ifdef CONFIG_F2FS_CHECK_FS + get_node_info(sbi, dn->nid, &new_ni); + f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); +#endif + new_ni.nid = dn->nid; new_ni.ino = dn->inode->i_ino; + new_ni.blk_addr = NULL_ADDR; + new_ni.flag = 0; + new_ni.version = 0; set_node_addr(sbi, &new_ni, NEW_ADDR, false); f2fs_wait_on_page_writeback(page, NODE, true); @@ -1153,6 +1157,7 @@ repeat: f2fs_put_page(page, 1); return ERR_PTR(err); } else if (err == LOCKED_PAGE) { + err = 0; goto page_hit; } @@ -1166,15 +1171,27 @@ repeat: goto repeat; } - if (unlikely(!PageUptodate(page))) + if (unlikely(!PageUptodate(page))) { + err = -EIO; goto out_err; + } + + if (!f2fs_inode_chksum_verify(sbi, page)) { + err = -EBADMSG; + goto out_err; + } page_hit: if(unlikely(nid != nid_of_node(page))) { - f2fs_bug_on(sbi, 1); - ClearPageUptodate(page); + f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, " + "nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + nid, nid_of_node(page), ino_of_node(page), + ofs_of_node(page), cpver_of_node(page), + next_blkaddr_of_node(page)); + err = -EINVAL; out_err: + ClearPageUptodate(page); f2fs_put_page(page, 1); - return ERR_PTR(-EIO); + return ERR_PTR(err); } return page; } @@ -1318,16 +1335,103 @@ continue_unlock: return last_page; } +static int __write_node_page(struct page *page, bool atomic, bool *submitted, + struct writeback_control *wbc, bool do_balance, + enum iostat_type io_type) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + nid_t nid; + struct node_info ni; + struct f2fs_io_info fio = { + .sbi = sbi, + .type = NODE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .page = page, + .encrypted_page = NULL, + .submitted = false, + .io_type = io_type, + }; + + trace_f2fs_writepage(page, NODE); + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; + + /* get old block addr of this node page */ + nid = nid_of_node(page); + f2fs_bug_on(sbi, page->index != nid); + + if (wbc->for_reclaim) { + if (!down_read_trylock(&sbi->node_write)) + goto redirty_out; + } else { + down_read(&sbi->node_write); + } + + get_node_info(sbi, nid, &ni); + + /* This page is already truncated */ + if (unlikely(ni.blk_addr == NULL_ADDR)) { + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + unlock_page(page); + return 0; + } + + if (atomic && !test_opt(sbi, NOBARRIER)) 
+ fio.op_flags |= WRITE_FLUSH_FUA; + + set_page_writeback(page); + fio.old_blkaddr = ni.blk_addr; + write_node_page(nid, &fio); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + + if (wbc->for_reclaim) { + f2fs_submit_merged_write_cond(sbi, page->mapping->host, 0, + page->index, NODE); + submitted = NULL; + } + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_submit_merged_write(sbi, NODE); + submitted = NULL; + } + if (submitted) + *submitted = fio.submitted; + + if (do_balance) + f2fs_balance_fs(sbi, false); + return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; +} + +static int f2fs_write_node_page(struct page *page, + struct writeback_control *wbc) +{ + return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO); +} + int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { pgoff_t index, end; + pgoff_t last_idx = ULONG_MAX; struct pagevec pvec; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; - int nwritten = 0; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1349,6 +1453,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); @@ -1380,6 +1485,9 @@ continue_unlock: f2fs_wait_on_page_writeback(page, NODE, true); BUG_ON(PageWriteback(page)); + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); + if (!atomic || page == last_page) { set_fsync_mark(page, 1); if (IS_INODE(page)) { @@ -1397,13 +1505,16 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); + ret = __write_node_page(page, atomic && + page == last_page, + &submitted, wbc, true, + FS_NODE_IO); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); break; - } else { - nwritten++; + } else if (submitted) { + last_idx = page->index; } if (page == last_page) { @@ -1429,12 +1540,13 @@ continue_unlock: goto retry; } out: - if (nwritten) - f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_write_cond(sbi, NULL, ino, last_idx, NODE); return ret ? 
-EIO: 0; } -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type) { pgoff_t index, end; struct pagevec pvec; @@ -1458,6 +1570,7 @@ next_step: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { pagevec_release(&pvec); @@ -1511,9 +1624,11 @@ continue_unlock: set_fsync_mark(page, 0); set_dentry_mark(page, 0); - if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) + ret = __write_node_page(page, false, &submitted, + wbc, do_balance, io_type); + if (ret) unlock_page(page); - else + else if (submitted) nwritten++; if (--wbc->nr_to_write == 0) @@ -1534,7 +1649,7 @@ continue_unlock: } out: if (nwritten) - f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_write(sbi, NODE); return ret; } @@ -1580,72 +1695,6 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) return ret; } -static int f2fs_write_node_page(struct page *page, - struct writeback_control *wbc) -{ - struct f2fs_sb_info *sbi = F2FS_P_SB(page); - nid_t nid; - struct node_info ni; - struct f2fs_io_info fio = { - .sbi = sbi, - .type = NODE, - .op = REQ_OP_WRITE, - .op_flags = wbc_to_write_flags(wbc), - .page = page, - .encrypted_page = NULL, - }; - - trace_f2fs_writepage(page, NODE); - - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto redirty_out; - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; - - /* get old block addr of this node page */ - nid = nid_of_node(page); - f2fs_bug_on(sbi, page->index != nid); - - if (wbc->for_reclaim) { - if (!down_read_trylock(&sbi->node_write)) - goto redirty_out; - } else { - down_read(&sbi->node_write); - } - - get_node_info(sbi, nid, &ni); - - /* This page is already truncated */ - if (unlikely(ni.blk_addr == NULL_ADDR)) { - ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); - unlock_page(page); - return 0; - } - - set_page_writeback(page); - fio.old_blkaddr = ni.blk_addr; - write_node_page(nid, &fio); - set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); - dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); - - if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); - - unlock_page(page); - - if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, NODE, WRITE); - - return 0; - -redirty_out: - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; -} - static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1653,6 +1702,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct blk_plug plug; long diff; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); @@ -1665,7 +1717,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; blk_start_plug(&plug); - sync_node_pages(sbi, wbc); + sync_node_pages(sbi, wbc, true, FS_NODE_IO); blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; @@ -1743,43 +1795,71 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, radix_tree_delete(&nm_i->free_nid_root, i->nid); } -static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) +/* return if the nid is recognized as free */ +static 
bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct free_nid *i; + struct free_nid *i, *e; struct nat_entry *ne; - int err; + int err = -EINVAL; + bool ret = false; /* 0 nid should not be used */ if (unlikely(nid == 0)) - return 0; - - if (build) { - /* do not add allocated nids */ - ne = __lookup_nat_cache(nm_i, nid); - if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || - nat_get_blkaddr(ne) != NULL_ADDR)) - return 0; - } + return false; i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; i->state = NID_NEW; - if (radix_tree_preload(GFP_NOFS)) { - kmem_cache_free(free_nid_slab, i); - return 0; - } + if (radix_tree_preload(GFP_NOFS)) + goto err; spin_lock(&nm_i->nid_list_lock); + + if (build) { + /* + * Thread A Thread B + * - f2fs_create + * - f2fs_new_inode + * - alloc_nid + * - __insert_nid_to_list(ALLOC_NID_LIST) + * - f2fs_balance_fs_bg + * - build_free_nids + * - __build_free_nids + * - scan_nat_page + * - add_free_nid + * - __lookup_nat_cache + * - f2fs_add_link + * - init_inode_metadata + * - new_inode_page + * - new_node_page + * - set_node_addr + * - alloc_nid_done + * - __remove_nid_from_list(ALLOC_NID_LIST) + * - __insert_nid_to_list(FREE_NID_LIST) + */ + ne = __lookup_nat_cache(nm_i, nid); + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || + nat_get_blkaddr(ne) != NULL_ADDR)) + goto err_out; + + e = __lookup_free_nid_list(nm_i, nid); + if (e) { + if (e->state == NID_NEW) + ret = true; + goto err_out; + } + } + ret = true; err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); +err_out: spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); - if (err) { +err: + if (err) kmem_cache_free(free_nid_slab, i); - return 0; - } - return 1; + return ret; } static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) @@ -1800,17 +1880,45 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } +static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, + bool set, bool build) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); + unsigned int nid_ofs = nid - START_NID(nid); + + if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + if (set) + __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + else + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + + if (set) + nm_i->free_nid_count[nat_ofs]++; + else if (!build) + nm_i->free_nid_count[nat_ofs]--; +} + static void scan_nat_page(struct f2fs_sb_info *sbi, struct page *nat_page, nid_t start_nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct f2fs_nat_block *nat_blk = page_address(nat_page); block_t blk_addr; + unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; + if (test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + __set_bit_le(nat_ofs, nm_i->nat_block_bitmap); + i = start_nid % NAT_ENTRY_PER_BLOCK; for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { + bool freed = false; if (unlikely(start_nid >= nm_i->max_nid)) break; @@ -1818,11 +1926,58 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) - add_free_nid(sbi, start_nid, true); + freed = add_free_nid(sbi, start_nid, true); + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, start_nid, freed, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); } } -static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) +static 
void scan_free_nid_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + unsigned int i, idx; + + down_read(&nm_i->nat_tree_lock); + + for (i = 0; i < nm_i->nat_blocks; i++) { + if (!test_bit_le(i, nm_i->nat_block_bitmap)) + continue; + if (!nm_i->free_nid_count[i]) + continue; + for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { + nid_t nid; + + if (!test_bit_le(idx, nm_i->free_nid_bitmap[i])) + continue; + + nid = i * NAT_ENTRY_PER_BLOCK + idx; + add_free_nid(sbi, nid, true); + + if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) + goto out; + } + } +out: + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + nid_t nid; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); + if (addr == NULL_ADDR) + add_free_nid(sbi, nid, true); + else + remove_free_nid(sbi, nid); + } + up_read(&curseg->journal_rwsem); + up_read(&nm_i->nat_tree_lock); +} + +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1830,6 +1985,9 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) int i = 0; nid_t nid = nm_i->next_scan_nid; + if (unlikely(nid >= nm_i->max_nid)) + nid = 0; + /* Enough entries */ if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; @@ -1837,6 +1995,14 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) if (!sync && !available_free_memory(sbi, FREE_NIDS)) return; + if (!mount) { + /* try to find free nids in free_nid_bitmap */ + scan_free_nid_bits(sbi); + + if (nm_i->nid_cnt[FREE_NID_LIST]) + return; + } + /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); @@ -1879,10 +2045,10 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) nm_i->ra_nid_pages, META_NAT, false); } -void build_free_nids(struct f2fs_sb_info *sbi, bool sync) +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { mutex_lock(&NM_I(sbi)->build_lock); - __build_free_nids(sbi, sync); + __build_free_nids(sbi, sync, mount); mutex_unlock(&NM_I(sbi)->build_lock); } @@ -1897,8 +2063,10 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; retry: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_ALLOC_NID)) + if (time_to_inject(sbi, FAULT_ALLOC_NID)) { + f2fs_show_injection_info(FAULT_ALLOC_NID); return false; + } #endif spin_lock(&nm_i->nid_list_lock); @@ -1918,13 +2086,16 @@ retry: i->state = NID_ALLOC; __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); nm_i->available_nids--; + + update_free_nid_bitmap(sbi, *nid, false, false); + spin_unlock(&nm_i->nid_list_lock); return true; } spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - build_free_nids(sbi, true); + build_free_nids(sbi, true, false); goto retry; } @@ -1972,6 +2143,8 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; + update_free_nid_bitmap(sbi, nid, true, false); + spin_unlock(&nm_i->nid_list_lock); if (need_free) @@ -2034,38 +2207,47 @@ update_inode: f2fs_put_page(ipage, 1); } -void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { 
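/*
 * Editor's note, not part of the patch: as rewritten below, xattr
 * recovery no longer reuses the nid recorded in the fsynced node page.
 * It (1) invalidates the inode's previous xattr node, if any, (2)
 * allocates a fresh nid and node page via alloc_nid()/new_node_page(),
 * failing with -ENOSPC or the page error instead of hitting a bug_on,
 * and (3) copies VALID_XATTR_BLOCK_SIZE bytes from the logged page and
 * dirties the new page so it is written out through the normal path.
 */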
struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; - nid_t new_xnid = nid_of_node(page); + nid_t new_xnid; + struct dnode_of_data dn; struct node_info ni; + struct page *xpage; - /* 1: invalidate the previous xattr nid */ if (!prev_xnid) goto recover_xnid; - /* Deallocate node address */ + /* 1: invalidate the previous xattr nid */ get_node_info(sbi, prev_xnid, &ni); f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, inode); + dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); recover_xnid: - /* 2: allocate new xattr nid */ - if (unlikely(!inc_valid_node_count(sbi, inode))) - f2fs_bug_on(sbi, 1); + /* 2: update xattr nid in inode */ + if (!alloc_nid(sbi, &new_xnid)) + return -ENOSPC; - remove_free_nid(sbi, new_xnid); - get_node_info(sbi, new_xnid, &ni); - ni.ino = inode->i_ino; - set_node_addr(sbi, &ni, NEW_ADDR, false); - f2fs_i_xnid_write(inode, new_xnid); + set_new_dnode(&dn, inode, NULL, NULL, new_xnid); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_xnid); + return PTR_ERR(xpage); + } - /* 3: update xattr blkaddr */ - refresh_sit_entry(sbi, NEW_ADDR, blkaddr); - set_node_addr(sbi, &ni, blkaddr, false); + alloc_nid_done(sbi, new_xnid); + update_inode_page(inode); + + /* 3: update and set xattr node page dirty */ + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); + + set_page_dirty(xpage); + f2fs_put_page(xpage, 1); + + return 0; } int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) @@ -2101,12 +2283,19 @@ retry: dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); dst->i_xattr_nid = 0; - dst->i_inline = src->i_inline & F2FS_INLINE_XATTR; + dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); + if (dst->i_inline & F2FS_EXTRA_ATTR) { + dst->i_extra_isize = src->i_extra_isize; + if (f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_projid)) + dst->i_projid = src->i_projid; + } new_ni = old_ni; new_ni.ino = ino; - if (unlikely(!inc_valid_node_count(sbi, NULL))) + if (unlikely(inc_valid_node_count(sbi, NULL, true))) WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); @@ -2208,8 +2397,39 @@ add_out: list_add_tail(&nes->set_list, head); } +static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, + struct page *page) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; + struct f2fs_nat_block *nat_blk = page_address(page); + int valid = 0; + int i; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) { + if (start_nid == 0 && i == 0) + valid++; + if (nat_blk->entries[i].block_addr) + valid++; + } + if (valid == 0) { + __set_bit_le(nat_index, nm_i->empty_nat_bits); + __clear_bit_le(nat_index, nm_i->full_nat_bits); + return; + } + + __clear_bit_le(nat_index, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_index, nm_i->full_nat_bits); + else + __clear_bit_le(nat_index, nm_i->full_nat_bits); +} + static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, - struct nat_entry_set *set) + struct nat_entry_set *set, struct cp_control *cpc) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; @@ -2224,7 +2444,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush 
nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; if (to_journal) { @@ -2241,8 +2462,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, nid_t nid = nat_get_nid(ne); int offset; - if (nat_get_blkaddr(ne) == NEW_ADDR) - continue; + f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR); if (to_journal) { offset = lookup_journal_in_cursum(journal, @@ -2255,30 +2475,38 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, } raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); - __clear_nat_cache_dirty(NM_I(sbi), ne); + __clear_nat_cache_dirty(NM_I(sbi), set, ne); if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; + update_free_nid_bitmap(sbi, nid, true, false); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } else { + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, nid, false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } - if (to_journal) + if (to_journal) { up_write(&curseg->journal_rwsem); - else + } else { + __update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); + } - f2fs_bug_on(sbi, set->entry_cnt); - - radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - kmem_cache_free(nat_entry_set_slab, set); + /* Allow dirty nats by node block allocation in write_begin */ + if (!set->entry_cnt) { + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + kmem_cache_free(nat_entry_set_slab, set); + } } /* * This function is called during the checkpointing process. */ -void flush_nat_entries(struct f2fs_sb_info *sbi) +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -2299,7 +2527,8 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) * entries, remove all entries from journal and merge them * into nat entry set. 
*/ - if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); while ((found = __gang_lookup_nat_set(nm_i, @@ -2313,11 +2542,86 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) - __flush_nat_entry_set(sbi, set); + __flush_nat_entry_set(sbi, set, cpc); up_write(&nm_i->nat_tree_lock); + /* Allow dirty nats by node block allocation in write_begin */ +} - f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); +static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE; + unsigned int i; + __u64 cp_ver = cur_cp_version(ckpt); + block_t nat_bits_addr; + + if (!enabled_nat_bits(sbi, NULL)) + return 0; + + nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 + + F2FS_BLKSIZE - 1); + nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, + GFP_KERNEL); + if (!nm_i->nat_bits) + return -ENOMEM; + + nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - + nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) { + struct page *page = get_meta_page(sbi, nat_bits_addr++); + + memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), + page_address(page), F2FS_BLKSIZE); + f2fs_put_page(page, 1); + } + + cp_ver |= (cur_cp_crc(ckpt) << 32); + if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { + disable_nat_bits(sbi, true); + return 0; + } + + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + + f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint"); + return 0; +} + +static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i = 0; + nid_t nid, last_nid; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + + nid = i * NAT_ENTRY_PER_BLOCK; + last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK; + + spin_lock(&NM_I(sbi)->nid_list_lock); + for (; nid < last_nid; nid++) + update_free_nid_bitmap(sbi, nid, true, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + } } static int init_node_manager(struct f2fs_sb_info *sbi) @@ -2325,15 +2629,15 @@ static int init_node_manager(struct f2fs_sb_info *sbi) struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned char *version_bitmap; - unsigned int nat_segs, nat_blocks; + unsigned int nat_segs; + int err; nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr); /* segment_count_nat includes pair segment so divide to 2. 
*/ nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; - nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); - - nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; + nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - @@ -2367,6 +2671,10 @@ static int init_node_manager(struct f2fs_sb_info *sbi) if (!nm_i->nat_bitmap) return -ENOMEM; + err = __get_nat_bitmaps(sbi); + if (err) + return err; + #ifdef CONFIG_F2FS_CHECK_FS nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size, GFP_KERNEL); @@ -2377,6 +2685,27 @@ static int init_node_manager(struct f2fs_sb_info *sbi) return 0; } +static int init_free_nid_cache(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks * + NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); + if (!nm_i->free_nid_bitmap) + return -ENOMEM; + + nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8, + GFP_KERNEL); + if (!nm_i->nat_block_bitmap) + return -ENOMEM; + + nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks * + sizeof(unsigned short), GFP_KERNEL); + if (!nm_i->free_nid_count) + return -ENOMEM; + return 0; +} + int build_node_manager(struct f2fs_sb_info *sbi) { int err; @@ -2389,7 +2718,14 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi, true); + err = init_free_nid_cache(sbi); + if (err) + return err; + + /* load free nid status from nat_bits table */ + load_free_nid_bitmap(sbi); + + build_free_nids(sbi, true, true); return 0; } @@ -2447,7 +2783,12 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) } up_write(&nm_i->nat_tree_lock); + kvfree(nm_i->nat_block_bitmap); + kvfree(nm_i->free_nid_bitmap); + kvfree(nm_i->free_nid_count); + kfree(nm_i->nat_bitmap); + kfree(nm_i->nat_bits); #ifdef CONFIG_F2FS_CHECK_FS kfree(nm_i->nat_bitmap_mir); #endif diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 29ff783eb9c3..bb53e9955ff2 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -9,10 +9,10 @@ * published by the Free Software Foundation. 
*/ /* start node id of a node block dedicated to the given node id */ -#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) +#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) /* node block offset on the NAT area dedicated to the given start node id */ -#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) +#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ #define FREE_NID_PAGES 8 @@ -62,16 +62,16 @@ struct nat_entry { struct node_info ni; /* in-memory node information */ }; -#define nat_get_nid(nat) (nat->ni.nid) -#define nat_set_nid(nat, n) (nat->ni.nid = n) -#define nat_get_blkaddr(nat) (nat->ni.blk_addr) -#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b) -#define nat_get_ino(nat) (nat->ni.ino) -#define nat_set_ino(nat, i) (nat->ni.ino = i) -#define nat_get_version(nat) (nat->ni.version) -#define nat_set_version(nat, v) (nat->ni.version = v) +#define nat_get_nid(nat) ((nat)->ni.nid) +#define nat_set_nid(nat, n) ((nat)->ni.nid = (n)) +#define nat_get_blkaddr(nat) ((nat)->ni.blk_addr) +#define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b)) +#define nat_get_ino(nat) ((nat)->ni.ino) +#define nat_set_ino(nat, i) ((nat)->ni.ino = (i)) +#define nat_get_version(nat) ((nat)->ni.version) +#define nat_set_version(nat, v) ((nat)->ni.version = (v)) -#define inc_node_version(version) (++version) +#define inc_node_version(version) (++(version)) static inline void copy_node_info(struct node_info *dst, struct node_info *src) @@ -200,21 +200,18 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) struct f2fs_nm_info *nm_i = NM_I(sbi); pgoff_t block_off; pgoff_t block_addr; - int seg_off; + /* + * block_off = segment_off * 512 + off_in_segment + * OLD = (segment_off * 512) * 2 + off_in_segment + * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment + */ block_off = NAT_BLOCK_OFFSET(start); - seg_off = block_off >> sbi->log_blocks_per_seg; block_addr = (pgoff_t)(nm_i->nat_blkaddr + - (seg_off << sbi->log_blocks_per_seg << 1) + + (block_off << 1) - (block_off & (sbi->blocks_per_seg - 1))); -#ifdef CONFIG_F2FS_CHECK_FS - if (f2fs_test_bit(block_off, nm_i->nat_bitmap) != - f2fs_test_bit(block_off, nm_i->nat_bitmap_mir)) - f2fs_bug_on(sbi, 1); -#endif - if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -227,11 +224,7 @@ static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, struct f2fs_nm_info *nm_i = NM_I(sbi); block_addr -= nm_i->nat_blkaddr; - if ((block_addr >> sbi->log_blocks_per_seg) % 2) - block_addr -= sbi->blocks_per_seg; - else - block_addr += sbi->blocks_per_seg; - + block_addr ^= 1 << sbi->log_blocks_per_seg; return block_addr + nm_i->nat_blkaddr; } @@ -306,14 +299,11 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); - __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver); + __u64 cp_ver = cur_cp_version(ckpt); + + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); - if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); - cp_ver |= (crc << 32); - } rn->footer.cp_ver = cpu_to_le64(cp_ver); rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } @@ 
-321,14 +311,11 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) static inline bool is_recoverable_dnode(struct page *page) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); __u64 cp_ver = cur_cp_version(ckpt); - if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); - cp_ver |= (crc << 32); - } + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); + return cp_ver == cpver_of_node(page); } @@ -358,7 +345,7 @@ static inline bool IS_DNODE(struct page *node_page) unsigned int ofs = ofs_of_node(node_page); if (f2fs_has_xattr_block(ofs)) - return false; + return true; if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || ofs == 5 + 2 * NIDS_PER_BLOCK) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4a3c48c24c10..9626758bc762 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -69,20 +69,34 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, } static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, - struct list_head *head, nid_t ino) + struct list_head *head, nid_t ino, bool quota_inode) { struct inode *inode; struct fsync_inode_entry *entry; + int err; inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); + err = dquot_initialize(inode); + if (err) + goto err_out; + + if (quota_inode) { + err = dquot_alloc_inode(inode); + if (err) + goto err_out; + } + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); entry->inode = inode; list_add_tail(&entry->list, head); return entry; +err_out: + iput(inode); + return ERR_PTR(err); } static void del_fsync_inode(struct fsync_inode_entry *entry) @@ -107,7 +121,8 @@ static int recover_dentry(struct inode *inode, struct page *ipage, entry = get_fsync_inode(dir_list, pino); if (!entry) { - entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino); + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, + pino, false); if (IS_ERR(entry)) { dir = ERR_CAST(entry); err = PTR_ERR(entry); @@ -140,6 +155,13 @@ retry: err = -EEXIST; goto out_unmap_put; } + + err = dquot_initialize(einode); + if (err) { + iput(einode); + goto out_unmap_put; + } + err = acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); @@ -198,7 +220,8 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) +static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, + bool check_only) { struct curseg_info *curseg; struct page *page = NULL; @@ -225,17 +248,22 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) { - if (IS_INODE(page) && is_dent_dnode(page)) { + bool quota_inode = false; + + if (!check_only && + IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; + quota_inode = true; } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. 
*/ - entry = add_fsync_inode(sbi, head, ino_of_node(page)); + entry = add_fsync_inode(sbi, head, ino_of_node(page), + quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); if (err == -ENOENT) { @@ -326,10 +354,18 @@ got_it: f2fs_put_page(node_page, 1); if (ino != dn->inode->i_ino) { + int ret; + /* Deallocate previous index in the node page */ inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); + + ret = dquot_initialize(inode); + if (ret) { + iput(inode); + return ret; + } } else { inode = dn->inode; } @@ -359,7 +395,8 @@ out: return 0; truncate_out: - if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr) + if (datablock_addr(tdn.inode, tdn.node_page, + tdn.ofs_in_node) == blkaddr) truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_page_locked) unlock_page(dn->inode_page); @@ -378,11 +415,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (IS_INODE(page)) { recover_inline_xattr(inode, page); } else if (f2fs_has_xattr_block(ofs_of_node(page))) { - /* - * Deprecated; xattr blocks should be found from cold log. - * But, we should remain this for backward compatibility. - */ - recover_xattr_data(inode, page, blkaddr); + err = recover_xattr_data(inode, page, blkaddr); + if (!err) + recovered++; goto out; } @@ -414,8 +449,8 @@ retry_dn: for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest; - src = datablock_addr(dn.node_page, dn.ofs_in_node); - dest = datablock_addr(page, dn.ofs_in_node); + src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + dest = datablock_addr(dn.inode, page, dn.ofs_in_node); /* skip recovering if dest is the same as src */ if (src == dest) @@ -428,8 +463,9 @@ retry_dn: } if (!file_keep_isize(inode) && - (i_size_read(inode) <= (start << PAGE_SHIFT))) - f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); + (i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT))) + f2fs_i_size_write(inode, + (loff_t)(start + 1) << PAGE_SHIFT); /* * dest is reserved block, invalidate src block @@ -556,12 +592,27 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) struct list_head dir_list; int err; int ret = 0; + unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) - return -ENOMEM; + if (!fsync_entry_slab) { + err = -ENOMEM; + goto out; + } INIT_LIST_HEAD(&inode_list); INIT_LIST_HEAD(&dir_list); @@ -570,13 +621,13 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) mutex_lock(&sbi->cp_mutex); /* step #1: find fsynced inode numbers */ - err = find_fsync_dnodes(sbi, &inode_list); + err = find_fsync_dnodes(sbi, &inode_list, check_only); if (err || list_empty(&inode_list)) - goto out; + goto skip; if (check_only) { ret = 1; - goto out; + goto skip; } need_writecp = true; @@ -585,7 +636,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) err = recover_data(sbi, &inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); -out: +skip: destroy_fsync_dnodes(&inode_list); /* truncate meta pages to be used by the recovery */ 
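Editor's aside, not part of the patch: the recover_fsync_data() changes
around this point temporarily clear MS_RDONLY (and, under CONFIG_QUOTA,
set MS_ACTIVE and enable quota files) so that recovery and iput() can
write, then restore the saved s_flags at the final out: label. A minimal
userspace model of that save/modify/restore pattern follows; the flag
name and helpers are illustrative, not kernel API:

	#include <stdio.h>

	#define MODEL_RDONLY 0x1UL		/* stands in for MS_RDONLY */

	static void recovery_work(void)
	{
		/* writes that need a writable fs would happen here */
	}

	static void run_recovery(unsigned long *s_flags)
	{
		unsigned long saved = *s_flags;	/* remember mount state */

		*s_flags &= ~MODEL_RDONLY;	/* allow writes while recovering */
		recovery_work();
		*s_flags = saved;		/* restore MS_RDONLY status */
	}

	int main(void)
	{
		unsigned long flags = MODEL_RDONLY;

		run_recovery(&flags);
		printf("flags after recovery: %#lx\n", flags);
		return 0;
	}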
@@ -598,8 +649,6 @@ out: } clear_sbi_flag(sbi, SBI_POR_DOING); - if (err) - set_ckpt_flags(sbi, CP_ERROR_FLAG); mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ @@ -613,5 +662,12 @@ out: } kmem_cache_destroy(fsync_entry_slab); +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + return ret ? ret: err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 289b3facd2d8..059a219b7740 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,10 +16,13 @@ #include #include #include +#include +#include #include "f2fs.h" #include "segment.h" #include "node.h" +#include "gc.h" #include "trace.h" #include @@ -166,6 +169,21 @@ found: return result - size + __reverse_ffz(tmp); } +bool need_SSR(struct f2fs_sb_info *sbi) +{ + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + + if (test_opt(sbi, LFS)) + return false; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + return true; + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + + 2 * reserved_sections(sbi)); +} + void register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); @@ -212,9 +230,15 @@ static int __revoke_inmem_pages(struct inode *inode, struct node_info ni; trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); - +retry: set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) { + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } err = -EAGAIN; goto next; } @@ -247,9 +271,40 @@ void drop_inmem_pages(struct inode *inode) mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } +void drop_inmem_page(struct inode *inode, struct page *page) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct list_head *head = &fi->inmem_pages; + struct inmem_pages *cur = NULL; + + f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page)); + + mutex_lock(&fi->inmem_lock); + list_for_each_entry(cur, head, list) { + if (cur->page == page) + break; + } + + f2fs_bug_on(sbi, !cur || cur->page != page); + list_del(&cur->list); + mutex_unlock(&fi->inmem_lock); + + dec_page_count(sbi, F2FS_INMEM_PAGES); + kmem_cache_free(inmem_entry_slab, cur); + + ClearPageUptodate(page); + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 0); + + trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); +} + static int __commit_inmem_pages(struct inode *inode, struct list_head *revoke_list) { @@ -260,10 +315,10 @@ static int __commit_inmem_pages(struct inode *inode, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, - .encrypted_page = NULL, + .op_flags = REQ_SYNC | REQ_PRIO, + .io_type = FS_DATA_IO, }; - bool submit_bio = false; + pgoff_t last_idx = ULONG_MAX; int err = 0; list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { @@ -279,25 +334,31 @@ static int __commit_inmem_pages(struct inode *inode, inode_dec_dirty_pages(inode); remove_dirty_inode(inode); } - +retry: fio.page = page; + fio.old_blkaddr = NULL_ADDR; + fio.encrypted_page = NULL; + fio.need_lock = LOCK_DONE; err = 
do_write_data_page(&fio); if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } unlock_page(page); break; } - /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - - submit_bio = true; + last_idx = page->index; } unlock_page(page); list_move_tail(&cur->list, revoke_list); } - if (submit_bio) - f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_write_cond(sbi, inode, 0, last_idx, DATA); if (!err) __revoke_inmem_pages(inode, revoke_list, false, false); @@ -352,15 +413,14 @@ int commit_inmem_pages(struct inode *inode) void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_CHECKPOINT)) + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); + } #endif - if (!need) - return; - /* balance_fs_bg is able to be pending */ - if (excess_cached_nats(sbi)) + if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi); /* @@ -369,7 +429,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false, false); + f2fs_gc(sbi, false, false, NULL_SEGNO); } } @@ -386,9 +446,9 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, FREE_NIDS)) try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi, false); + build_free_nids(sbi, false, false); - if (!is_idle(sbi)) + if (!is_idle(sbi) && !excess_dirty_nats(sbi)) return; /* checkpoint is the only way to shrink partial cached entries */ @@ -409,7 +469,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } -static int __submit_flush_wait(struct block_device *bdev) +static int __submit_flush_wait(struct f2fs_sb_info *sbi, + struct block_device *bdev) { struct bio *bio = f2fs_bio_alloc(0); int ret; @@ -418,20 +479,24 @@ static int __submit_flush_wait(struct block_device *bdev) bio->bi_bdev = bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); bio_put(bio); + + trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE), ret); return ret; } static int submit_flush_wait(struct f2fs_sb_info *sbi) { - int ret = __submit_flush_wait(sbi->sb->s_bdev); + int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev); int i; - if (sbi->s_ndevs && !ret) { - for (i = 1; i < sbi->s_ndevs; i++) { - ret = __submit_flush_wait(FDEV(i).bdev); - if (ret) - break; - } + if (!sbi->s_ndevs || ret) + return ret; + + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; } return ret; } @@ -445,6 +510,8 @@ repeat: if (kthread_should_stop()) return 0; + sb_start_intwrite(sbi->sb); + if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; @@ -453,6 +520,8 @@ repeat: fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); ret = submit_flush_wait(sbi); + atomic_inc(&fcc->issued_flush); + llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; @@ -461,6 +530,8 @@ repeat: fcc->dispatch_list = NULL; } + sb_end_intwrite(sbi->sb); + wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; @@ -470,36 +541,60 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; - - trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), - test_opt(sbi, FLUSH_MERGE)); + int ret; 
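/*
 * Editor's sketch, not part of the patch: in the rewrite below, a
 * caller that finds no flush in flight submits the flush directly;
 * concurrent callers queue on a lock-free llist and sleep on a
 * completion that the dispatching side fulfils with the shared result.
 * A detached C11 model of the "first in-flight caller issues" decision
 * follows; the names are illustrative, not kernel API.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int inflight;	/* models fcc->issing_flush */

/*
 * True when the caller is the only flusher and should submit itself,
 * mirroring atomic_inc_return(&fcc->issing_flush) == 1: fetch_add
 * returns the old value, so 0 means we went first.
 */
static bool should_issue_directly(void)
{
	return atomic_fetch_add(&inflight, 1) == 0;
}

static void flush_done(void)
{
	atomic_fetch_sub(&inflight, 1);
}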
if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { - int ret; - - atomic_inc(&fcc->submit_flush); + if (!test_opt(sbi, FLUSH_MERGE)) { ret = submit_flush_wait(sbi); - atomic_dec(&fcc->submit_flush); + atomic_inc(&fcc->issued_flush); + return ret; + } + + if (atomic_inc_return(&fcc->issing_flush) == 1) { + ret = submit_flush_wait(sbi); + atomic_dec(&fcc->issing_flush); + + atomic_inc(&fcc->issued_flush); return ret; } init_completion(&cmd.wait); - atomic_inc(&fcc->submit_flush); llist_add(&cmd.llnode, &fcc->issue_list); - if (!fcc->dispatch_list) + /* update issue_list before we wake up issue_flush thread */ + smp_mb(); + + if (waitqueue_active(&fcc->flush_wait_queue)) wake_up(&fcc->flush_wait_queue); if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + atomic_dec(&fcc->issing_flush); } else { - llist_del_all(&fcc->issue_list); - atomic_set(&fcc->submit_flush, 0); + struct llist_node *list; + + list = llist_del_all(&fcc->issue_list); + if (!list) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->issing_flush); + } else { + struct flush_cmd *tmp, *next; + + ret = submit_flush_wait(sbi); + + llist_for_each_entry_safe(tmp, next, list, llnode) { + if (tmp == &cmd) { + cmd.ret = ret; + atomic_dec(&fcc->issing_flush); + continue; + } + tmp->ret = ret; + complete(&tmp->wait); + } + } } return cmd.ret; @@ -513,16 +608,22 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; + if (fcc->f2fs_issue_flush) + return err; goto init_thread; } fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; - atomic_set(&fcc->submit_flush, 0); + atomic_set(&fcc->issued_flush, 0); + atomic_set(&fcc->issing_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; + if (!test_opt(sbi, FLUSH_MERGE)) + return err; + init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); @@ -592,8 +693,8 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; - if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) - clear_bit(GET_SECNO(sbi, segno), + if (get_valid_blocks(sbi, segno, true) == 0) + clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); } } @@ -613,7 +714,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_lock(&dirty_i->seglist_lock); - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == 0) { __locate_dirty_segment(sbi, segno, PRE); @@ -628,61 +729,91 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, - struct bio *bio, block_t lstart, block_t len) +static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) { - struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; struct discard_cmd *dc; + f2fs_bug_on(sbi, !len); + + pend_list = &dcc->pend_list[plist_idx(len)]; + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); INIT_LIST_HEAD(&dc->list); - dc->bio 
= bio; + dc->bdev = bdev; dc->lstart = lstart; + dc->start = start; dc->len = len; + dc->ref = 0; + dc->state = D_PREP; + dc->error = 0; init_completion(&dc->wait); - list_add_tail(&dc->list, wait_list); + list_add_tail(&dc->list, pend_list); + atomic_inc(&dcc->discard_cmd_cnt); + dcc->undiscard_blks += len; return dc; } -/* This should be covered by global mutex, &sit_i->sentry_lock */ -void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node *parent, struct rb_node **p) { - struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); - struct discard_cmd *dc, *tmp; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; - list_for_each_entry_safe(dc, tmp, wait_list, list) { - struct bio *bio = dc->bio; - int err; + dc = __create_discard_cmd(sbi, bdev, lstart, start, len); - if (!completion_done(&dc->wait)) { - if ((dc->lstart <= blkaddr && - blkaddr < dc->lstart + dc->len) || - blkaddr == NULL_ADDR) - wait_for_completion_io(&dc->wait); - else - continue; - } + rb_link_node(&dc->rb_node, parent, p); + rb_insert_color(&dc->rb_node, &dcc->root); - err = bio->bi_error; - if (err == -EOPNOTSUPP) - err = 0; + return dc; +} - if (err) - f2fs_msg(sbi->sb, KERN_INFO, - "Issue discard failed, ret: %d", err); +static void __detach_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + if (dc->state == D_DONE) + atomic_dec(&dcc->issing_discard); - bio_put(bio); - list_del(&dc->list); - kmem_cache_free(discard_cmd_slab, dc); - } + list_del(&dc->list); + rb_erase(&dc->rb_node, &dcc->root); + dcc->undiscard_blks -= dc->len; + + kmem_cache_free(discard_cmd_slab, dc); + + atomic_dec(&dcc->discard_cmd_cnt); +} + +static void __remove_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + f2fs_bug_on(sbi, dc->ref); + + if (dc->error == -EOPNOTSUPP) + dc->error = 0; + + if (dc->error) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard(%u, %u, %u) failed, ret: %d", + dc->lstart, dc->start, dc->len, dc->error); + __detach_discard_cmd(dcc, dc); } static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - complete(&dc->wait); + dc->error = bio->bi_error; + dc->state = D_DONE; + complete_all(&dc->wait); + bio_put(bio); } /* copied from block/blk-lib.c in 4.10-rc1 */ @@ -739,12 +870,12 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, } if (bio) { - int ret = submit_bio_wait(0, bio); + int ret = submit_bio_wait(op, bio); bio_put(bio); if (ret) return ret; } - bio = f2fs_bio_alloc(0); + bio = f2fs_bio_alloc(1); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio_set_op_attrs(bio, op, 0); @@ -766,58 +897,471 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, return 0; } +void __check_sit_bitmap(struct f2fs_sb_info *sbi, + block_t start, block_t end) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct seg_entry *sentry; + unsigned int segno; + block_t blk = start; + unsigned long offset, size, max_blocks = sbi->blocks_per_seg; + unsigned long *map; + + while (blk < end) { + segno = GET_SEGNO(sbi, blk); + sentry = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blk); + + if (end < START_BLOCK(sbi, segno + 1)) + size = GET_BLKOFF_FROM_SEG0(sbi, end); + else + size = max_blocks; + map = 
(unsigned long *)(sentry->cur_valid_map); + offset = __find_rev_next_bit(map, size, offset); + f2fs_bug_on(sbi, offset != size); + blk = START_BLOCK(sbi, segno + 1); + } +#endif +} + /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, +static void __submit_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct bio *bio = NULL; + + if (dc->state != D_PREP) + return; + + trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len); + + dc->error = __blkdev_issue_discard(dc->bdev, + SECTOR_FROM_BLOCK(dc->start), + SECTOR_FROM_BLOCK(dc->len), + GFP_NOFS, 0, &bio); + if (!dc->error) { + /* should keep before submission to avoid D_DONE right away */ + dc->state = D_SUBMIT; + atomic_inc(&dcc->issued_discard); + atomic_inc(&dcc->issing_discard); + if (bio) { + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + submit_bio(REQ_SYNC, bio); + list_move_tail(&dc->list, &dcc->wait_list); + __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); + + f2fs_update_iostat(sbi, FS_DISCARD, 1); + } + } else { + __remove_discard_cmd(sbi, dc); + } +} + +static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node **insert_p, + struct rb_node *insert_parent) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node **p = &dcc->root.rb_node; + struct rb_node *parent = NULL; + struct discard_cmd *dc = NULL; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; + } + + p = __lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); +do_insert: + dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p); + if (!dc) + return NULL; + + return dc; +} + +static void __relocate_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]); +} + +static void __punch_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_info di = dc->di; + bool modified = false; + + if (dc->state == D_DONE || dc->len == 1) { + __remove_discard_cmd(sbi, dc); + return; + } + + dcc->undiscard_blks -= di.len; + + if (blkaddr > di.lstart) { + dc->len = blkaddr - dc->lstart; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + modified = true; + } + + if (blkaddr < di.lstart + di.len - 1) { + if (modified) { + __insert_discard_tree(sbi, dc->bdev, blkaddr + 1, + di.start + blkaddr + 1 - di.lstart, + di.lstart + di.len - 1 - blkaddr, + NULL, NULL); + } else { + dc->lstart++; + dc->len--; + dc->start++; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + } + } +} + +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct discard_cmd *dc; + struct discard_info di = {0}; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + block_t end = lstart + len; + + mutex_lock(&dcc->cmd_lock); + + dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + NULL, lstart, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (dc) 
+ prev_dc = dc; + + if (!prev_dc) { + di.lstart = lstart; + di.len = next_dc ? next_dc->lstart - lstart : len; + di.len = min(di.len, len); + di.start = start; + } + + while (1) { + struct rb_node *node; + bool merged = false; + struct discard_cmd *tdc = NULL; + + if (prev_dc) { + di.lstart = prev_dc->lstart + prev_dc->len; + if (di.lstart < lstart) + di.lstart = lstart; + if (di.lstart >= end) + break; + + if (!next_dc || next_dc->lstart > end) + di.len = end - di.lstart; + else + di.len = next_dc->lstart - di.lstart; + di.start = start + di.lstart - lstart; + } + + if (!di.len) + goto next; + + if (prev_dc && prev_dc->state == D_PREP && + prev_dc->bdev == bdev && + __is_discard_back_mergeable(&di, &prev_dc->di)) { + prev_dc->di.len += di.len; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, prev_dc); + di = prev_dc->di; + tdc = prev_dc; + merged = true; + } + + if (next_dc && next_dc->state == D_PREP && + next_dc->bdev == bdev && + __is_discard_front_mergeable(&di, &next_dc->di)) { + next_dc->di.lstart = di.lstart; + next_dc->di.len += di.len; + next_dc->di.start = di.start; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, next_dc); + if (tdc) + __remove_discard_cmd(sbi, tdc); + merged = true; + } + + if (!merged) { + __insert_discard_tree(sbi, bdev, di.lstart, di.start, + di.len, NULL, NULL); + } + next: + prev_dc = next_dc; + if (!prev_dc) + break; + + node = rb_next(&prev_dc->rb_node); + next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); + } + + mutex_unlock(&dcc->cmd_lock); +} + +static int __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { - struct bio *bio = NULL; block_t lblkstart = blkstart; - int err; - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + trace_f2fs_queue_discard(bdev, blkstart, blklen); if (sbi->s_ndevs) { int devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; } - err = __blkdev_issue_discard(bdev, - SECTOR_FROM_BLOCK(blkstart), - SECTOR_FROM_BLOCK(blklen), - GFP_NOFS, 0, &bio); - if (!err && bio) { - struct discard_cmd *dc = __add_discard_cmd(sbi, bio, - lblkstart, blklen); + __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); + return 0; +} - bio->bi_private = dc; - bio->bi_end_io = f2fs_submit_discard_endio; - submit_bio(REQ_SYNC, bio); +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + int iter = 0, issued = 0; + int i; + bool io_interrupted = false; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); + blk_start_plug(&plug); + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + + /* Hurry up to finish fstrim */ + if (dcc->pend_list_tag[i] & P_TRIM) { + __submit_discard_cmd(sbi, dc); + issued++; + + if (fatal_signal_pending(current)) + break; + continue; + } + + if (!issue_cond) { + __submit_discard_cmd(sbi, dc); + issued++; + continue; + } + + if (is_idle(sbi)) { + __submit_discard_cmd(sbi, dc); + issued++; + } else { + io_interrupted = true; + } + + if (++iter >= DISCARD_ISSUE_RATE) + goto out; + } + if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) + dcc->pend_list_tag[i] &= (~P_TRIM); } - return err; +out: + 
blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + if (!issued && io_interrupted) + issued = -1; + + return issued; +} + +static void __drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + __remove_discard_cmd(sbi, dc); + } + } + mutex_unlock(&dcc->cmd_lock); +} + +static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) + __remove_discard_cmd(sbi, dc); + mutex_unlock(&dcc->cmd_lock); +} + +static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = &(dcc->wait_list); + struct discard_cmd *dc, *tmp; + bool need_wait; + +next: + need_wait = false; + + mutex_lock(&dcc->cmd_lock); + list_for_each_entry_safe(dc, tmp, wait_list, list) { + if (!wait_cond || (dc->state == D_DONE && !dc->ref)) { + wait_for_completion_io(&dc->wait); + __remove_discard_cmd(sbi, dc); + } else { + dc->ref++; + need_wait = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + + if (need_wait) { + __wait_one_discard_bio(sbi, dc); + goto next; + } +} + +/* This should be covered by global mutex, &sit_i->sentry_lock */ +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; + bool need_wait = false; + + mutex_lock(&dcc->cmd_lock); + dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr); + if (dc) { + if (dc->state == D_PREP) { + __punch_discard_cmd(sbi, dc, blkaddr); + } else { + dc->ref++; + need_wait = true; + } + } + mutex_unlock(&dcc->cmd_lock); + + if (need_wait) + __wait_one_discard_bio(sbi, dc); +} + +void stop_discard_thread(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (dcc && dcc->f2fs_issue_discard) { + struct task_struct *discard_thread = dcc->f2fs_issue_discard; + + dcc->f2fs_issue_discard = NULL; + kthread_stop(discard_thread); + } +} + +/* This comes from f2fs_put_super and f2fs_trim_fs */ +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +{ + __issue_discard_cmd(sbi, false); + __drop_discard_cmd(sbi); + __wait_discard_cmd(sbi, false); +} + +static void mark_discard_range_all(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) + dcc->pend_list_tag[i] |= P_TRIM; + mutex_unlock(&dcc->cmd_lock); +} + +static int issue_discard_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + wait_queue_head_t *q = &dcc->discard_wait_queue; + unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + int issued; + + set_freezable(); + + do { + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); + if (try_to_freeze()) + continue; + if (kthread_should_stop()) + return 0; + + if (dcc->discard_wake) { + dcc->discard_wake = 0; + if (sbi->gc_thread && 
sbi->gc_thread->gc_urgent) + mark_discard_range_all(sbi); + } + + sb_start_intwrite(sbi->sb); + + issued = __issue_discard_cmd(sbi, true); + if (issued) { + __wait_discard_cmd(sbi, true); + wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + } else { + wait_ms = DEF_MAX_DISCARD_ISSUE_TIME; + } + + sb_end_intwrite(sbi->sb); + + } while (!kthread_should_stop()); + return 0; } #ifdef CONFIG_BLK_DEV_ZONED static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); - sector_t sector; + sector_t sector, nr_sects; + block_t lblkstart = blkstart; int devi = 0; if (sbi->s_ndevs) { devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; } - sector = SECTOR_FROM_BLOCK(blkstart); - - if (sector & (bdev_zone_size(bdev) - 1) || - nr_sects != bdev_zone_size(bdev)) { - f2fs_msg(sbi->sb, KERN_INFO, - "(%d) %s: Unaligned discard attempted (block %x + %x)", - devi, sbi->s_ndevs ? FDEV(devi).path: "", - blkstart, blklen); - return -EIO; - } /* * We need to know the type of the zone: for conventional zones, @@ -829,10 +1373,21 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) return 0; - return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: - trace_f2fs_issue_reset_zone(sbi->sb, blkstart); + sector = SECTOR_FROM_BLOCK(blkstart); + nr_sects = SECTOR_FROM_BLOCK(blklen); + + if (sector & (bdev_zone_sectors(bdev) - 1) || + nr_sects != bdev_zone_sectors(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ? 
FDEV(devi).path: "", + blkstart, blklen); + return -EIO; + } + trace_f2fs_issue_reset_zone(bdev, blkstart); return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); default: @@ -850,7 +1405,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return __queue_discard_cmd(sbi, bdev, blkstart, blklen); } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, @@ -893,31 +1448,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return err; } -static void __add_discard_entry(struct f2fs_sb_info *sbi, - struct cp_control *cpc, struct seg_entry *se, - unsigned int start, unsigned int end) -{ - struct list_head *head = &SM_I(sbi)->discard_entry_list; - struct discard_entry *new, *last; - - if (!list_empty(head)) { - last = list_last_entry(head, struct discard_entry, list); - if (START_BLOCK(sbi, cpc->trim_start) + start == - last->blkaddr + last->len) { - last->len += end - start; - goto done; - } - } - - new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); - INIT_LIST_HEAD(&new->list); - new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start; - new->len = end - start; - list_add_tail(&new->list, head); -done: - SM_I(sbi)->nr_discards += end - start; -} - static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, bool check_only) { @@ -929,7 +1459,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, unsigned long *discard_map = (unsigned long *)se->discard_map; unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; - bool force = (cpc->reason == CP_DISCARD); + bool force = (cpc->reason & CP_DISCARD); + struct discard_entry *de = NULL; + struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) @@ -937,7 +1469,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (!force) { if (!test_opt(sbi, DISCARD) || !se->valid_blocks || - SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards) + SM_I(sbi)->dcc_info->nr_discards >= + SM_I(sbi)->dcc_info->max_discards) return false; } @@ -946,7 +1479,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, dmap[i] = force ? 
~ckpt_map[i] & ~discard_map[i] : (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; - while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { + while (force || SM_I(sbi)->dcc_info->nr_discards <= + SM_I(sbi)->dcc_info->max_discards) { start = __find_rev_next_bit(dmap, max_blocks, end + 1); if (start >= max_blocks) break; @@ -959,14 +1493,24 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (check_only) return true; - __add_discard_entry(sbi, cpc, se, start, end); + if (!de) { + de = f2fs_kmem_cache_alloc(discard_entry_slab, + GFP_F2FS_ZERO); + de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); + list_add_tail(&de->list, head); + } + + for (i = start; i < end; i++) + __set_bit_le(i, (void *)de->discard_map); + + SM_I(sbi)->dcc_info->nr_discards += end - start; } return false; } void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -992,16 +1536,14 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->discard_entry_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *head = &dcc->entry_list; struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - struct blk_plug plug; unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; - bool force = (cpc->reason == CP_DISCARD); - - blk_start_plug(&plug); + bool force = (cpc->reason & CP_DISCARD); mutex_lock(&dirty_i->seglist_lock); @@ -1031,32 +1573,117 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) continue; } next: - secno = GET_SECNO(sbi, start); - start_segno = secno * sbi->segs_per_sec; + secno = GET_SEC_FROM_SEG(sbi, start); + start_segno = GET_SEG_FROM_SEC(sbi, secno); if (!IS_CURSEC(sbi, secno) && - !get_valid_blocks(sbi, start, sbi->segs_per_sec)) + !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), sbi->segs_per_sec << sbi->log_blocks_per_seg); start = start_segno + sbi->segs_per_sec; if (start < end) goto next; + else + end = start - 1; } mutex_unlock(&dirty_i->seglist_lock); /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { - if (force && entry->len < cpc->trim_minlen) - goto skip; - f2fs_issue_discard(sbi, entry->blkaddr, entry->len); - cpc->trimmed += entry->len; + unsigned int cur_pos = 0, next_pos, len, total_len = 0; + bool is_valid = test_bit_le(0, entry->discard_map); + +find_next: + if (is_valid) { + next_pos = find_next_zero_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + len = next_pos - cur_pos; + + if (f2fs_sb_mounted_blkzoned(sbi->sb) || + (force && len < cpc->trim_minlen)) + goto skip; + + f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, + len); + cpc->trimmed += len; + total_len += len; + } else { + next_pos = find_next_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + } skip: + cur_pos = next_pos; + is_valid = !is_valid; + + if (cur_pos < sbi->blocks_per_seg) + goto find_next; + list_del(&entry->list); - SM_I(sbi)->nr_discards -= entry->len; + dcc->nr_discards -= total_len; kmem_cache_free(discard_entry_slab, entry); } - blk_finish_plug(&plug); + wake_up_discard_thread(sbi, false); +} + +static int 
create_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct discard_cmd_control *dcc; + int err = 0, i; + + if (SM_I(sbi)->dcc_info) { + dcc = SM_I(sbi)->dcc_info; + goto init_thread; + } + + dcc = kzalloc(sizeof(struct discard_cmd_control), GFP_KERNEL); + if (!dcc) + return -ENOMEM; + + dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + INIT_LIST_HEAD(&dcc->entry_list); + for (i = 0; i < MAX_PLIST_NUM; i++) { + INIT_LIST_HEAD(&dcc->pend_list[i]); + if (i >= dcc->discard_granularity - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + } + INIT_LIST_HEAD(&dcc->wait_list); + mutex_init(&dcc->cmd_lock); + atomic_set(&dcc->issued_discard, 0); + atomic_set(&dcc->issing_discard, 0); + atomic_set(&dcc->discard_cmd_cnt, 0); + dcc->nr_discards = 0; + dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; + dcc->undiscard_blks = 0; + dcc->root = RB_ROOT; + + init_waitqueue_head(&dcc->discard_wait_queue); + SM_I(sbi)->dcc_info = dcc; +init_thread: + dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, + "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(dcc->f2fs_issue_discard)) { + err = PTR_ERR(dcc->f2fs_issue_discard); + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; + return err; + } + + return err; +} + +static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (!dcc) + return; + + stop_discard_thread(sbi); + + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -1085,6 +1712,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) struct seg_entry *se; unsigned int segno, offset; long int new_vblocks; + bool exist; +#ifdef CONFIG_F2FS_CHECK_FS + bool mir_exist; +#endif segno = GET_SEGNO(sbi, blkaddr); @@ -1101,32 +1732,54 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) { + exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - if (f2fs_test_and_set_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else + mir_exist = f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when setting bitmap, blk:%u, old bit:%d", + blkaddr, exist); f2fs_bug_on(sbi, 1); -#endif } +#endif + if (unlikely(exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly set, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks--; + del = 0; + } + if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; - } else { - if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { -#ifdef CONFIG_F2FS_CHECK_FS - if (!f2fs_test_and_clear_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else - f2fs_bug_on(sbi, 1); -#endif + + /* don't overwrite by SSR to keep node chain */ + if (se->type == CURSEG_WARM_NODE) { + if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) + se->ckpt_valid_blocks++; } + } else { + exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + mir_exist = f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when clearing bitmap, blk:%u, old bit:%d", + blkaddr, 
exist); + f2fs_bug_on(sbi, 1); + } +#endif + if (unlikely(!exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly cleared, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks++; + del = 0; + } + if (f2fs_discard_en(sbi) && f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; @@ -1312,8 +1965,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno, secno, zoneno; unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; - unsigned int hint = *newseg / sbi->segs_per_sec; - unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); + unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); + unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); unsigned int left_start = hint; bool init = true; int go_left = 0; @@ -1323,8 +1976,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, - (hint + 1) * sbi->segs_per_sec, *newseg + 1); - if (segno < (hint + 1) * sbi->segs_per_sec) + GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); + if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) goto got_it; } find_other_zone: @@ -1355,8 +2008,8 @@ find_other_zone: secno = left_start; skip_left: hint = secno; - segno = secno * sbi->segs_per_sec; - zoneno = secno / sbi->secs_per_zone; + segno = GET_SEG_FROM_SEC(sbi, secno); + zoneno = GET_ZONE_FROM_SEC(sbi, secno); /* give up on finding another zone */ if (!init) @@ -1400,7 +2053,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) struct summary_footer *sum_footer; curseg->segno = curseg->next_segno; - curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno); + curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; curseg->next_segno = NULL_SEGNO; @@ -1413,6 +2066,20 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) __set_sit_entry_type(sbi, type, curseg->segno, modified); } +static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) +{ + /* if segs_per_sec is large than 1, we need to keep original policy. */ + if (sbi->segs_per_sec != 1) + return CURSEG_I(sbi, type)->segno; + + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + return 0; + + if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) + return SIT_I(sbi)->last_victim[ALLOC_NEXT]; + return CURSEG_I(sbi, type)->segno; +} + /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. 
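/*
 * Illustrative sketch, not from the patch itself: the update_sit_entry()
 * hunks above replace the old nested bug checks with a pattern that applies
 * the same test-and-set (or test-and-clear) to both the live validity bitmap
 * and, under CONFIG_F2FS_CHECK_FS, a mirror copy, then treats any divergence
 * between the two results as corruption. A minimal user-space sketch of that
 * pattern follows; the byte-per-bit maps and all helper names here are
 * hypothetical stand-ins for f2fs_test_and_set_bit() and friends.
 */
#include <stdbool.h>
#include <stdio.h>

/* stand-in for f2fs_test_and_set_bit(): returns the old value, sets the bit */
static bool test_and_set(bool *map, unsigned int off)
{
	bool old = map[off];

	map[off] = true;
	return old;
}

static void mark_block_valid(bool *cur_map, bool *mir_map, unsigned int off)
{
	bool exist = test_and_set(cur_map, off);
	bool mir_exist = test_and_set(mir_map, off);

	/* live and mirror maps must agree; divergence means memory corruption */
	if (exist != mir_exist)
		fprintf(stderr, "inconsistent bitmaps at blk %u\n", off);

	/* setting an already-set bit means the block was allocated twice */
	if (exist)
		fprintf(stderr, "bitmap was wrongly set at blk %u\n", off);
}

int main(void)
{
	bool cur[8] = { false }, mir[8] = { false };

	mark_block_valid(cur, mir, 3);	/* first set: silent */
	mark_block_valid(cur, mir, 3);	/* double set: reports the error */
	return 0;
}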
@@ -1431,6 +2098,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) if (test_opt(sbi, NOHEAP)) dir = ALLOC_RIGHT; + segno = __get_next_segno(sbi, type); get_new_segment(sbi, &segno, new_sec, dir); curseg->next_segno = segno; reset_curseg(sbi, type, 1); @@ -1473,7 +2141,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1494,28 +2162,53 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) curseg->alloc_type = SSR; __next_free_blkoff(sbi, curseg, 0); - if (reuse) { - sum_page = get_sum_page(sbi, new_segno); - sum_node = (struct f2fs_summary_block *)page_address(sum_page); - memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); - f2fs_put_page(sum_page, 1); - } + sum_page = get_sum_page(sbi, new_segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); + f2fs_put_page(sum_page, 1); } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + unsigned segno = NULL_SEGNO; + int i, cnt; + bool reversed = false; - if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 0)) - return v_ops->get_victim(sbi, - &(curseg)->next_segno, BG_GC, type, SSR); + /* need_SSR() already forces to do this */ + if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { + curseg->next_segno = segno; + return 1; + } - /* For data segments, let's do SSR more intensively */ - for (; type >= CURSEG_HOT_DATA; type--) - if (v_ops->get_victim(sbi, &(curseg)->next_segno, - BG_GC, type, SSR)) + /* For node segments, let's do SSR more intensively */ + if (IS_NODESEG(type)) { + if (type >= CURSEG_WARM_NODE) { + reversed = true; + i = CURSEG_COLD_NODE; + } else { + i = CURSEG_HOT_NODE; + } + cnt = NR_CURSEG_NODE_TYPE; + } else { + if (type >= CURSEG_WARM_DATA) { + reversed = true; + i = CURSEG_COLD_DATA; + } else { + i = CURSEG_HOT_DATA; + } + cnt = NR_CURSEG_DATA_TYPE; + } + + for (; cnt-- > 0; reversed ? 
i-- : i++) { + if (i == type) + continue; + if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) { + curseg->next_segno = segno; return 1; + } + } return 0; } @@ -1530,12 +2223,13 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, if (force) new_curseg(sbi, type, true); - else if (type == CURSEG_WARM_NODE) + else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, false); @@ -1563,14 +2257,19 @@ static const struct segment_allocation default_salloc_ops = { bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { __u64 trim_start = cpc->trim_start; + bool has_candidate = false; mutex_lock(&SIT_I(sbi)->sentry_lock); - for (; trim_start <= cpc->trim_end; trim_start++) - if (add_discard_addrs(sbi, cpc, true)) + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { + if (add_discard_addrs(sbi, cpc, true)) { + has_candidate = true; break; + } + } mutex_unlock(&SIT_I(sbi)->sentry_lock); - return trim_start <= cpc->trim_end; + cpc->trim_start = trim_start; + return has_candidate; } int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) @@ -1623,6 +2322,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) schedule(); } + /* It's time to issue all the filed discards */ + mark_discard_range_all(sbi); + f2fs_wait_discard_bios(sbi); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; @@ -1636,68 +2338,80 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } -static int __get_segment_type_2(struct page *page, enum page_type p_type) +static int __get_segment_type_2(struct f2fs_io_info *fio) { - if (p_type == DATA) + if (fio->type == DATA) return CURSEG_HOT_DATA; else return CURSEG_HOT_NODE; } -static int __get_segment_type_4(struct page *page, enum page_type p_type) +static int __get_segment_type_4(struct f2fs_io_info *fio) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; if (S_ISDIR(inode->i_mode)) return CURSEG_HOT_DATA; else return CURSEG_COLD_DATA; } else { - if (IS_DNODE(page) && is_cold_node(page)) + if (IS_DNODE(fio->page) && is_cold_node(fio->page)) return CURSEG_WARM_NODE; else return CURSEG_COLD_NODE; } } -static int __get_segment_type_6(struct page *page, enum page_type p_type) +static int __get_segment_type_6(struct f2fs_io_info *fio) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; - if (S_ISDIR(inode->i_mode)) - return CURSEG_HOT_DATA; - else if (is_cold_data(page) || file_is_cold(inode)) + if (is_cold_data(fio->page) || file_is_cold(inode)) return CURSEG_COLD_DATA; - else - return CURSEG_WARM_DATA; + if (is_inode_flag_set(inode, FI_HOT_DATA)) + return CURSEG_HOT_DATA; + return CURSEG_WARM_DATA; } else { - if (IS_DNODE(page)) - return is_cold_node(page) ? CURSEG_WARM_NODE : + if (IS_DNODE(fio->page)) + return is_cold_node(fio->page) ? 
CURSEG_WARM_NODE : CURSEG_HOT_NODE; - else - return CURSEG_COLD_NODE; + return CURSEG_COLD_NODE; } } -static int __get_segment_type(struct page *page, enum page_type p_type) +static int __get_segment_type(struct f2fs_io_info *fio) { - switch (F2FS_P_SB(page)->active_logs) { + int type = 0; + + switch (fio->sbi->active_logs) { case 2: - return __get_segment_type_2(page, p_type); + type = __get_segment_type_2(fio); + break; case 4: - return __get_segment_type_4(page, p_type); + type = __get_segment_type_4(fio); + break; + case 6: + type = __get_segment_type_6(fio); + break; + default: + f2fs_bug_on(fio->sbi, true); } - /* NR_CURSEG_TYPE(6) logs by default */ - f2fs_bug_on(F2FS_P_SB(page), - F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE); - return __get_segment_type_6(page, p_type); + + if (IS_HOT(type)) + fio->temp = HOT; + else if (IS_WARM(type)) + fio->temp = WARM; + else + fio->temp = COLD; + return type; } void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, int type) + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1723,42 +2437,52 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (!__has_curseg_space(sbi, type)) sit_i->s_ops->allocate_segment(sbi, type, false); /* - * SIT information should be updated before segment allocation, - * since SSR needs latest valid block information. + * SIT information should be updated after segment allocation, + * since we need to keep dirty segments precisely under SSR. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); mutex_unlock(&sit_i->sentry_lock); - if (page && IS_NODESEG(type)) + if (page && IS_NODESEG(type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + f2fs_inode_chksum_set(sbi, page); + } + + if (add_list) { + struct f2fs_bio_info *io; + + INIT_LIST_HEAD(&fio->list); + fio->in_list = true; + io = sbi->write_io[fio->type] + fio->temp; + spin_lock(&io->io_lock); + list_add_tail(&fio->list, &io->io_list); + spin_unlock(&io->io_lock); + } + mutex_unlock(&curseg->curseg_mutex); } static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { - int type = __get_segment_type(fio->page, fio->type); + int type = __get_segment_type(fio); int err; - if (fio->type == NODE || fio->type == DATA) - mutex_lock(&fio->sbi->wio_mutex[fio->type]); reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, - &fio->new_blkaddr, sum, type); + &fio->new_blkaddr, sum, type, fio, true); /* writeout dirty page into bdev */ - err = f2fs_submit_page_mbio(fio); + err = f2fs_submit_page_write(fio); if (err == -EAGAIN) { fio->old_blkaddr = fio->new_blkaddr; goto reallocate; } - - if (fio->type == NODE || fio->type == DATA) - mutex_unlock(&fio->sbi->wio_mutex[fio->type]); } -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type) { struct f2fs_io_info fio = { .sbi = sbi, @@ -1769,13 +2493,16 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .new_blkaddr = page->index, .page = page, .encrypted_page = NULL, + .in_list = false, }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) fio.op_flags &= ~REQ_META; set_page_writeback(page); - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_write(&fio); + + f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); } void 
write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -1784,6 +2511,8 @@ void write_node_page(unsigned int nid, struct f2fs_io_info *fio) set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) @@ -1797,13 +2526,22 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); + + f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); } -void rewrite_data_page(struct f2fs_io_info *fio) +int rewrite_data_page(struct f2fs_io_info *fio) { + int err; + fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); - f2fs_submit_page_mbio(fio); + + err = f2fs_submit_page_bio(fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + + return err; } void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -1845,7 +2583,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -1864,7 +2602,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; } @@ -1894,7 +2632,8 @@ void f2fs_wait_on_page_writeback(struct page *page, if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE); + f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, type); if (ordered) wait_on_page_writeback(page); else @@ -1902,8 +2641,7 @@ void f2fs_wait_on_page_writeback(struct page *page, } } -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, - block_t blkaddr) +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *cpage; @@ -2052,6 +2790,8 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { + struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal; + struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal; int type = CURSEG_HOT_DATA; int err; @@ -2078,6 +2818,11 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) return err; } + /* sanity check for summary blocks */ + if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || + sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) + return -EINVAL; + return 0; } @@ -2367,7 +3112,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) se = get_seg_entry(sbi, segno); /* add discard candidates */ - if (cpc->reason != CP_DISCARD) { + if (!(cpc->reason & CP_DISCARD)) { cpc->trim_start = segno; add_discard_addrs(sbi, cpc, false); } @@ -2403,7 +3148,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, !list_empty(head)); f2fs_bug_on(sbi, sit_i->dirty_sentries); out: - if (cpc->reason == CP_DISCARD) { + if (cpc->reason & CP_DISCARD) { __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) @@ -2431,13 +3176,13 @@ static int build_sit_info(struct 
f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) * + sit_i->sentries = kvzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -2470,7 +3215,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) * + sit_i->sec_entries = kvzalloc(MAIN_SECS(sbi) * sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; @@ -2521,12 +3266,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = kvmalloc(bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = kvmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -2602,10 +3347,17 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) /* build discard map only one time */ if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -2629,10 +3381,15 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) seg_info_from_raw_sit(se, &sit); if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -2676,7 +3433,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) if (segno >= MAIN_SEGS(sbi)) break; offset = segno + 1; - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) continue; if (valid_blocks > sbi->blocks_per_seg) { @@ -2694,7 +3451,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -2716,7 +3473,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->dirty_segmap[i] = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } @@ -2782,22 +3539,22 @@ int 
build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - - INIT_LIST_HEAD(&sm_info->discard_entry_list); - INIT_LIST_HEAD(&sm_info->discard_cmd_list); - sm_info->nr_discards = 0; - sm_info->max_discards = 0; + sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; INIT_LIST_HEAD(&sm_info->sit_entry_set); - if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { + if (!f2fs_readonly(sbi->sb)) { err = create_flush_cmd_control(sbi); if (err) return err; } + err = create_discard_cmd_control(sbi); + if (err) + return err; + err = build_sit_info(sbi); if (err) return err; @@ -2919,6 +3676,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; destroy_flush_cmd_control(sbi, true); + destroy_discard_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5cb5755c75d9..ffa11274b0ce 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -21,78 +21,88 @@ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ /* L: Logical segment # in volume, R: Relative segment # in main area */ -#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) -#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) +#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) +#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) -#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA) -#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE) +#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) + +#define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) +#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) +#define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA) #define IS_CURSEG(sbi, seg) \ - ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) #define IS_CURSEC(sbi, secno) \ - ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - sbi->segs_per_sec)) \ + (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == 
CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ + (sbi)->segs_per_sec)) \ #define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) #define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) -#define MAIN_SECS(sbi) (sbi->total_sections) +#define MAIN_SECS(sbi) ((sbi)->total_sections) #define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) -#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg) +#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) -#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \ - sbi->log_blocks_per_seg)) +#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \ + (sbi)->log_blocks_per_seg)) #define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ - (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) + (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg)) #define NEXT_FREE_BLKADDR(sbi, curseg) \ - (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) + (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff) #define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg) #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ - (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ + ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) -#define GET_SECNO(sbi, segno) \ - ((segno) / sbi->segs_per_sec) -#define GET_ZONENO_FROM_SEGNO(sbi, segno) \ - ((segno / sbi->segs_per_sec) / sbi->secs_per_zone) +#define BLKS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) +#define GET_SEC_FROM_SEG(sbi, segno) \ + ((segno) / (sbi)->segs_per_sec) +#define GET_SEG_FROM_SEC(sbi, secno) \ + ((secno) * (sbi)->segs_per_sec) +#define GET_ZONE_FROM_SEC(sbi, secno) \ + ((secno) / (sbi)->secs_per_zone) +#define GET_ZONE_FROM_SEG(sbi, segno) \ + GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) #define GET_SUM_BLOCK(sbi, segno) \ - ((sbi->sm_info->ssa_blkaddr) + segno) + ((sbi)->sm_info->ssa_blkaddr + (segno)) #define GET_SUM_TYPE(footer) ((footer)->entry_type) -#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type) +#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) #define SIT_ENTRY_OFFSET(sit_i, segno) \ - (segno % sit_i->sents_per_block) + ((segno) % (sit_i)->sents_per_block) #define SIT_BLOCK_OFFSET(segno) \ - (segno / SIT_ENTRY_PER_BLOCK) + ((segno) / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(segno) \ (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) #define SIT_BLK_CNT(sbi) \ @@ -103,7 +113,7 @@ #define SECTOR_FROM_BLOCK(blk_addr) \ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ - (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) + ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) /* * indicate a block allocation direction: RIGHT and LEFT. 
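/*
 * Illustrative sketch, not from the patch itself: the segment.h hunks above
 * wrap every macro parameter in parentheses. Without them, an argument that
 * is itself an expression binds to the operators inside the macro body and
 * silently computes the wrong value. A minimal, self-contained demonstration
 * of the failure mode; the macro names below are hypothetical:
 */
#include <assert.h>

#define SEC_FROM_SEG_UNSAFE(segno, segs_per_sec)	(segno / segs_per_sec)
#define SEC_FROM_SEG_SAFE(segno, segs_per_sec)	((segno) / (segs_per_sec))

int main(void)
{
	int base = 8, off = 4;

	/* expands to (8 + 4 / 2) == 10, not the intended section number */
	assert(SEC_FROM_SEG_UNSAFE(base + off, 2) == 10);
	/* expands to ((8 + 4) / (2)) == 6 */
	assert(SEC_FROM_SEG_SAFE(base + off, 2) == 6);
	return 0;
}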
@@ -132,7 +142,10 @@ enum { */ enum { GC_CB = 0, - GC_GREEDY + GC_GREEDY, + ALLOC_NEXT, + FLUSH_DEVICE, + MAX_GC_POLICY, }; /* @@ -227,6 +240,8 @@ struct sit_info { unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. modification time */ + + unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; struct free_segmap_info { @@ -303,17 +318,17 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - return &sit_i->sec_entries[GET_SECNO(sbi, segno)]; + return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)]; } static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, - unsigned int segno, int section) + unsigned int segno, bool use_section) { /* * In order to get # of valid blocks in a section instantly from many * segments, f2fs manages two counting structures separately. */ - if (section > 1) + if (use_section && sbi->segs_per_sec > 1) return get_sec_entry(sbi, segno)->valid_blocks; else return get_seg_entry(sbi, segno)->valid_blocks; @@ -358,8 +373,8 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -379,7 +394,8 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + set_bit(segno, free_i->free_segmap); free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) @@ -390,8 +406,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -412,7 +428,8 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + spin_lock(&free_i->segmap_lock); if (!test_and_set_bit(segno, free_i->free_segmap)) { free_i->free_segments--; @@ -475,27 +492,9 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) return SM_I(sbi)->ovp_segments; } -static inline int overprovision_sections(struct f2fs_sb_info *sbi) -{ - return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec; -} - static inline int reserved_sections(struct f2fs_sb_info *sbi) { - return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec; -} - -static inline bool need_SSR(struct f2fs_sb_info *sbi) -{ - int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); - int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); - - if (test_opt(sbi, LFS)) - return false; - - return free_sections(sbi) <= (node_secs + 2 * 
dent_secs + imeta_secs + - reserved_sections(sbi) + 1); + return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, @@ -540,6 +539,7 @@ static inline int utilization(struct f2fs_sb_info *sbi) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 +#define DEF_MIN_HOT_BLOCKS 16 enum { F2FS_IPU_FORCE, @@ -547,20 +547,22 @@ enum { F2FS_IPU_UTIL, F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC, + F2FS_IPU_ASYNC, }; -static inline bool need_inplace_update(struct inode *inode) +static inline bool need_inplace_update_policy(struct inode *inode, + struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int policy = SM_I(sbi)->ipu_policy; - /* IPU can be done only for the user data */ - if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) - return false; - if (test_opt(sbi, LFS)) return false; + /* if this is cold file, we should overwrite to avoid fragmentation */ + if (file_is_cold(inode)) + return true; + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) @@ -572,6 +574,15 @@ static inline bool need_inplace_update(struct inode *inode) utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; + /* + * IPU for rewrite async pages + */ + if (policy & (0x1 << F2FS_IPU_ASYNC) && + fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC) && + !f2fs_encrypted_inode(inode)) + return true; + /* this is only set during fdatasync */ if (policy & (0x1 << F2FS_IPU_FSYNC) && is_inode_flag_set(inode, FI_NEED_IPU)) @@ -716,6 +727,15 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) - (base + 1) + type; } +static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, + unsigned int secno) +{ + if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >= + sbi->fggc_threshold) + return true; + return false; +} + static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) @@ -727,8 +747,8 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. * By default, 512 pages for directory data, - * 512 pages (2MB) * 3 for three types of nodes, and - * max_bio_blocks for meta are set. + * 512 pages (2MB) * 8 for nodes, and + * 256 pages * 8 for meta are set. 
*/ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { @@ -764,3 +784,28 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, wbc->nr_to_write = desired; return desired - nr_to_write; } + +static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + bool wakeup = false; + int i; + + if (force) + goto wake_up; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + if (!list_empty(&dcc->pend_list[i])) { + wakeup = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + if (!wakeup) + return; +wake_up: + dcc->discard_wake = 1; + wake_up_interruptible_all(&dcc->discard_wait_queue); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 84d5686c4aa4..315e59ad1483 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -22,8 +22,10 @@ #include #include #include +#include #include #include +#include #include "f2fs.h" #include "node.h" @@ -35,9 +37,7 @@ #define CREATE_TRACE_POINTS #include -static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; -static struct kset *f2fs_kset; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -49,6 +49,7 @@ char *fault_name[FAULT_MAX] = { [FAULT_BLOCK] = "no more block", [FAULT_DIR_DEPTH] = "too big dir depth", [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_TRUNCATE] = "truncate fail", [FAULT_IO] = "IO error", [FAULT_CHECKPOINT] = "checkpoint error", }; @@ -82,6 +83,7 @@ enum { Opt_discard, Opt_nodiscard, Opt_noheap, + Opt_heap, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, @@ -89,6 +91,7 @@ enum { Opt_active_logs, Opt_disable_ext_identify, Opt_inline_xattr, + Opt_noinline_xattr, Opt_inline_data, Opt_inline_dentry, Opt_noinline_dentry, @@ -105,6 +108,20 @@ enum { Opt_fault_injection, Opt_lazytime, Opt_nolazytime, + Opt_quota, + Opt_noquota, + Opt_usrquota, + Opt_grpquota, + Opt_prjquota, + Opt_usrjquota, + Opt_grpjquota, + Opt_prjjquota, + Opt_offusrjquota, + Opt_offgrpjquota, + Opt_offprjjquota, + Opt_jqfmt_vfsold, + Opt_jqfmt_vfsv0, + Opt_jqfmt_vfsv1, Opt_err, }; @@ -115,6 +132,7 @@ static match_table_t f2fs_tokens = { {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, + {Opt_heap, "heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, @@ -122,6 +140,7 @@ static match_table_t f2fs_tokens = { {Opt_active_logs, "active_logs=%u"}, {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, + {Opt_noinline_xattr, "noinline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, {Opt_noinline_dentry, "noinline_dentry"}, @@ -138,207 +157,23 @@ static match_table_t f2fs_tokens = { {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, + {Opt_prjquota, "prjquota"}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_prjjquota, "prjjquota=%s"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_offprjjquota, "prjjquota="}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_err, NULL}, }; -/* Sysfs support for f2fs */ -enum { - GC_THREAD, /* struct f2fs_gc_thread */ - SM_INFO, /* struct f2fs_sm_info */ - NM_INFO, /* struct f2fs_nm_info */ - F2FS_SBI, /* struct 
f2fs_sb_info */ -#ifdef CONFIG_F2FS_FAULT_INJECTION - FAULT_INFO_RATE, /* struct f2fs_fault_info */ - FAULT_INFO_TYPE, /* struct f2fs_fault_info */ -#endif -}; - -struct f2fs_attr { - struct attribute attr; - ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); - ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, - const char *, size_t); - int struct_type; - int offset; -}; - -static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) -{ - if (struct_type == GC_THREAD) - return (unsigned char *)sbi->gc_thread; - else if (struct_type == SM_INFO) - return (unsigned char *)SM_I(sbi); - else if (struct_type == NM_INFO) - return (unsigned char *)NM_I(sbi); - else if (struct_type == F2FS_SBI) - return (unsigned char *)sbi; -#ifdef CONFIG_F2FS_FAULT_INJECTION - else if (struct_type == FAULT_INFO_RATE || - struct_type == FAULT_INFO_TYPE) - return (unsigned char *)&sbi->fault_info; -#endif - return NULL; -} - -static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) -{ - struct super_block *sb = sbi->sb; - - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); - - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)(sbi->kbytes_written + - BD_PART_WRITTEN(sbi))); -} - -static ssize_t f2fs_sbi_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) -{ - unsigned char *ptr = NULL; - unsigned int *ui; - - ptr = __struct_ptr(sbi, a->struct_type); - if (!ptr) - return -EINVAL; - - ui = (unsigned int *)(ptr + a->offset); - - return snprintf(buf, PAGE_SIZE, "%u\n", *ui); -} - -static ssize_t f2fs_sbi_store(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned char *ptr; - unsigned long t; - unsigned int *ui; - ssize_t ret; - - ptr = __struct_ptr(sbi, a->struct_type); - if (!ptr) - return -EINVAL; - - ui = (unsigned int *)(ptr + a->offset); - - ret = kstrtoul(skip_spaces(buf), 0, &t); - if (ret < 0) - return ret; -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) - return -EINVAL; -#endif - *ui = t; - return count; -} - -static ssize_t f2fs_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); - - return a->show ? a->show(a, sbi, buf) : 0; -} - -static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); - - return a->store ? 
a->store(a, sbi, buf, len) : 0; -} - -static void f2fs_sb_release(struct kobject *kobj) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - complete(&sbi->s_kobj_unregister); -} - -#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ -static struct f2fs_attr f2fs_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ - .struct_type = _struct_type, \ - .offset = _offset \ -} - -#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ - F2FS_ATTR_OFFSET(struct_type, name, 0644, \ - f2fs_sbi_show, f2fs_sbi_store, \ - offsetof(struct struct_name, elname)) - -#define F2FS_GENERAL_RO_ATTR(name) \ -static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) - -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); -#ifdef CONFIG_F2FS_FAULT_INJECTION -F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); -F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); -#endif -F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); - -#define ATTR_LIST(name) (&f2fs_attr_##name.attr) -static struct attribute *f2fs_attrs[] = { - ATTR_LIST(gc_min_sleep_time), - ATTR_LIST(gc_max_sleep_time), - ATTR_LIST(gc_no_gc_sleep_time), - ATTR_LIST(gc_idle), - ATTR_LIST(reclaim_segments), - ATTR_LIST(max_small_discards), - ATTR_LIST(batched_trim_sections), - ATTR_LIST(ipu_policy), - ATTR_LIST(min_ipu_util), - ATTR_LIST(min_fsync_blocks), - ATTR_LIST(max_victim_search), - ATTR_LIST(dir_level), - ATTR_LIST(ram_thresh), - ATTR_LIST(ra_nid_pages), - ATTR_LIST(dirty_nats_ratio), - ATTR_LIST(cp_interval), - ATTR_LIST(idle_interval), -#ifdef CONFIG_F2FS_FAULT_INJECTION - ATTR_LIST(inject_rate), - ATTR_LIST(inject_type), -#endif - ATTR_LIST(lifetime_write_kbytes), - NULL, -}; - -static const struct sysfs_ops f2fs_attr_ops = { - .show = f2fs_attr_show, - .store = f2fs_attr_store, -}; - -static struct kobj_type f2fs_ktype = { - .default_attrs = f2fs_attrs, - .sysfs_ops = &f2fs_attr_ops, - .release = f2fs_sb_release, -}; - void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -347,7 +182,7 @@ void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) 
va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + printk_ratelimited("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); va_end(args); } @@ -358,6 +193,104 @@ static void init_once(void *foo) inode_init_once(&fi->vfs_inode); } +#ifdef CONFIG_QUOTA +static const char * const quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) +static int f2fs_set_qf_name(struct super_block *sb, int qtype, + substring_t *args) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + char *qname; + int ret = -EINVAL; + + if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return -EINVAL; + } + qname = match_strdup(args); + if (!qname) { + f2fs_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return -EINVAL; + } + if (sbi->s_qf_names[qtype]) { + if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + ret = 0; + else + f2fs_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + goto errout; + } + if (strchr(qname, '/')) { + f2fs_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + goto errout; + } + sbi->s_qf_names[qtype] = qname; + set_opt(sbi, QUOTA); + return 0; +errout: + kfree(qname); + return ret; +} + +static int f2fs_clear_qf_name(struct super_block *sb, int qtype) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return -EINVAL; + } + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; +} + +static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) +{ + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, "Project quota feature not enabled. 
" + "Cannot enable project quota enforcement."); + return -1; + } + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA] || + sbi->s_qf_names[PRJQUOTA]) { + if (test_opt(sbi, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sbi, USRQUOTA); + + if (test_opt(sbi, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sbi, GRPQUOTA); + + if (test_opt(sbi, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA]) + clear_opt(sbi, PRJQUOTA); + + if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || + test_opt(sbi, PRJQUOTA)) { + f2fs_msg(sbi->sb, KERN_ERR, "old and new quota " + "format mixing"); + return -1; + } + + if (!sbi->s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format " + "not specified"); + return -1; + } + } + return 0; +} +#endif + static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -365,6 +298,9 @@ static int parse_options(struct super_block *sb, char *options) substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; +#ifdef CONFIG_QUOTA + int ret; +#endif if (!options) return 0; @@ -431,6 +367,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noheap: set_opt(sbi, NOHEAP); break; + case Opt_heap: + clear_opt(sbi, NOHEAP); + break; #ifdef CONFIG_F2FS_FS_XATTR case Opt_user_xattr: set_opt(sbi, XATTR_USER); @@ -441,6 +380,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_xattr: set_opt(sbi, INLINE_XATTR); break; + case Opt_noinline_xattr: + clear_opt(sbi, INLINE_XATTR); + break; #else case Opt_user_xattr: f2fs_msg(sb, KERN_INFO, @@ -454,6 +396,10 @@ static int parse_options(struct super_block *sb, char *options) f2fs_msg(sb, KERN_INFO, "inline_xattr options not supported"); break; + case Opt_noinline_xattr: + f2fs_msg(sb, KERN_INFO, + "noinline_xattr options not supported"); + break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL case Opt_acl: @@ -553,6 +499,7 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; #ifdef CONFIG_F2FS_FAULT_INJECTION f2fs_build_fault_attr(sbi, arg); + set_opt(sbi, FAULT_INJECTION); #else f2fs_msg(sb, KERN_INFO, "FAULT_INJECTION was not selected"); @@ -564,6 +511,81 @@ static int parse_options(struct super_block *sb, char *options) case Opt_nolazytime: sb->s_flags &= ~MS_LAZYTIME; break; +#ifdef CONFIG_QUOTA + case Opt_quota: + case Opt_usrquota: + set_opt(sbi, USRQUOTA); + break; + case Opt_grpquota: + set_opt(sbi, GRPQUOTA); + break; + case Opt_prjquota: + set_opt(sbi, PRJQUOTA); + break; + case Opt_usrjquota: + ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_grpjquota: + ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_prjjquota: + ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_offusrjquota: + ret = f2fs_clear_qf_name(sb, USRQUOTA); + if (ret) + return ret; + break; + case Opt_offgrpjquota: + ret = f2fs_clear_qf_name(sb, GRPQUOTA); + if (ret) + return ret; + break; + case Opt_offprjjquota: + ret = f2fs_clear_qf_name(sb, PRJQUOTA); + if (ret) + return ret; + break; + case Opt_jqfmt_vfsold: + sbi->s_jquota_fmt = QFMT_VFS_OLD; + break; + case Opt_jqfmt_vfsv0: + sbi->s_jquota_fmt = QFMT_VFS_V0; + break; + case Opt_jqfmt_vfsv1: + sbi->s_jquota_fmt = QFMT_VFS_V1; + break; + case Opt_noquota: + clear_opt(sbi, QUOTA); + clear_opt(sbi, USRQUOTA); + clear_opt(sbi, GRPQUOTA); + clear_opt(sbi, PRJQUOTA); + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: 
+ case Opt_prjquota: + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_prjjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_offprjjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: + case Opt_noquota: + f2fs_msg(sb, KERN_INFO, + "quota operations not supported"); + break; +#endif default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -571,6 +593,10 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } +#ifdef CONFIG_QUOTA + if (f2fs_check_quota_options(sbi)) + return -EINVAL; +#endif if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { f2fs_msg(sb, KERN_ERR, @@ -603,14 +629,22 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) mutex_init(&fi->inmem_lock); init_rwsem(&fi->dio_rwsem[READ]); init_rwsem(&fi->dio_rwsem[WRITE]); + init_rwsem(&fi->i_mmap_sem); + init_rwsem(&fi->i_xattr_sem); +#ifdef CONFIG_QUOTA + memset(&fi->i_dquot, 0, sizeof(fi->i_dquot)); + fi->i_reserved_quota = 0; +#endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; + return &fi->vfs_inode; } static int f2fs_drop_inode(struct inode *inode) { + int ret; /* * This is to avoid a deadlock condition like below. * writeback_single_inode(inode) @@ -643,10 +677,12 @@ static int f2fs_drop_inode(struct inode *inode) spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } + trace_f2fs_drop_inode(inode, 0); return 0; } - - return generic_drop_inode(inode); + ret = generic_drop_inode(inode); + trace_f2fs_drop_inode(inode, ret); + return ret; } int f2fs_inode_dirtied(struct inode *inode, bool sync) @@ -744,15 +780,9 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i; - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - kobject_del(&sbi->s_kobj); - - stop_gc_thread(sbi); + f2fs_quota_off_umount(sb); /* prevent remaining shrinker jobs */ mutex_lock(&sbi->umount_mutex); @@ -771,7 +801,14 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bio(sbi, NULL_ADDR); + f2fs_wait_discard_bios(sbi); + + if (f2fs_discard_en(sbi) && !sbi->discard_blks) { + struct cp_control cpc = { + .reason = CP_UMOUNT | CP_TRIMMED, + }; + write_checkpoint(sbi, &cpc); + } /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); @@ -786,7 +823,7 @@ static void f2fs_put_super(struct super_block *sb) mutex_unlock(&sbi->umount_mutex); /* our cp_error case, we can wait for any writeback page */ - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); iput(sbi->node_inode); iput(sbi->meta_inode); @@ -796,8 +833,8 @@ static void f2fs_put_super(struct super_block *sb) destroy_segment_manager(sbi); kfree(sbi->ckpt); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); + + f2fs_unregister_sysfs(sbi); sb->s_fs_info = NULL; if (sbi->s_chksum_driver) @@ -805,8 +842,15 @@ static void f2fs_put_super(struct super_block *sb) kfree(sbi->raw_super); destroy_device_list(sbi); - + if (sbi->write_io_dummy) + mempool_destroy(sbi->write_io_dummy); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif destroy_percpu_info(sbi); + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); kfree(sbi); } @@ -817,6 +861,9 @@ int 
f2fs_sync_fs(struct super_block *sb, int sync) trace_f2fs_sync_fs(sb, sync); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return -EAGAIN; + if (sync) { struct cp_control cpc; @@ -851,12 +898,55 @@ static int f2fs_unfreeze(struct super_block *sb) return 0; } +#ifdef CONFIG_QUOTA +static int f2fs_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dq_data_lock); + + limit = (dquot->dq_dqb.dqb_bsoftlimit ? + dquot->dq_dqb.dqb_bsoftlimit : + dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = dquot->dq_dqb.dqb_isoftlimit ? + dquot->dq_dqb.dqb_isoftlimit : + dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? + (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dq_data_lock); + dqput(dquot); + return 0; +} +#endif + static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count, ovp_count; + u64 avail_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); user_block_count = sbi->user_block_count; @@ -867,19 +957,67 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; - buf->f_bavail = user_block_count - valid_user_blocks(sbi); + buf->f_bavail = user_block_count - valid_user_blocks(sbi) - + sbi->reserved_blocks; - buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - buf->f_ffree = min(buf->f_files - valid_node_count(sbi), - buf->f_bavail); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + + if (avail_node_count > user_block_count) { + buf->f_files = user_block_count; + buf->f_ffree = buf->f_bavail; + } else { + buf->f_files = avail_node_count; + buf->f_ffree = min(avail_node_count - valid_node_count(sbi), + buf->f_bavail); + } buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); +#ifdef CONFIG_QUOTA + if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf); + } +#endif return 0; } +static inline void f2fs_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + + if (sbi->s_qf_names[PRJQUOTA]) + seq_show_option(seq, "prjjquota", 
sbi->s_qf_names[PRJQUOTA]); +#endif +} + static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); @@ -897,7 +1035,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, DISCARD)) seq_puts(seq, ",discard"); if (test_opt(sbi, NOHEAP)) - seq_puts(seq, ",no_heap_alloc"); + seq_puts(seq, ",no_heap"); + else + seq_puts(seq, ",heap"); #ifdef CONFIG_F2FS_FS_XATTR if (test_opt(sbi, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -905,6 +1045,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",nouser_xattr"); if (test_opt(sbi, INLINE_XATTR)) seq_puts(seq, ",inline_xattr"); + else + seq_puts(seq, ",noinline_xattr"); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -943,87 +1085,37 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",active_logs=%u", sbi->active_logs); if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (test_opt(sbi, FAULT_INJECTION)) + seq_printf(seq, ",fault_injection=%u", + sbi->fault_info.inject_rate); +#endif +#ifdef CONFIG_QUOTA + if (test_opt(sbi, QUOTA)) + seq_puts(seq, ",quota"); + if (test_opt(sbi, USRQUOTA)) + seq_puts(seq, ",usrquota"); + if (test_opt(sbi, GRPQUOTA)) + seq_puts(seq, ",grpquota"); + if (test_opt(sbi, PRJQUOTA)) + seq_puts(seq, ",prjquota"); +#endif + f2fs_show_quota_options(seq, sbi->sb); return 0; } -static int segment_info_seq_show(struct seq_file *seq, void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = - le32_to_cpu(sbi->raw_super->segment_count_main); - int i; - - seq_puts(seq, "format: segment_type|valid_blocks\n" - "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); - - for (i = 0; i < total_segs; i++) { - struct seg_entry *se = get_seg_entry(sbi, i); - - if ((i % 10) == 0) - seq_printf(seq, "%-10d", i); - seq_printf(seq, "%d|%-3u", se->type, - get_valid_blocks(sbi, i, 1)); - if ((i % 10) == 9 || i == (total_segs - 1)) - seq_putc(seq, '\n'); - else - seq_putc(seq, ' '); - } - - return 0; -} - -static int segment_bits_seq_show(struct seq_file *seq, void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = - le32_to_cpu(sbi->raw_super->segment_count_main); - int i, j; - - seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" - "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); - - for (i = 0; i < total_segs; i++) { - struct seg_entry *se = get_seg_entry(sbi, i); - - seq_printf(seq, "%-10d", i); - seq_printf(seq, "%d|%-3u|", se->type, - get_valid_blocks(sbi, i, 1)); - for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) - seq_printf(seq, " %.2x", se->cur_valid_map[j]); - seq_putc(seq, '\n'); - } - return 0; -} - -#define F2FS_PROC_FILE_DEF(_name) \ -static int _name##_open_fs(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ -} \ - \ -static const struct file_operations f2fs_seq_##_name##_fops = { \ - .owner = THIS_MODULE, \ - .open = _name##_open_fs, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; - -F2FS_PROC_FILE_DEF(segment_info); -F2FS_PROC_FILE_DEF(segment_bits); - static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; set_opt(sbi, BG_GC); + set_opt(sbi, 
INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, NOHEAP); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_mounted_blkzoned(sbi->sb)) { @@ -1049,6 +1141,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; + unsigned long old_sb_flags; int err, active_logs; bool need_restart_gc = false; bool need_stop_gc = false; @@ -1056,14 +1149,37 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info ffi = sbi->fault_info; #endif +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; + int i, j; +#endif /* * Save the old mount options in case we * need to restore them. */ org_mount_opt = sbi->mount_opt; + old_sb_flags = sb->s_flags; active_logs = sbi->active_logs; +#ifdef CONFIG_QUOTA + s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!s_qf_names[i]) { + for (j = 0; j < i; j++) + kfree(s_qf_names[j]); + return -ENOMEM; + } + } else { + s_qf_names[i] = NULL; + } + } +#endif + /* recover superblocks we couldn't write due to previous RO mount */ if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); @@ -1073,7 +1189,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); } - sbi->mount_opt.opt = 0; default_options(sbi); /* parse mount options */ @@ -1088,6 +1203,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; + if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + } else { + /* dquot_resume needs RW */ + sb->s_flags &= ~MS_RDONLY; + dquot_resume(sb, -1); + } + /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -1136,6 +1261,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_gc; } skip: +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + kfree(s_qf_names[i]); +#endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? 
MS_POSIXACL : 0); @@ -1150,21 +1280,289 @@ restore_gc: stop_gc_thread(sbi); } restore_opts: +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = s_qf_names[i]; + } +#endif sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; + sb->s_flags = old_sb_flags; #ifdef CONFIG_F2FS_FAULT_INJECTION sbi->fault_info = ffi; #endif return err; } -static struct super_operations f2fs_sops = { +#ifdef CONFIG_QUOTA +/* Read data from quotafile */ +static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + block_t blkidx = F2FS_BYTES_TO_BLK(off); + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + loff_t i_size = i_size_read(inode); + struct page *page; + char *kaddr; + + if (off > i_size) + return 0; + + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); +repeat: + page = read_mapping_page(mapping, blkidx, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return -EIO; + } + + kaddr = kmap_atomic(page); + memcpy(data, kaddr + offset, tocopy); + kunmap_atomic(kaddr); + f2fs_put_page(page, 1); + + offset = 0; + toread -= tocopy; + data += tocopy; + blkidx++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t f2fs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + int offset = off & (sb->s_blocksize - 1); + size_t towrite = len; + struct page *page; + char *kaddr; + int err = 0; + int tocopy; + + while (towrite > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, + towrite); + + err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, + &page, NULL); + if (unlikely(err)) + break; + + kaddr = kmap_atomic(page); + memcpy(kaddr + offset, data, tocopy); + kunmap_atomic(kaddr); + flush_dcache_page(page); + + a_ops->write_end(NULL, mapping, off, tocopy, tocopy, + page, NULL); + offset = 0; + towrite -= tocopy; + off += tocopy; + data += tocopy; + cond_resched(); + } + + if (len == towrite) + return 0; + inode->i_version++; + inode->i_mtime = inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, false); + return len - towrite; +} + +static struct dquot **f2fs_get_dquots(struct inode *inode) +{ + return F2FS_I(inode)->i_dquot; +} + +static qsize_t *f2fs_get_reserved_space(struct inode *inode) +{ + return &F2FS_I(inode)->i_reserved_quota; +} + +static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) +{ + return dquot_quota_on_mount(sbi->sb, sbi->s_qf_names[type], + sbi->s_jquota_fmt, type); +} + +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi) +{ + int i, ret; + + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + ret = f2fs_quota_on_mount(sbi, i); + if (ret < 0) + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on journaled " + "quota: error %d", ret); + } + } +} + +static int f2fs_quota_sync(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; + int 
ret; + + ret = dquot_writeback_dquots(sb, type); + if (ret) + return ret; + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. + */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + + ret = filemap_write_and_wait(dqopt->files[cnt]->i_mapping); + if (ret) + return ret; + + inode_lock(dqopt->files[cnt]); + truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); + inode_unlock(dqopt->files[cnt]); + } + return 0; +} + +static int f2fs_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + struct inode *inode; + int err; + + err = f2fs_quota_sync(sb, type); + if (err) + return err; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; + + inode = d_inode(path->dentry); + + inode_lock(inode); + F2FS_I(inode)->i_flags |= FS_NOATIME_FL | FS_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); + + return 0; +} + +static int f2fs_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + int err; + + if (!inode || !igrab(inode)) + return dquot_quota_off(sb, type); + + f2fs_quota_sync(sb, type); + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + F2FS_I(inode)->i_flags &= ~(FS_NOATIME_FL | FS_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); +out_put: + iput(inode); + return err; +} + +void f2fs_quota_off_umount(struct super_block *sb) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + f2fs_quota_off(sb, type); +} + +#if 0 +int f2fs_get_projid(struct inode *inode, kprojid_t *projid) +{ + *projid = F2FS_I(inode)->i_projid; + return 0; +} +#endif + +static const struct dquot_operations f2fs_quota_operations = { + .get_reserved_space = f2fs_get_reserved_space, + .write_dquot = dquot_commit, + .acquire_dquot = dquot_acquire, + .release_dquot = dquot_release, + .mark_dirty = dquot_mark_dquot_dirty, + .write_info = dquot_commit_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +#if 0 + .get_projid = f2fs_get_projid, + .get_next_id = dquot_get_next_id, +#endif +}; + +static const struct quotactl_ops f2fs_quotactl_ops = { + .quota_on = f2fs_quota_on, + .quota_off = f2fs_quota_off, + .quota_sync = f2fs_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, +}; +#else +void f2fs_quota_off_umount(struct super_block *sb) +{ +} +#endif + +static const struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, .drop_inode = f2fs_drop_inode, .destroy_inode = f2fs_destroy_inode, .write_inode = f2fs_write_inode, .dirty_inode = f2fs_dirty_inode, .show_options = f2fs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = f2fs_quota_read, + .quota_write = f2fs_quota_write, + .get_dquots = f2fs_get_dquots, +#endif .evict_inode = f2fs_evict_inode, .put_super = f2fs_put_super, .sync_fs = f2fs_sync_fs, @@ -1182,12 +1580,6 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) ctx, len, NULL); } -static int f2fs_key_prefix(struct inode *inode, u8 **key) -{ - *key = F2FS_I_SB(inode)->key_prefix; - return F2FS_I_SB(inode)->key_prefix_size; -} - static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, void 
*fs_data) { @@ -1202,16 +1594,16 @@ static unsigned f2fs_max_namelen(struct inode *inode) inode->i_sb->s_blocksize : F2FS_NAME_LEN; } -static struct fscrypt_operations f2fs_cryptops = { +static const struct fscrypt_operations f2fs_cryptops = { + .key_prefix = "f2fs:", .get_context = f2fs_get_context, - .key_prefix = f2fs_key_prefix, .set_context = f2fs_set_context, .is_encrypted = f2fs_encrypted_inode, .empty_dir = f2fs_empty_dir, .max_namelen = f2fs_max_namelen, }; #else -static struct fscrypt_operations f2fs_cryptops = { +static const struct fscrypt_operations f2fs_cryptops = { .is_encrypted = f2fs_encrypted_inode, }; #endif @@ -1263,9 +1655,16 @@ static const struct export_operations f2fs_export_ops = { static loff_t max_file_blocks(void) { - loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); + loff_t result = 0; loff_t leaf_count = ADDRS_PER_BLOCK; + /* + * note: previously, result is equal to (DEF_ADDRS_PER_INODE - + * F2FS_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * space in inode.i_addr, it will be more safe to reassign + * result as zero. + */ + /* two direct node blocks */ result += (leaf_count * 2); @@ -1467,6 +1866,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } + if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment count (%u)", + le32_to_cpu(raw_super->segment_count)); + return 1; + } + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ if (sanity_check_area_boundary(sbi, bh)) return 1; @@ -1480,6 +1886,8 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned int ovp_segments, reserved_segments; + unsigned int main_segs, blocks_per_seg; + int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1501,6 +1909,20 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + main_segs = le32_to_cpu(raw_super->segment_count_main); + blocks_per_seg = sbi->blocks_per_seg; + + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) + return 1; + } + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; @@ -1511,7 +1933,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; - int i; + int i, j; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1539,17 +1961,14 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); + atomic_set(&sbi->wb_sync_req, 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); - mutex_init(&sbi->wio_mutex[NODE]); - mutex_init(&sbi->wio_mutex[DATA]); + for (i = 0; i < NR_PAGE_TYPE - 1; i++) + for (j = HOT; j < NR_TEMP_TYPE; j++) + mutex_init(&sbi->wio_mutex[i][j]); spin_lock_init(&sbi->cp_lock); - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, - F2FS_KEY_DESC_PREFIX_SIZE); - sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE; -#endif } static int init_percpu_info(struct f2fs_sb_info *sbi) @@ -1579,16 +1998,16 @@ 
static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) return 0; if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != - SECTOR_TO_BLOCK(bdev_zone_size(bdev))) + SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) return -EINVAL; - sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != __ilog2_u32(sbi->blocks_per_blkz)) return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> sbi->log_blocks_per_blkz; - if (nr_sectors & (bdev_zone_size(bdev) - 1)) + if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) FDEV(devi).nr_blkz++; FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); @@ -1724,36 +2143,59 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + unsigned int max_devices = MAX_DEVICES; int i; - for (i = 0; i < MAX_DEVICES; i++) { - if (!RDEV(i).path[0]) + /* Initialize single device information */ + if (!RDEV(0).path[0]) { +#ifdef CONFIG_BLK_DEV_ZONED + if (!bdev_is_zoned(sbi->sb->s_bdev)) return 0; + max_devices = 1; +#else + return 0; +#endif + } - if (i == 0) { - sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) * - MAX_DEVICES, GFP_KERNEL); - if (!sbi->devs) - return -ENOMEM; - } + /* + * Initialize multiple devices information, or single + * zoned block device information. + */ + sbi->devs = kcalloc(max_devices, sizeof(struct f2fs_dev_info), + GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; - memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); - FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments); - if (i == 0) { - FDEV(i).start_blk = 0; - FDEV(i).end_blk = FDEV(i).start_blk + - (FDEV(i).total_segments << - sbi->log_blocks_per_seg) - 1 + - le32_to_cpu(raw_super->segment0_blkaddr); - } else { - FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; - FDEV(i).end_blk = FDEV(i).start_blk + - (FDEV(i).total_segments << - sbi->log_blocks_per_seg) - 1; - } + for (i = 0; i < max_devices; i++) { - FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + if (i > 0 && !RDEV(i).path[0]) + break; + + if (max_devices == 1) { + /* Single zoned block device mount */ + FDEV(0).bdev = + blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, sbi->sb->s_mode, sbi->sb->s_type); + } else { + /* Multi-device mount */ + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = + le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + sbi->sb->s_mode, sbi->sb->s_type); + } if (IS_ERR(FDEV(i).bdev)) return PTR_ERR(FDEV(i).bdev); @@ -1773,6 +2215,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) "Failed to initialize F2FS blkzone information"); return -EINVAL; } + if (max_devices == 1) + break; f2fs_msg(sbi->sb, KERN_INFO, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", i, FDEV(i).path, @@ -1841,6 +2285,11 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* precompute checksum seed for metadata */ + if (f2fs_sb_has_inode_chksum(sb)) + sbi->s_chksum_seed = f2fs_chksum(sbi, 
~0, raw_super->uuid, + sizeof(raw_super->uuid)); + /* * The BLKZONED feature indicates that the drive was formatted with * zone alignment optimization. This is optional for host-aware @@ -1850,6 +2299,7 @@ try_onemore: if (f2fs_sb_mounted_blkzoned(sb)) { f2fs_msg(sb, KERN_ERR, "Zoned block device support is not enabled\n"); + err = -EOPNOTSUPP; goto free_sb_buf; } #endif @@ -1871,6 +2321,12 @@ try_onemore: sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); +#ifdef CONFIG_QUOTA + sb->dq_op = &f2fs_quota_operations; + sb->s_qcop = &f2fs_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; +#endif + sb->s_op = &f2fs_sops; sb->s_cop = &f2fs_cryptops; sb->s_xattr = f2fs_xattr_handlers; @@ -1886,18 +2342,34 @@ try_onemore: mutex_init(&sbi->gc_mutex); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); + init_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); - init_rwsem(&sbi->read_io.io_rwsem); - sbi->read_io.sbi = sbi; - sbi->read_io.bio = NULL; + /* init iostat info */ + spin_lock_init(&sbi->iostat_lock); + sbi->iostat_enable = false; + for (i = 0; i < NR_PAGE_TYPE; i++) { - init_rwsem(&sbi->write_io[i].io_rwsem); - sbi->write_io[i].sbi = sbi; - sbi->write_io[i].bio = NULL; + int n = (i == META) ? 1: NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = kmalloc(n * sizeof(struct f2fs_bio_info), + GFP_KERNEL); + if (!sbi->write_io[i]) { + err = -ENOMEM; + goto free_options; + } + + for (j = HOT; j < n; j++) { + init_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + } } init_rwsem(&sbi->cp_rwsem); @@ -1910,9 +2382,11 @@ try_onemore: if (F2FS_IO_SIZE(sbi) > 1) { sbi->write_io_dummy = - mempool_create_page_pool(F2FS_IO_SIZE(sbi) - 1, 0); - if (!sbi->write_io_dummy) + mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); + if (!sbi->write_io_dummy) { + err = -ENOMEM; goto free_options; + } } /* get an inode for meta space */ @@ -1944,6 +2418,7 @@ try_onemore: sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; + sbi->reserved_blocks = 0; for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); @@ -1991,10 +2466,9 @@ try_onemore: f2fs_join_shrinker(sbi); - /* if there are nt orphan nodes free them */ - err = recover_orphan_inodes(sbi); + err = f2fs_build_stats(sbi); if (err) - goto free_node_inode; + goto free_nm; /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); @@ -2015,26 +2489,14 @@ try_onemore: goto free_root_inode; } - err = f2fs_build_stats(sbi); + err = f2fs_register_sysfs(sbi); if (err) goto free_root_inode; - if (f2fs_proc_root) - sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - - if (sbi->s_proc) { - proc_create_data("segment_info", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_info_fops, sb); - proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_bits_fops, sb); - } - - sbi->s_kobj.kset = f2fs_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, - "%s", sb->s_id); + /* if there are nt orphan nodes free them */ + err = recover_orphan_inodes(sbi); if (err) - goto free_proc; + goto free_sysfs; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -2045,7 
+2507,7 @@ try_onemore: if (bdev_read_only(sb->s_bdev) && !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; - goto free_kobj; + goto free_meta; } if (need_fsck) @@ -2059,7 +2521,7 @@ try_onemore: need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); - goto free_kobj; + goto free_meta; } } else { err = recover_fsync_data(sbi, true); @@ -2068,7 +2530,7 @@ try_onemore: err = -EINVAL; f2fs_msg(sb, KERN_ERR, "Need to recover fsync data"); - goto free_kobj; + goto free_sysfs; } } skip_recovery: @@ -2083,7 +2545,7 @@ skip_recovery: /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) - goto free_kobj; + goto free_meta; } kfree(options); @@ -2095,22 +2557,23 @@ skip_recovery: sbi->valid_super_block ? 1 : 2, err); } + f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx", + cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); return 0; -free_kobj: +free_meta: f2fs_sync_inode_meta(sbi); - kobject_del(&sbi->s_kobj); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); -free_proc: - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - f2fs_destroy_stats(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). + */ + truncate_inode_pages_final(META_MAPPING(sbi)); +free_sysfs: + f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -2119,15 +2582,9 @@ free_node_inode: mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); - /* - * Some dirty meta pages can be produced by recover_orphan_inodes() - * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() - * followed by write_checkpoint() through f2fs_write_node_pages(), which - * falls into an infinite loop in sync_meta_pages(). 
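The relabelled unwind path above (free_kobj and free_proc giving way to free_meta and free_sysfs, with the META truncation moved before iput()) is the kernel's usual cascading-cleanup idiom: each failing step jumps to the label that undoes everything initialized so far. A minimal sketch of the shape, with hypothetical step_a/step_b/undo_a standing in for the real calls:

	int step_a(void);	/* cf. f2fs_register_sysfs() */
	int step_b(void);	/* cf. recover_orphan_inodes() */
	void undo_a(void);	/* cf. f2fs_unregister_sysfs() */

	int setup(void)
	{
		int err;

		err = step_a();
		if (err)
			return err;	/* nothing to unwind yet */

		err = step_b();
		if (err)
			goto free_a;	/* undo step_a before failing */

		return 0;

	free_a:
		undo_a();
		return err;
	}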
- */ - truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); + f2fs_destroy_stats(sbi); free_nm: destroy_node_manager(sbi); free_sm: @@ -2141,7 +2598,13 @@ free_meta_inode: free_io_dummy: mempool_destroy(sbi->write_io_dummy); free_options: + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); destroy_percpu_info(sbi); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif kfree(options); free_sb_buf: kfree(raw_super); @@ -2167,8 +2630,11 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, static void kill_f2fs_super(struct super_block *sb) { - if (sb->s_root) + if (sb->s_root) { set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE); + stop_gc_thread(F2FS_SB(sb)); + stop_discard_thread(F2FS_SB(sb)); + } kill_block_super(sb); } @@ -2222,30 +2688,26 @@ static int __init init_f2fs_fs(void) err = create_extent_cache(); if (err) goto free_checkpoint_caches; - f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); - if (!f2fs_kset) { - err = -ENOMEM; + err = f2fs_init_sysfs(); + if (err) goto free_extent_cache; - } err = register_shrinker(&f2fs_shrinker_info); if (err) - goto free_kset; - + goto free_sysfs; err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; err = f2fs_create_root_stats(); if (err) goto free_filesystem; - f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; free_filesystem: unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); -free_kset: - kset_unregister(f2fs_kset); +free_sysfs: + f2fs_exit_sysfs(); free_extent_cache: destroy_extent_cache(); free_checkpoint_caches: @@ -2262,11 +2724,10 @@ fail: static void __exit exit_f2fs_fs(void) { - remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); - kset_unregister(f2fs_kset); + f2fs_exit_sysfs(); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c new file mode 100644 index 000000000000..e2c258f717cd --- /dev/null +++ b/fs/f2fs/sysfs.c @@ -0,0 +1,556 @@ +/* + * f2fs sysfs interface + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * Copyright (c) 2017 Chao Yu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
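For orientation before the body of the new file: each tunable it exports is declared through the F2FS_RW_ATTR() macro defined further down. Taking one concrete case, F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level) expands, modulo whitespace, to:

	static struct f2fs_attr f2fs_attr_dir_level = {
		.attr = { .name = "dir_level", .mode = 0644 },
		.show = f2fs_sbi_show,
		.store = f2fs_sbi_store,
		.struct_type = F2FS_SBI,
		.offset = offsetof(struct f2fs_sb_info, dir_level),
	};

A read of the resulting /sys/fs/f2fs/<disk>/dir_level file then lands in f2fs_sbi_show(), which applies .offset to the base pointer that __struct_ptr() picks for .struct_type.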
+ */ +#include <linux/proc_fs.h> +#include <linux/f2fs_fs.h> +#include <linux/seq_file.h> + +#include "f2fs.h" +#include "segment.h" +#include "gc.h" + +static struct proc_dir_entry *f2fs_proc_root; + +/* Sysfs support for f2fs */ +enum { + GC_THREAD, /* struct f2fs_gc_thread */ + SM_INFO, /* struct f2fs_sm_info */ + DCC_INFO, /* struct discard_cmd_control */ + NM_INFO, /* struct f2fs_nm_info */ + F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif + RESERVED_BLOCKS, +}; + +struct f2fs_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); + ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, + const char *, size_t); + int struct_type; + int offset; + int id; +}; + +static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) +{ + if (struct_type == GC_THREAD) + return (unsigned char *)sbi->gc_thread; + else if (struct_type == SM_INFO) + return (unsigned char *)SM_I(sbi); + else if (struct_type == DCC_INFO) + return (unsigned char *)SM_I(sbi)->dcc_info; + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); + else if (struct_type == F2FS_SBI || struct_type == RESERVED_BLOCKS) + return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&sbi->fault_info; +#endif + return NULL; +} + +static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->kbytes_written + + BD_PART_WRITTEN(sbi))); +} + +static ssize_t features_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + int len = 0; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + if (f2fs_sb_has_crypto(sb)) + len += snprintf(buf, PAGE_SIZE - len, "%s", + "encryption"); + if (f2fs_sb_mounted_blkzoned(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "blkzoned"); + if (f2fs_sb_has_extra_attr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "extra_attr"); + if (f2fs_sb_has_project_quota(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "projquota"); + if (f2fs_sb_has_inode_chksum(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ?
", " : "", "inode_checksum"); + len += snprintf(buf + len, PAGE_SIZE - len, "\n"); + return len; +} + +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + unsigned char *ptr = NULL; + unsigned int *ui; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned char *ptr; + unsigned long t; + unsigned int *ui; + ssize_t ret; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret < 0) + return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + return -EINVAL; +#endif + if (a->struct_type == RESERVED_BLOCKS) { + spin_lock(&sbi->stat_lock); + if ((unsigned long)sbi->total_valid_block_count + t > + (unsigned long)sbi->user_block_count) { + spin_unlock(&sbi->stat_lock); + return -EINVAL; + } + *ui = t; + spin_unlock(&sbi->stat_lock); + return count; + } + + if (!strcmp(a->attr.name, "discard_granularity")) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (t == *ui) + return count; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) { + if (i >= t - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + else + dcc->pend_list_tag[i] &= (~P_ACTIVE); + } + mutex_unlock(&dcc->cmd_lock); + + *ui = t; + return count; + } + + *ui = t; + + if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) + f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; + wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head); + wake_up_discard_thread(sbi, true); + } + + return count; +} + +static ssize_t f2fs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? 
a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_sb_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +enum feat_id { + FEAT_CRYPTO = 0, + FEAT_BLKZONED, + FEAT_ATOMIC_WRITE, + FEAT_EXTRA_ATTR, + FEAT_PROJECT_QUOTA, + FEAT_INODE_CHECKSUM, +}; + +static ssize_t f2fs_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + switch (a->id) { + case FEAT_CRYPTO: + case FEAT_BLKZONED: + case FEAT_ATOMIC_WRITE: + case FEAT_EXTRA_ATTR: + case FEAT_PROJECT_QUOTA: + case FEAT_INODE_CHECKSUM: + return snprintf(buf, PAGE_SIZE, "supported\n"); + } + return 0; +} + +#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .struct_type = _struct_type, \ + .offset = _offset \ +} + +#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0644, \ + f2fs_sbi_show, f2fs_sbi_store, \ + offsetof(struct struct_name, elname)) + +#define F2FS_GENERAL_RO_ATTR(name) \ +static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) + +#define F2FS_FEATURE_RO_ATTR(_name, _id) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_feature_show, \ + .id = _id, \ +} + +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, + urgent_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent, gc_urgent); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); +F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif +F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); +F2FS_GENERAL_RO_ATTR(features); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); +#endif +#ifdef 
CONFIG_BLK_DEV_ZONED +F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); +#endif +F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); +F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); +F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); +F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); + +#define ATTR_LIST(name) (&f2fs_attr_##name.attr) +static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_urgent_sleep_time), + ATTR_LIST(gc_min_sleep_time), + ATTR_LIST(gc_max_sleep_time), + ATTR_LIST(gc_no_gc_sleep_time), + ATTR_LIST(gc_idle), + ATTR_LIST(gc_urgent), + ATTR_LIST(reclaim_segments), + ATTR_LIST(max_small_discards), + ATTR_LIST(discard_granularity), + ATTR_LIST(batched_trim_sections), + ATTR_LIST(ipu_policy), + ATTR_LIST(min_ipu_util), + ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_hot_blocks), + ATTR_LIST(max_victim_search), + ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), + ATTR_LIST(ra_nid_pages), + ATTR_LIST(dirty_nats_ratio), + ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), + ATTR_LIST(iostat_enable), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(features), + ATTR_LIST(reserved_blocks), + NULL, +}; + +static struct attribute *f2fs_feat_attrs[] = { +#ifdef CONFIG_F2FS_FS_ENCRYPTION + ATTR_LIST(encryption), +#endif +#ifdef CONFIG_BLK_DEV_ZONED + ATTR_LIST(block_zoned), +#endif + ATTR_LIST(atomic_write), + ATTR_LIST(extra_attr), + ATTR_LIST(project_quota), + ATTR_LIST(inode_checksum), + NULL, +}; + +static const struct sysfs_ops f2fs_attr_ops = { + .show = f2fs_attr_show, + .store = f2fs_attr_store, +}; + +static struct kobj_type f2fs_sb_ktype = { + .default_attrs = f2fs_attrs, + .sysfs_ops = &f2fs_attr_ops, + .release = f2fs_sb_release, +}; + +static struct kobj_type f2fs_ktype = { + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kset f2fs_kset = { + .kobj = {.ktype = &f2fs_ktype}, +}; + +static struct kobj_type f2fs_feat_ktype = { + .default_attrs = f2fs_feat_attrs, + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kobject f2fs_feat = { + .kset = &f2fs_kset, +}; + +static int segment_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i; + + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u", se->type, + get_valid_blocks(sbi, i, false)); + if ((i % 10) == 9 || i == (total_segs - 1)) + seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + + return 0; +} + +static int segment_bits_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, + get_valid_blocks(sbi, i, false)); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, " %.2x", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; +} + +static int 
iostat_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + time64_t now = ktime_get_real_seconds(); + + if (!sbi->iostat_enable) + return 0; + + seq_printf(seq, "time: %-16llu\n", now); + + /* print app IOs */ + seq_printf(seq, "app buffered: %-16llu\n", + sbi->write_iostat[APP_BUFFERED_IO]); + seq_printf(seq, "app direct: %-16llu\n", + sbi->write_iostat[APP_DIRECT_IO]); + seq_printf(seq, "app mapped: %-16llu\n", + sbi->write_iostat[APP_MAPPED_IO]); + + /* print fs IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->write_iostat[FS_DATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->write_iostat[FS_NODE_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->write_iostat[FS_META_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->write_iostat[FS_GC_DATA_IO]); + seq_printf(seq, "fs gc node: %-16llu\n", + sbi->write_iostat[FS_GC_NODE_IO]); + seq_printf(seq, "fs cp data: %-16llu\n", + sbi->write_iostat[FS_CP_DATA_IO]); + seq_printf(seq, "fs cp node: %-16llu\n", + sbi->write_iostat[FS_CP_NODE_IO]); + seq_printf(seq, "fs cp meta: %-16llu\n", + sbi->write_iostat[FS_CP_META_IO]); + seq_printf(seq, "fs discard: %-16llu\n", + sbi->write_iostat[FS_DISCARD]); + + return 0; +} + +#define F2FS_PROC_FILE_DEF(_name) \ +static int _name##_open_fs(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ +} \ + \ +static const struct file_operations f2fs_seq_##_name##_fops = { \ + .open = _name##_open_fs, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +F2FS_PROC_FILE_DEF(segment_info); +F2FS_PROC_FILE_DEF(segment_bits); +F2FS_PROC_FILE_DEF(iostat_info); + +int __init f2fs_init_sysfs(void) +{ + int ret; + + kobject_set_name(&f2fs_kset.kobj, "f2fs"); + f2fs_kset.kobj.parent = fs_kobj; + ret = kset_register(&f2fs_kset); + if (ret) + return ret; + + ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype, + NULL, "features"); + if (ret) + kset_unregister(&f2fs_kset); + else + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + return ret; +} + +void f2fs_exit_sysfs(void) +{ + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); + remove_proc_entry("fs/f2fs", NULL); + f2fs_proc_root = NULL; +} + +int f2fs_register_sysfs(struct f2fs_sb_info *sbi) +{ + struct super_block *sb = sbi->sb; + int err; + + sbi->s_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, + "%s", sb->s_id); + if (err) + return err; + + if (f2fs_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + + if (sbi->s_proc) { + proc_create_data("segment_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_bits_fops, sb); + proc_create_data("iostat_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_iostat_info_fops, sb); + } + return 0; +} + +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) +{ + if (sbi->s_proc) { + remove_proc_entry("iostat_info", sbi->s_proc); + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); + } + kobject_del(&sbi->s_kobj); +} diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 73b4e1d1912a..bccbbf2616d2 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -59,7 +59,7 @@ void f2fs_trace_pid(struct page *page) pid_t pid = task_pid_nr(current); void *p; - page->private = pid; 
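On the trace.c hunk here: the replacement line that follows stores the pid through set_page_private() rather than assigning page->private directly. The field is an unsigned long, and the accessor pair from <linux/mm.h> keeps the cast explicit at the call site; in kernels of this vintage the accessors are defined approximately as:

	#define page_private(page)		((page)->private)
	#define set_page_private(page, v)	((page)->private = (v))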
+ set_page_private(page, (unsigned long)pid); if (radix_tree_preload(GFP_NOFS)) return; @@ -138,7 +138,7 @@ static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, radix_tree_for_each_slot(slot, &pids, &iter, first_index) { results[ret] = iter.index; - if (++ret == PIDVEC_SIZE) + if (++ret == max_items) break; } return ret; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 1c4d5e39586c..ab658419552b 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -264,18 +264,123 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } +static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, + void **last_addr, int index, + size_t len, const char *name) +{ + struct f2fs_xattr_entry *entry; + unsigned int inline_size = F2FS_INLINE_XATTR_ADDRS << 2; + + list_for_each_xattr(entry, base_addr) { + if ((void *)entry + sizeof(__u32) > base_addr + inline_size || + (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) > + base_addr + inline_size) { + *last_addr = entry; + return NULL; + } + if (entry->e_name_index != index) + continue; + if (entry->e_name_len != len) + continue; + if (!memcmp(entry->e_name, name, len)) + break; + } + return entry; +} + +static int lookup_all_xattrs(struct inode *inode, struct page *ipage, + unsigned int index, unsigned int len, + const char *name, struct f2fs_xattr_entry **xe, + void **base_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + void *cur_addr, *txattr_addr, *last_addr = NULL; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0; + unsigned int inline_size = inline_xattr_size(inode); + int err = 0; + + if (!size && !inline_size) + return -ENODATA; + + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, + GFP_F2FS_ZERO); + if (!txattr_addr) + return -ENOMEM; + + /* read from inline xattr */ + if (inline_size) { + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + inline_addr = inline_xattr_addr(page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + + *xe = __find_inline_xattr(txattr_addr, &last_addr, + index, len, name); + if (*xe) + goto check; + } + + /* read from xattr node block */ + if (xnid) { + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block. 
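Summing up the lookup path added above: lookup_all_xattrs() concatenates at most two regions into one temporary buffer and then searches that. Schematically, with sizes as defined elsewhere in this patch (the diagram itself is illustrative):

	/*
	 * txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE)
	 *
	 * +--------------------+--------------------------------+---------+
	 * | inline xattrs from | xattr node block contents      | zeroed  |
	 * | the inode body     | (size = VALID_XATTR_BLOCK_SIZE | padding |
	 * |                    |  when i_xattr_nid is set)      |         |
	 * +--------------------+--------------------------------+---------+
	 * 0                    inline_size                      inline_size + size
	 */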
*/ + xpage = get_node_page(sbi, xnid); + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); + goto out; + } + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, size); + f2fs_put_page(xpage, 1); + } + + if (last_addr) + cur_addr = XATTR_HDR(last_addr) - 1; + else + cur_addr = txattr_addr; + + *xe = __find_xattr(cur_addr, index, len, name); +check: + if (IS_XATTR_LAST_ENTRY(*xe)) { + err = -ENODATA; + goto out; + } + + *base_addr = txattr_addr; + return 0; +out: + kzfree(txattr_addr); + return err; +} + static int read_all_xattrs(struct inode *inode, struct page *ipage, void **base_addr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; - size_t size = PAGE_SIZE, inline_size = 0; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = VALID_XATTR_BLOCK_SIZE; + unsigned int inline_size = inline_xattr_size(inode); void *txattr_addr; int err; - inline_size = inline_xattr_size(inode); - - txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, + GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -299,19 +404,19 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, } /* read from xattr node block */ - if (F2FS_I(inode)->i_xattr_nid) { + if (xnid) { struct page *xpage; void *xattr_addr; /* The inode already has an extended attribute block. */ - xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + xpage = get_node_page(sbi, xnid); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); goto fail; } xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); + memcpy(txattr_addr + inline_size, xattr_addr, size); f2fs_put_page(xpage, 1); } @@ -333,14 +438,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, void *txattr_addr, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - size_t inline_size = 0; + size_t inline_size = inline_xattr_size(inode); void *xattr_addr; struct page *xpage; nid_t new_nid = 0; int err; - inline_size = inline_xattr_size(inode); - if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) if (!alloc_nid(sbi, &new_nid)) return -ENOSPC; @@ -386,7 +489,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); - xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); return PTR_ERR(xpage); @@ -395,23 +498,20 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } xattr_addr = page_address(xpage); - memcpy(xattr_addr, txattr_addr + inline_size, PAGE_SIZE - - sizeof(struct node_footer)); + memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); - /* need to checkpoint during fsync */ - F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); return 0; } int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size, struct page *ipage) { - struct f2fs_xattr_entry *entry; - void *base_addr; + struct f2fs_xattr_entry *entry = NULL; int error = 0; - size_t size, len; + unsigned int size, len; + void *base_addr = NULL; if (name == NULL) return -EINVAL; @@ -420,21 +520,18 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - error = read_all_xattrs(inode, ipage, &base_addr); + 
down_read(&F2FS_I(inode)->i_xattr_sem); + error = lookup_all_xattrs(inode, ipage, index, len, name, + &entry, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; - entry = __find_xattr(base_addr, index, len, name); - if (IS_XATTR_LAST_ENTRY(entry)) { - error = -ENODATA; - goto cleanup; - } - size = le16_to_cpu(entry->e_value_size); if (buffer && size > buffer_size) { error = -ERANGE; - goto cleanup; + goto out; } if (buffer) { @@ -442,8 +539,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, memcpy(buffer, pval, size); } error = size; - -cleanup: +out: kzfree(base_addr); return error; } @@ -456,7 +552,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error = 0; size_t rest = buffer_size; + down_read(&F2FS_I(inode)->i_xattr_sem); error = read_all_xattrs(inode, NULL, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -485,6 +583,15 @@ cleanup: return error; } +static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry, + const void *value, size_t size) +{ + void *pval = entry->e_name + entry->e_name_len; + + return (le16_to_cpu(entry->e_value_size) == size) && + !memcmp(pval, value, size); +} + static int __f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *ipage, int flags) @@ -519,12 +626,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1; - if ((flags & XATTR_REPLACE) && !found) { + if (found) { + if ((flags & XATTR_CREATE)) { + error = -EEXIST; + goto exit; + } + + if (f2fs_xattr_value_same(here, value, size)) + goto exit; + } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; goto exit; - } else if ((flags & XATTR_CREATE) && found) { - error = -EEXIST; - goto exit; } last = here; @@ -618,7 +730,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, f2fs_lock_op(sbi); /* protect xattr_ver */ down_write(&F2FS_I(inode)->i_sem); + down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); + up_write(&F2FS_I(inode)->i_xattr_sem); up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index d2fd0387a3c7..08a4840d6d7d 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -58,10 +58,10 @@ struct f2fs_xattr_entry { #define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1)) #define XATTR_ROUND (3) -#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND) +#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND) #define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \ - entry->e_name_len + le16_to_cpu(entry->e_value_size))) + (entry)->e_name_len + le16_to_cpu((entry)->e_value_size))) #define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\ ENTRY_SIZE(entry))) @@ -72,9 +72,10 @@ struct f2fs_xattr_entry { for (entry = XATTR_FIRST_ENTRY(addr);\ !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) - -#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + PAGE_SIZE - \ - sizeof(struct node_footer) - sizeof(__u32)) +#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) +#define XATTR_PADDING_SIZE (sizeof(__u32)) +#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ + VALID_XATTR_BLOCK_SIZE) #define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \ sizeof(struct f2fs_xattr_header) - \ diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 
cf54a312993f..c2a975e4a711 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -114,6 +114,8 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_TRIMMED_FLAG 0x00000100 +#define CP_NAT_BITS_FLAG 0x00000080 #define CP_CRC_RECOVERY_FLAG 0x00000040 #define CP_FASTBOOT_FLAG 0x00000020 #define CP_FSCK_FLAG 0x00000010 @@ -184,6 +186,8 @@ struct f2fs_extent { #define F2FS_NAME_LEN 255 #define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */ #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ +#define CUR_ADDRS_PER_INODE(inode) (DEF_ADDRS_PER_INODE - \ + get_extra_isize(inode)) #define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */ #define ADDRS_PER_INODE(inode) addrs_per_inode(inode) #define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */ @@ -203,9 +207,7 @@ struct f2fs_extent { #define F2FS_INLINE_DENTRY 0x04 /* file inline dentry flag */ #define F2FS_DATA_EXIST 0x08 /* file inline data exist flag */ #define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */ - -#define MAX_INLINE_DATA (sizeof(__le32) * (DEF_ADDRS_PER_INODE - \ - F2FS_INLINE_XATTR_ADDRS - 1)) +#define F2FS_EXTRA_ATTR 0x20 /* file having extra attribute */ struct f2fs_inode { __le16 i_mode; /* file mode */ @@ -233,8 +235,16 @@ struct f2fs_inode { struct f2fs_extent i_ext; /* caching a largest extent */ - __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ - + union { + struct { + __le16 i_extra_isize; /* extra inode attribute size */ + __le16 i_padding; /* padding */ + __le32 i_projid; /* project id */ + __le32 i_inode_checksum;/* inode meta checksum */ + __le32 i_extra_end[0]; /* for attribute size calculation */ + }; + __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ + }; __le32 i_nid[DEF_NIDS_PER_INODE]; /* direct(2), indirect(2), double_indirect(1) node id */ } __packed; @@ -278,6 +288,7 @@ struct f2fs_node { * For NAT entries */ #define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry)) +#define NAT_ENTRY_BITMAP_SIZE ((NAT_ENTRY_PER_BLOCK + 7) / 8) struct f2fs_nat_entry { __u8 version; /* latest version of cached nat entry */ @@ -462,7 +473,7 @@ typedef __le32 f2fs_hash_t; #define MAX_DIR_BUCKETS (1 << ((MAX_DIR_HASH_DEPTH / 2) - 1)) /* - * space utilization of regular dentry and inline dentry + * space utilization of regular dentry and inline dentry (w/o extra reservation) * regular dentry inline dentry * bitmap 1 * 27 = 27 1 * 23 = 23 * reserved 1 * 3 = 3 1 * 7 = 7 @@ -498,24 +509,6 @@ struct f2fs_dentry_block { __u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN]; } __packed; -/* for inline dir */ -#define NR_INLINE_DENTRY (MAX_INLINE_DATA * BITS_PER_BYTE / \ - ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ - BITS_PER_BYTE + 1)) -#define INLINE_DENTRY_BITMAP_SIZE ((NR_INLINE_DENTRY + \ - BITS_PER_BYTE - 1) / BITS_PER_BYTE) -#define INLINE_RESERVED_SIZE (MAX_INLINE_DATA - \ - ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ - NR_INLINE_DENTRY + INLINE_DENTRY_BITMAP_SIZE)) - -/* inline directory entry structure */ -struct f2fs_inline_dentry { - __u8 dentry_bitmap[INLINE_DENTRY_BITMAP_SIZE]; - __u8 reserved[INLINE_RESERVED_SIZE]; - struct f2fs_dir_entry dentry[NR_INLINE_DENTRY]; - __u8 filename[NR_INLINE_DENTRY][F2FS_SLOT_LEN]; -} __packed; - /* file types used in inode_info->flags */ enum { F2FS_FT_UNKNOWN, @@ -531,4 +524,6 @@ enum { #define S_SHIFT 12 +#define F2FS_DEF_PROJID 0 /* default project ID */ + #endif /* _LINUX_F2FS_FS_H */ diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h new 
file mode 100644
index 000000000000..4022c61f7e9b
--- /dev/null
+++ b/include/linux/fscrypt_common.h
@@ -0,0 +1,138 @@
+/*
+ * fscrypt_common.h: common declarations for per-file encryption
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * Written by Michael Halcrow, 2015.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+
+#ifndef _LINUX_FSCRYPT_COMMON_H
+#define _LINUX_FSCRYPT_COMMON_H
+
+#include <linux/key.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/bio.h>
+#include <linux/dcache.h>
+#include <crypto/skcipher.h>
+#include <uapi/linux/fs.h>
+
+#define FS_CRYPTO_BLOCK_SIZE 16
+
+struct fscrypt_info;
+
+struct fscrypt_ctx {
+ union {
+ struct {
+ struct page *bounce_page; /* Ciphertext page */
+ struct page *control_page; /* Original page */
+ } w;
+ struct {
+ struct bio *bio;
+ struct work_struct work;
+ } r;
+ struct list_head free_list; /* Free list */
+ };
+ u8 flags; /* Flags */
+};
+
+/**
+ * For encrypted symlinks, the ciphertext length is stored at the beginning
+ * of the string in little-endian format.
+ */
+struct fscrypt_symlink_data {
+ __le16 len;
+ char encrypted_path[1];
+} __packed;
+
+struct fscrypt_str {
+ unsigned char *name;
+ u32 len;
+};
+
+struct fscrypt_name {
+ const struct qstr *usr_fname;
+ struct fscrypt_str disk_name;
+ u32 hash;
+ u32 minor_hash;
+ struct fscrypt_str crypto_buf;
+};
+
+#define FSTR_INIT(n, l) { .name = n, .len = l }
+#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len)
+#define fname_name(p) ((p)->disk_name.name)
+#define fname_len(p) ((p)->disk_name.len)
+
+/*
+ * fscrypt superblock flags
+ */
+#define FS_CFLG_OWN_PAGES (1U << 1)
+
+/*
+ * crypto operations for filesystems
+ */
+struct fscrypt_operations {
+ unsigned int flags;
+ const char *key_prefix;
+ int (*get_context)(struct inode *, void *, size_t);
+ int (*set_context)(struct inode *, const void *, size_t, void *);
+ int (*dummy_context)(struct inode *);
+ bool (*is_encrypted)(struct inode *);
+ bool (*empty_dir)(struct inode *);
+ unsigned (*max_namelen)(struct inode *);
+};
+
+static inline bool fscrypt_dummy_context_enabled(struct inode *inode)
+{
+ if (inode->i_sb->s_cop->dummy_context &&
+ inode->i_sb->s_cop->dummy_context(inode))
+ return true;
+ return false;
+}
+
+static inline bool fscrypt_valid_enc_modes(u32 contents_mode,
+ u32 filenames_mode)
+{
+ if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC &&
+ filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS)
+ return true;
+
+ if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS &&
+ filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS)
+ return true;
+
+ return false;
+}
+
+static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
+{
+ if (str->len == 1 && str->name[0] == '.')
+ return true;
+
+ if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
+ return true;
+
+ return false;
+}
+
+static inline struct page *fscrypt_control_page(struct page *page)
+{
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ return ((struct fscrypt_ctx *)page_private(page))->w.control_page;
+#else
+ WARN_ON_ONCE(1);
+ return ERR_PTR(-EINVAL);
+#endif
+}
+
+static inline int fscrypt_has_encryption_key(const struct inode *inode)
+{
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ return (inode->i_crypt_info != NULL);
+#else
+ return 0;
+#endif
+}
+
+#endif /* _LINUX_FSCRYPT_COMMON_H */
diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h
new file mode 100644
index 000000000000..ec406aed2f2f
--- /dev/null
+++ b/include/linux/fscrypt_notsupp.h
@@ -0,0 +1,177 @@
+/*
+ * fscrypt_notsupp.h
+ *
+ * This stubs out the fscrypt functions for filesystems configured without
+ * encryption support.
+ */
+
+#ifndef _LINUX_FSCRYPT_NOTSUPP_H
+#define _LINUX_FSCRYPT_NOTSUPP_H
+
+#include <linux/fscrypt_common.h>
+
+/* crypto.c */
+static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode,
+ gfp_t gfp_flags)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
+{
+ return;
+}
+
+static inline struct page *fscrypt_encrypt_page(const struct inode *inode,
+ struct page *page,
+ unsigned int len,
+ unsigned int offs,
+ u64 lblk_num, gfp_t gfp_flags)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline int fscrypt_decrypt_page(const struct inode *inode,
+ struct page *page,
+ unsigned int len, unsigned int offs,
+ u64 lblk_num)
+{
+ return -EOPNOTSUPP;
+}
+
+
+static inline void fscrypt_restore_control_page(struct page *page)
+{
+ return;
+}
+
+static inline void fscrypt_set_d_op(struct dentry *dentry)
+{
+ return;
+}
+
+static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry)
+{
+ return;
+}
+
+/* policy.c */
+static inline int fscrypt_ioctl_set_policy(struct file *filp,
+ const void __user *arg)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_has_permitted_context(struct inode *parent,
+ struct inode *child)
+{
+ return 0;
+}
+
+static inline int fscrypt_inherit_context(struct inode *parent,
+ struct inode *child,
+ void *fs_data, bool preload)
+{
+ return -EOPNOTSUPP;
+}
+
+/* keyinfo.c */
+static inline int fscrypt_get_encryption_info(struct inode *inode)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void fscrypt_put_encryption_info(struct inode *inode,
+ struct fscrypt_info *ci)
+{
+ return;
+}
+
+ /* fname.c */
+static inline int fscrypt_setup_filename(struct inode *dir,
+ const struct qstr *iname,
+ int lookup, struct fscrypt_name *fname)
+{
+ if (dir->i_sb->s_cop->is_encrypted(dir))
+ return -EOPNOTSUPP;
+
+ memset(fname, 0, sizeof(struct fscrypt_name));
+ fname->usr_fname = iname;
+ fname->disk_name.name = (unsigned char *)iname->name;
+ fname->disk_name.len = iname->len;
+ return 0;
+}
+
+static inline void fscrypt_free_filename(struct fscrypt_name *fname)
+{
+ return;
+}
+
+static inline u32 fscrypt_fname_encrypted_size(const struct inode *inode,
+ u32 ilen)
+{
+ /* never happens */
+ WARN_ON(1);
+ return 0;
+}
+
+static inline int fscrypt_fname_alloc_buffer(const struct inode *inode,
+ u32 ilen,
+ struct fscrypt_str *crypto_str)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
+{
+ return;
+}
+
+static inline int fscrypt_fname_disk_to_usr(struct inode *inode,
+ u32 hash, u32 minor_hash,
+ const struct fscrypt_str *iname,
+ struct fscrypt_str *oname)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_fname_usr_to_disk(struct inode *inode,
+ const struct qstr *iname,
+ struct fscrypt_str *oname)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool fscrypt_match_name(const struct fscrypt_name *fname,
+ const u8 *de_name, u32 de_name_len)
+{
+ /* Encryption support disabled; use standard comparison */
+ if (de_name_len != fname->disk_name.len)
+ return false;
+ return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len);
+}
+
+/* bio.c */
+static inline void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx,
+ struct bio *bio)
+{
+ return;
+}
+
+static inline void fscrypt_pullback_bio_page(struct page **page, bool restore)
+{
+ return;
+}
+
+static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
+ sector_t pblk, unsigned int len)
+{
+ return -EOPNOTSUPP;
+}
+
+#endif /* _LINUX_FSCRYPT_NOTSUPP_H */
diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h
new file mode 100644
index 000000000000..32e2fcf13b01
--- /dev/null
+++ b/include/linux/fscrypt_supp.h
@@ -0,0 +1,145 @@
+/*
+ * fscrypt_supp.h
+ *
+ * This is included by filesystems configured with encryption support.
+ */
+
+#ifndef _LINUX_FSCRYPT_SUPP_H
+#define _LINUX_FSCRYPT_SUPP_H
+
+#include <linux/fscrypt_common.h>
+
+/* crypto.c */
+extern struct kmem_cache *fscrypt_info_cachep;
+extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t);
+extern void fscrypt_release_ctx(struct fscrypt_ctx *);
+extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *,
+ unsigned int, unsigned int,
+ u64, gfp_t);
+extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int,
+ unsigned int, u64);
+extern void fscrypt_restore_control_page(struct page *);
+
+extern const struct dentry_operations fscrypt_d_ops;
+
+static inline void fscrypt_set_d_op(struct dentry *dentry)
+{
+ d_set_d_op(dentry, &fscrypt_d_ops);
+}
+
+static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY;
+ spin_unlock(&dentry->d_lock);
+}
+
+/* policy.c */
+extern int fscrypt_ioctl_set_policy(struct file *, const void __user *);
+extern int fscrypt_ioctl_get_policy(struct file *, void __user *);
+extern int fscrypt_has_permitted_context(struct inode *, struct inode *);
+extern int fscrypt_inherit_context(struct inode *, struct inode *,
+ void *, bool);
+/* keyinfo.c */
+extern int fscrypt_get_encryption_info(struct inode *);
+extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *);
+
+/* fname.c */
+extern int fscrypt_setup_filename(struct inode *, const struct qstr *,
+ int lookup, struct fscrypt_name *);
+
+static inline void fscrypt_free_filename(struct fscrypt_name *fname)
+{
+ kfree(fname->crypto_buf.name);
+}
+
+extern u32 fscrypt_fname_encrypted_size(const struct inode *, u32);
+extern int fscrypt_fname_alloc_buffer(const struct inode *, u32,
+ struct fscrypt_str *);
+extern void fscrypt_fname_free_buffer(struct fscrypt_str *);
+extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32,
+ const struct fscrypt_str *, struct fscrypt_str *);
+extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *,
+ struct fscrypt_str *);
+
+#define FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE 32
+
+/* Extracts the second-to-last ciphertext block; see explanation below */
+#define FSCRYPT_FNAME_DIGEST(name, len) \
+ ((name) + round_down((len) - FS_CRYPTO_BLOCK_SIZE - 1, \
+ FS_CRYPTO_BLOCK_SIZE))
+
+#define FSCRYPT_FNAME_DIGEST_SIZE FS_CRYPTO_BLOCK_SIZE
+
+/**
+ * fscrypt_digested_name - alternate identifier for an on-disk filename
+ *
+ * When userspace lists an encrypted directory without access to the key,
+ * filenames whose ciphertext is longer than FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE
+ * bytes are shown in this abbreviated form (base64-encoded) rather than as the
+ * full ciphertext (base64-encoded). This is necessary to allow supporting
+ * filenames up to NAME_MAX bytes, since base64 encoding expands the length.
+ *
+ * To make it possible for filesystems to still find the correct directory entry
+ * despite not knowing the full on-disk name, we encode any filesystem-specific
+ * 'hash' and/or 'minor_hash' which the filesystem may need for its lookups,
+ * followed by the second-to-last ciphertext block of the filename.
Due to the + * use of the CBC-CTS encryption mode, the second-to-last ciphertext block + * depends on the full plaintext. (Note that ciphertext stealing causes the + * last two blocks to appear "flipped".) This makes accidental collisions very + * unlikely: just a 1 in 2^128 chance for two filenames to collide even if they + * share the same filesystem-specific hashes. + * + * However, this scheme isn't immune to intentional collisions, which can be + * created by anyone able to create arbitrary plaintext filenames and view them + * without the key. Making the "digest" be a real cryptographic hash like + * SHA-256 over the full ciphertext would prevent this, although it would be + * less efficient and harder to implement, especially since the filesystem would + * need to calculate it for each directory entry examined during a search. + */ +struct fscrypt_digested_name { + u32 hash; + u32 minor_hash; + u8 digest[FSCRYPT_FNAME_DIGEST_SIZE]; +}; + +/** + * fscrypt_match_name() - test whether the given name matches a directory entry + * @fname: the name being searched for + * @de_name: the name from the directory entry + * @de_name_len: the length of @de_name in bytes + * + * Normally @fname->disk_name will be set, and in that case we simply compare + * that to the name stored in the directory entry. The only exception is that + * if we don't have the key for an encrypted directory and a filename in it is + * very long, then we won't have the full disk_name and we'll instead need to + * match against the fscrypt_digested_name. + * + * Return: %true if the name matches, otherwise %false. + */ +static inline bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + if (unlikely(!fname->disk_name.name)) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + if (WARN_ON_ONCE(fname->usr_fname->name[0] != '_')) + return false; + if (de_name_len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) + return false; + return !memcmp(FSCRYPT_FNAME_DIGEST(de_name, de_name_len), + n->digest, FSCRYPT_FNAME_DIGEST_SIZE); + } + + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + +/* bio.c */ +extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); +extern void fscrypt_pullback_bio_page(struct page **, bool); +extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, + unsigned int); + +#endif /* _LINUX_FSCRYPT_SUPP_H */ diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index ff8b11b26f31..e6e53a36104b 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -250,8 +250,8 @@ extern void fscrypt_restore_control_page(struct page *); extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t, unsigned int); /* policy.c */ -extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *); -extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *); +extern int fscrypt_ioctl_set_policy(struct file *, const void __user *); +extern int fscrypt_ioctl_get_policy(struct file *, void __user *); extern int fscrypt_has_permitted_context(struct inode *, struct inode *); extern int fscrypt_inherit_context(struct inode *, struct inode *, void *, bool); @@ -320,14 +320,14 @@ static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p, } /* policy.c */ -static inline int fscrypt_notsupp_process_policy(struct file *f, - const struct fscrypt_policy *p) +static inline int 
fscrypt_notsupp_ioctl_set_policy(struct file *f,
+ const void __user *arg)
{
 return -EOPNOTSUPP;
}

-static inline int fscrypt_notsupp_get_policy(struct inode *i,
- struct fscrypt_policy *p)
+static inline int fscrypt_notsupp_ioctl_get_policy(struct file *f,
+ void __user *arg)
{
 return -EOPNOTSUPP;
}
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 217691582dd4..7063bbcca03b 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -6,8 +6,8 @@
#include <linux/tracepoint.h>

-#define show_dev(entry) MAJOR(entry->dev), MINOR(entry->dev)
-#define show_dev_ino(entry) show_dev(entry), (unsigned long)entry->ino
+#define show_dev(dev) MAJOR(dev), MINOR(dev)
+#define show_dev_ino(entry) show_dev(entry->dev), (unsigned long)entry->ino

TRACE_DEFINE_ENUM(NODE);
TRACE_DEFINE_ENUM(DATA);
@@ -15,6 +15,7 @@ TRACE_DEFINE_ENUM(META);
TRACE_DEFINE_ENUM(META_FLUSH);
TRACE_DEFINE_ENUM(INMEM);
TRACE_DEFINE_ENUM(INMEM_DROP);
+TRACE_DEFINE_ENUM(INMEM_INVALIDATE);
TRACE_DEFINE_ENUM(IPU);
TRACE_DEFINE_ENUM(OPU);
TRACE_DEFINE_ENUM(CURSEG_HOT_DATA);
@@ -43,6 +44,7 @@ TRACE_DEFINE_ENUM(CP_FASTBOOT);
TRACE_DEFINE_ENUM(CP_SYNC);
TRACE_DEFINE_ENUM(CP_RECOVERY);
TRACE_DEFINE_ENUM(CP_DISCARD);
+TRACE_DEFINE_ENUM(CP_TRIMMED);

#define show_block_type(type) \
 __print_symbolic(type, \
@@ -52,6 +54,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
 { META_FLUSH, "META_FLUSH" }, \
 { INMEM, "INMEM" }, \
 { INMEM_DROP, "INMEM_DROP" }, \
+ { INMEM_INVALIDATE, "INMEM_INVALIDATE" }, \
 { INMEM_REVOKE, "INMEM_REVOKE" }, \
 { IPU, "IN-PLACE" }, \
 { OPU, "OUT-OF-PLACE" })
@@ -80,6 +83,12 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
 { REQ_META | REQ_PRIO, "(MP)" }, \
 { 0, " \b" })

+#define show_block_temp(temp) \
+ __print_symbolic(temp, \
+ { HOT, "HOT" }, \
+ { WARM, "WARM" }, \
+ { COLD, "COLD" })
+
#define show_data_type(type) \
 __print_symbolic(type, \
 { CURSEG_HOT_DATA, "Hot DATA" }, \
@@ -116,7 +125,8 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
 { CP_FASTBOOT, "Fastboot" }, \
 { CP_SYNC, "Sync" }, \
 { CP_RECOVERY, "Recovery" }, \
- { CP_DISCARD, "Discard" })
+ { CP_DISCARD, "Discard" }, \
+ { CP_UMOUNT | CP_TRIMMED, "Umount,Trimmed" })

struct victim_sel_policy;
struct f2fs_map_blocks;
@@ -239,7 +249,7 @@ TRACE_EVENT(f2fs_sync_fs,
 ),

 TP_printk("dev = (%d,%d), superblock is %s, wait = %d",
- show_dev(__entry),
+ show_dev(__entry->dev),
 __entry->dirty ?
"dirty" : "not dirty", __entry->wait) ); @@ -309,6 +319,13 @@ DEFINE_EVENT(f2fs__inode_exit, f2fs_unlink_exit, TP_ARGS(inode, ret) ); +DEFINE_EVENT(f2fs__inode_exit, f2fs_drop_inode, + + TP_PROTO(struct inode *inode, int ret), + + TP_ARGS(inode, ret) +); + DEFINE_EVENT(f2fs__inode, f2fs_truncate, TP_PROTO(struct inode *inode), @@ -518,14 +535,14 @@ TRACE_EVENT(f2fs_map_blocks, TRACE_EVENT(f2fs_background_gc, - TP_PROTO(struct super_block *sb, long wait_ms, + TP_PROTO(struct super_block *sb, unsigned int wait_ms, unsigned int prefree, unsigned int free), TP_ARGS(sb, wait_ms, prefree, free), TP_STRUCT__entry( __field(dev_t, dev) - __field(long, wait_ms) + __field(unsigned int, wait_ms) __field(unsigned int, prefree) __field(unsigned int, free) ), @@ -537,13 +554,120 @@ TRACE_EVENT(f2fs_background_gc, __entry->free = free; ), - TP_printk("dev = (%d,%d), wait_ms = %ld, prefree = %u, free = %u", - show_dev(__entry), + TP_printk("dev = (%d,%d), wait_ms = %u, prefree = %u, free = %u", + show_dev(__entry->dev), __entry->wait_ms, __entry->prefree, __entry->free) ); +TRACE_EVENT(f2fs_gc_begin, + + TP_PROTO(struct super_block *sb, bool sync, bool background, + long long dirty_nodes, long long dirty_dents, + long long dirty_imeta, unsigned int free_sec, + unsigned int free_seg, int reserved_seg, + unsigned int prefree_seg), + + TP_ARGS(sb, sync, background, dirty_nodes, dirty_dents, dirty_imeta, + free_sec, free_seg, reserved_seg, prefree_seg), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(bool, sync) + __field(bool, background) + __field(long long, dirty_nodes) + __field(long long, dirty_dents) + __field(long long, dirty_imeta) + __field(unsigned int, free_sec) + __field(unsigned int, free_seg) + __field(int, reserved_seg) + __field(unsigned int, prefree_seg) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->sync = sync; + __entry->background = background; + __entry->dirty_nodes = dirty_nodes; + __entry->dirty_dents = dirty_dents; + __entry->dirty_imeta = dirty_imeta; + __entry->free_sec = free_sec; + __entry->free_seg = free_seg; + __entry->reserved_seg = reserved_seg; + __entry->prefree_seg = prefree_seg; + ), + + TP_printk("dev = (%d,%d), sync = %d, background = %d, nodes = %lld, " + "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " + "rsv_seg:%d, prefree_seg:%u", + show_dev(__entry->dev), + __entry->sync, + __entry->background, + __entry->dirty_nodes, + __entry->dirty_dents, + __entry->dirty_imeta, + __entry->free_sec, + __entry->free_seg, + __entry->reserved_seg, + __entry->prefree_seg) +); + +TRACE_EVENT(f2fs_gc_end, + + TP_PROTO(struct super_block *sb, int ret, int seg_freed, + int sec_freed, long long dirty_nodes, + long long dirty_dents, long long dirty_imeta, + unsigned int free_sec, unsigned int free_seg, + int reserved_seg, unsigned int prefree_seg), + + TP_ARGS(sb, ret, seg_freed, sec_freed, dirty_nodes, dirty_dents, + dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, ret) + __field(int, seg_freed) + __field(int, sec_freed) + __field(long long, dirty_nodes) + __field(long long, dirty_dents) + __field(long long, dirty_imeta) + __field(unsigned int, free_sec) + __field(unsigned int, free_seg) + __field(int, reserved_seg) + __field(unsigned int, prefree_seg) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->ret = ret; + __entry->seg_freed = seg_freed; + __entry->sec_freed = sec_freed; + __entry->dirty_nodes = dirty_nodes; + __entry->dirty_dents = dirty_dents; + 
__entry->dirty_imeta = dirty_imeta; + __entry->free_sec = free_sec; + __entry->free_seg = free_seg; + __entry->reserved_seg = reserved_seg; + __entry->prefree_seg = prefree_seg; + ), + + TP_printk("dev = (%d,%d), ret = %d, seg_freed = %d, sec_freed = %d, " + "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, " + "free_seg:%u, rsv_seg:%d, prefree_seg:%u", + show_dev(__entry->dev), + __entry->ret, + __entry->seg_freed, + __entry->sec_freed, + __entry->dirty_nodes, + __entry->dirty_dents, + __entry->dirty_imeta, + __entry->free_sec, + __entry->free_seg, + __entry->reserved_seg, + __entry->prefree_seg) +); + TRACE_EVENT(f2fs_get_victim, TP_PROTO(struct super_block *sb, int type, int gc_type, @@ -580,7 +704,7 @@ TRACE_EVENT(f2fs_get_victim, TP_printk("dev = (%d,%d), type = %s, policy = (%s, %s, %s), victim = %u " "ofs_unit = %u, pre_victim_secno = %d, prefree = %u, free = %u", - show_dev(__entry), + show_dev(__entry->dev), show_data_type(__entry->type), show_gc_type(__entry->gc_type), show_alloc_mode(__entry->alloc_mode), @@ -717,7 +841,7 @@ TRACE_EVENT(f2fs_reserve_new_blocks, ), TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u, count = %llu", - show_dev(__entry), + show_dev(__entry->dev), (unsigned int)__entry->nid, __entry->ofs_in_node, (unsigned long long)__entry->count) @@ -737,6 +861,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __field(block_t, new_blkaddr) __field(int, op) __field(int, op_flags) + __field(int, temp) __field(int, type) ), @@ -748,16 +873,18 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->new_blkaddr = fio->new_blkaddr; __entry->op = fio->op; __entry->op_flags = fio->op_flags; + __entry->temp = fio->temp; __entry->type = fio->type; ), TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " - "oldaddr = 0x%llx, newaddr = 0x%llx rw = %s%s, type = %s", + "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s(%s), type = %s_%s", show_dev_ino(__entry), (unsigned long)__entry->index, (unsigned long long)__entry->old_blkaddr, (unsigned long long)__entry->new_blkaddr, show_bio_type(__entry->op, __entry->op_flags), + show_block_temp(__entry->temp), show_block_type(__entry->type)) ); @@ -770,7 +897,7 @@ DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_bio, TP_CONDITION(page->mapping) ); -DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_mbio, +DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_write, TP_PROTO(struct page *page, struct f2fs_io_info *fio), @@ -787,6 +914,7 @@ DECLARE_EVENT_CLASS(f2fs__bio, TP_STRUCT__entry( __field(dev_t, dev) + __field(dev_t, target) __field(int, op) __field(int, op_flags) __field(int, type) @@ -796,6 +924,7 @@ DECLARE_EVENT_CLASS(f2fs__bio, TP_fast_assign( __entry->dev = sb->s_dev; + __entry->target = bio->bi_bdev->bd_dev; __entry->op = bio_op(bio); __entry->op_flags = bio->bi_rw; __entry->type = type; @@ -803,8 +932,9 @@ DECLARE_EVENT_CLASS(f2fs__bio, __entry->size = bio->bi_iter.bi_size; ), - TP_printk("dev = (%d,%d), %s%s, %s, sector = %lld, size = %u", - show_dev(__entry), + TP_printk("dev = (%d,%d)/(%d,%d), rw = %s%s, %s, sector = %lld, size = %u", + show_dev(__entry->target), + show_dev(__entry->dev), show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type), (unsigned long long)__entry->sector, @@ -1101,16 +1231,16 @@ TRACE_EVENT(f2fs_write_checkpoint, ), TP_printk("dev = (%d,%d), checkpoint for %s, state = %s", - show_dev(__entry), + show_dev(__entry->dev), show_cpreason(__entry->reason), __entry->msg) ); -TRACE_EVENT(f2fs_issue_discard, +DECLARE_EVENT_CLASS(f2fs_discard, 
- TP_PROTO(struct super_block *sb, block_t blkstart, block_t blklen), + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), - TP_ARGS(sb, blkstart, blklen), + TP_ARGS(dev, blkstart, blklen), TP_STRUCT__entry( __field(dev_t, dev) @@ -1119,22 +1249,36 @@ TRACE_EVENT(f2fs_issue_discard, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = dev->bd_dev; __entry->blkstart = blkstart; __entry->blklen = blklen; ), TP_printk("dev = (%d,%d), blkstart = 0x%llx, blklen = 0x%llx", - show_dev(__entry), + show_dev(__entry->dev), (unsigned long long)__entry->blkstart, (unsigned long long)__entry->blklen) ); +DEFINE_EVENT(f2fs_discard, f2fs_queue_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + +DEFINE_EVENT(f2fs_discard, f2fs_issue_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + TRACE_EVENT(f2fs_issue_reset_zone, - TP_PROTO(struct super_block *sb, block_t blkstart), + TP_PROTO(struct block_device *dev, block_t blkstart), - TP_ARGS(sb, blkstart), + TP_ARGS(dev, blkstart), TP_STRUCT__entry( __field(dev_t, dev) @@ -1142,38 +1286,41 @@ TRACE_EVENT(f2fs_issue_reset_zone, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = dev->bd_dev; __entry->blkstart = blkstart; ), TP_printk("dev = (%d,%d), reset zone at block = 0x%llx", - show_dev(__entry), + show_dev(__entry->dev), (unsigned long long)__entry->blkstart) ); TRACE_EVENT(f2fs_issue_flush, - TP_PROTO(struct super_block *sb, unsigned int nobarrier, - unsigned int flush_merge), + TP_PROTO(struct block_device *dev, unsigned int nobarrier, + unsigned int flush_merge, int ret), - TP_ARGS(sb, nobarrier, flush_merge), + TP_ARGS(dev, nobarrier, flush_merge, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, nobarrier) __field(unsigned int, flush_merge) + __field(int, ret) ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = dev->bd_dev; __entry->nobarrier = nobarrier; __entry->flush_merge = flush_merge; + __entry->ret = ret; ), - TP_printk("dev = (%d,%d), %s %s", - show_dev(__entry), + TP_printk("dev = (%d,%d), %s %s, ret = %d", + show_dev(__entry->dev), __entry->nobarrier ? "skip (nobarrier)" : "issue", - __entry->flush_merge ? " with flush_merge" : "") + __entry->flush_merge ? 
" with flush_merge" : "", + __entry->ret) ); TRACE_EVENT(f2fs_lookup_extent_tree_start, @@ -1286,7 +1433,7 @@ TRACE_EVENT(f2fs_shrink_extent_tree, ), TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u", - show_dev(__entry), + show_dev(__entry->dev), __entry->node_cnt, __entry->tree_cnt) ); @@ -1333,7 +1480,7 @@ DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, ), TP_printk("dev = (%d,%d), %s, dirty count = %lld", - show_dev(__entry), + show_dev(__entry->dev), show_file_type(__entry->type), __entry->count) ); diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 15048097910f..60d27496c328 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -178,6 +178,23 @@ struct inodes_stat_t { /* Policy provided via an ioctl on the topmost directory */ #define FS_KEY_DESCRIPTOR_SIZE 8 +#define FS_POLICY_FLAGS_PAD_4 0x00 +#define FS_POLICY_FLAGS_PAD_8 0x01 +#define FS_POLICY_FLAGS_PAD_16 0x02 +#define FS_POLICY_FLAGS_PAD_32 0x03 +#define FS_POLICY_FLAGS_PAD_MASK 0x03 +#define FS_POLICY_FLAGS_VALID 0x03 + +/* Encryption algorithms */ +#define FS_ENCRYPTION_MODE_INVALID 0 +#define FS_ENCRYPTION_MODE_AES_256_XTS 1 +#define FS_ENCRYPTION_MODE_AES_256_GCM 2 +#define FS_ENCRYPTION_MODE_AES_256_CBC 3 +#define FS_ENCRYPTION_MODE_AES_256_CTS 4 +#define FS_ENCRYPTION_MODE_AES_128_CBC 5 +#define FS_ENCRYPTION_MODE_AES_128_CTS 6 + + struct fscrypt_policy { __u8 version; __u8 contents_encryption_mode; @@ -190,6 +207,19 @@ struct fscrypt_policy { #define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) #define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy) +/* Parameters for passing an encryption key into the kernel keyring */ +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +/* Structure that userspace passes to the kernel keyring */ +#define FS_MAX_KEY_SIZE 64 + +struct fscrypt_key { + __u32 mode; + __u8 raw[FS_MAX_KEY_SIZE]; + __u32 size; +}; + /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) */ diff --git a/mm/util.c b/mm/util.c index d5259b62f8d7..d7b1065644be 100644 --- a/mm/util.c +++ b/mm/util.c @@ -348,6 +348,7 @@ struct address_space *page_mapping(struct page *page) return NULL; return page->mapping; } +EXPORT_SYMBOL(page_mapping); int overcommit_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, From e5e42eca05ada917895af063bc46badf1515043c Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Mon, 8 May 2017 09:33:22 -0700 Subject: [PATCH 1154/3220] ANDROID: binder: Add tracing for binder priority inheritance. 
Bug: 34461621 Change-Id: I5ebb1c0c49fd42a89ee250a1d70221f767c82c7c Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 4 ++++ drivers/android/binder_trace.h | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 32a2b2f44691..64f393239f6f 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1154,6 +1154,10 @@ static void binder_do_set_priority(struct task_struct *task, task->pid, desired.prio, to_kernel_prio(policy, priority)); + trace_binder_set_priority(task->tgid, task->pid, task->normal_prio, + to_kernel_prio(policy, priority), + desired.prio); + /* Set the actual priority */ if (task->policy != policy || is_rt_policy(policy)) { struct sched_param params; diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h index 76e3b9c8a8a2..b11dffc521e8 100644 --- a/drivers/android/binder_trace.h +++ b/drivers/android/binder_trace.h @@ -85,6 +85,30 @@ DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_ioctl_done); DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_write_done); DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_read_done); +TRACE_EVENT(binder_set_priority, + TP_PROTO(int proc, int thread, unsigned int old_prio, + unsigned int desired_prio, unsigned int new_prio), + TP_ARGS(proc, thread, old_prio, new_prio, desired_prio), + + TP_STRUCT__entry( + __field(int, proc) + __field(int, thread) + __field(unsigned int, old_prio) + __field(unsigned int, new_prio) + __field(unsigned int, desired_prio) + ), + TP_fast_assign( + __entry->proc = proc; + __entry->thread = thread; + __entry->old_prio = old_prio; + __entry->new_prio = new_prio; + __entry->desired_prio = desired_prio; + ), + TP_printk("proc=%d thread=%d old=%d => new=%d desired=%d", + __entry->proc, __entry->thread, __entry->old_prio, + __entry->new_prio, __entry->desired_prio) +); + TRACE_EVENT(binder_wait_for_work, TP_PROTO(bool proc_work, bool transaction_stack, bool thread_todo), TP_ARGS(proc_work, transaction_stack, thread_todo), From 3cc621033b682479fb7fdd4028d50e7b7458a08e Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Thu, 24 Aug 2017 15:23:36 +0200 Subject: [PATCH 1155/3220] ANDROID: binder: fix transaction leak. If a call to put_user() fails, we failed to properly free a transaction and send a failed reply (if necessary). Bug: 63117588 Test: binderLibTest Change-Id: Ia98db8cd82ce354a4cdc8811c969988d585c7e31 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 64f393239f6f..bc8d9ecfebec 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2106,6 +2106,26 @@ static void binder_send_failed_reply(struct binder_transaction *t, } } +/** + * binder_cleanup_transaction() - cleans up undelivered transaction + * @t: transaction that needs to be cleaned up + * @reason: reason the transaction wasn't delivered + * @error_code: error to return to caller (if synchronous call) + */ +static void binder_cleanup_transaction(struct binder_transaction *t, + const char *reason, + uint32_t error_code) +{ + if (t->buffer->target_node && !(t->flags & TF_ONE_WAY)) { + binder_send_failed_reply(t, error_code); + } else { + binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, + "undelivered transaction %d, %s\n", + t->debug_id, reason); + binder_free_transaction(t); + } +} + /** * binder_validate_object() - checks for a valid metadata object in a buffer. 
* @buffer: binder_buffer that we're parsing. @@ -4188,12 +4208,20 @@ retry: if (put_user(cmd, (uint32_t __user *)ptr)) { if (t_from) binder_thread_dec_tmpref(t_from); + + binder_cleanup_transaction(t, "put_user failed", + BR_FAILED_REPLY); + return -EFAULT; } ptr += sizeof(uint32_t); if (copy_to_user(ptr, &tr, sizeof(tr))) { if (t_from) binder_thread_dec_tmpref(t_from); + + binder_cleanup_transaction(t, "copy_to_user failed", + BR_FAILED_REPLY); + return -EFAULT; } ptr += sizeof(tr); @@ -4263,15 +4291,9 @@ static void binder_release_work(struct binder_proc *proc, struct binder_transaction *t; t = container_of(w, struct binder_transaction, work); - if (t->buffer->target_node && - !(t->flags & TF_ONE_WAY)) { - binder_send_failed_reply(t, BR_DEAD_REPLY); - } else { - binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, - "undelivered transaction %d\n", - t->debug_id); - binder_free_transaction(t); - } + + binder_cleanup_transaction(t, "process died.", + BR_DEAD_REPLY); } break; case BINDER_WORK_RETURN_ERROR: { struct binder_error *e = container_of( From 139ac8ac89e5918fcbafa24e849e8c277fe2f66c Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 5 Oct 2017 17:54:31 -0700 Subject: [PATCH 1156/3220] FROMLIST: tracing: Prepare to add preempt and irq trace events In preparation of adding irqsoff and preemptsoff enable and disable trace events, move required functions and code to make it easier to add these events in a later patch. This patch is just code movement and no functional change. Change-Id: I587d411da5efbc4959bcccd7a05c7a66c231e1e0 Cc: Steven Rostedt Cc: Peter Zijlstra Cc: kernel-team@android.com Link: https://patchwork.kernel.org/patch/9988159/ Signed-off-by: Joel Fernandes --- kernel/trace/trace_irqsoff.c | 100 ++++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 26 deletions(-) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index be3222b7d72e..0e56eace0dde 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -16,6 +16,7 @@ #include "trace.h" +#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER) static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; @@ -450,64 +451,44 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) #else /* !CONFIG_PROVE_LOCKING */ -/* - * Stubs: - */ - -void trace_softirqs_on(unsigned long ip) -{ -} - -void trace_softirqs_off(unsigned long ip) -{ -} - -inline void print_irqtrace_events(struct task_struct *curr) -{ -} - /* * We are only interested in hardirq on/off events: */ -void trace_hardirqs_on(void) +static inline void tracer_hardirqs_on(void) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } -EXPORT_SYMBOL(trace_hardirqs_on); -void trace_hardirqs_off(void) +static inline void tracer_hardirqs_off(void) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } -EXPORT_SYMBOL(trace_hardirqs_off); -__visible void trace_hardirqs_on_caller(unsigned long caller_addr) +static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } -EXPORT_SYMBOL(trace_hardirqs_on_caller); -__visible void trace_hardirqs_off_caller(unsigned long caller_addr) +static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); } -EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* 
CONFIG_PROVE_LOCKING */
#endif /* CONFIG_IRQSOFF_TRACER */

#ifdef CONFIG_PREEMPT_TRACER
-void trace_preempt_on(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1)
{
 if (preempt_trace() && !irq_trace())
 stop_critical_timing(a0, a1);
}

-void trace_preempt_off(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1)
{
 if (preempt_trace() && !irq_trace())
 start_critical_timing(a0, a1);
@@ -770,3 +751,70 @@ __init static int init_irqsoff_tracer(void)
 return 0;
}
core_initcall(init_irqsoff_tracer);
+#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */
+
+#ifndef CONFIG_IRQSOFF_TRACER
+static inline void tracer_hardirqs_on(void) { }
+static inline void tracer_hardirqs_off(void) { }
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { }
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { }
+#endif
+
+#ifndef CONFIG_PREEMPT_TRACER
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
+#endif
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
+void trace_hardirqs_on(void)
+{
+ tracer_hardirqs_on();
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+ tracer_hardirqs_off();
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+ tracer_hardirqs_on_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+ tracer_hardirqs_off_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+/*
+ * Stubs:
+ */
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACER
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+ tracer_preempt_on(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+ tracer_preempt_off(a0, a1);
+}
+#endif

From e5486e9c8991aa5b55959c96b6b375645b12dad6 Mon Sep 17 00:00:00 2001
From: Joel Fernandes
Date: Thu, 5 Oct 2017 17:54:32 -0700
Subject: [PATCH 1157/3220] FROMLIST: tracing: Add support for preempt and irq enable/disable events

Preempt and irq trace events can be used to mark the start and end of
an atomic section. A trace viewer like systrace can then display these
sections graphically and correlate them with latencies and scheduling
issues.

This also serves as a prelude to rewriting the preempt and irqsoff
tracers on top of synthetic events or probes, which would bring the
numerous benefits of the trace event infrastructure to these events.
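As a rough usage sketch (not part of this patch: the paths assume tracefs
at the usual debugfs mount point, a kernel built with
CONFIG_PREEMPTIRQ_EVENTS, and PROVE_LOCKING disabled so the irq event
pair is available):

  echo 1 > /sys/kernel/debug/tracing/events/preemptirq/irq_disable/enable
  echo 1 > /sys/kernel/debug/tracing/events/preemptirq/irq_enable/enable
  cat /sys/kernel/debug/tracing/trace_pipe

Each record reports the caller and parent of the disable/enable site,
which is what a viewer needs in order to draw the atomic section.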
Change-Id: I718d40f7c3c48579adf9d7121b21495a669c89bd
Cc: Steven Rostedt
Cc: Peter Zijlstra
Cc: kernel-team@android.com
Link: https://patchwork.kernel.org/patch/9988157/
Signed-off-by: Joel Fernandes
---
 include/linux/ftrace.h | 3 +-
 include/trace/events/preemptirq.h | 70 +++++++++++++++++++++++++++++++
 kernel/trace/Kconfig | 11 +++++
 kernel/trace/Makefile | 1 +
 kernel/trace/trace_irqsoff.c | 35 +++++++++++++++-
 5 files changed, 118 insertions(+), 2 deletions(-)
 create mode 100644 include/trace/events/preemptirq.h

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 60048c50404e..ed94cea9eaff 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -702,7 +702,8 @@ static inline void __ftrace_enabled_restore(int enabled)
static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { }
#endif

-#ifdef CONFIG_PREEMPT_TRACER
+#if defined(CONFIG_PREEMPT_TRACER) || \
+ (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
extern void trace_preempt_on(unsigned long a0, unsigned long a1);
extern void trace_preempt_off(unsigned long a0, unsigned long a1);
#else
diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h
new file mode 100644
index 000000000000..f5024c560d8f
--- /dev/null
+++ b/include/trace/events/preemptirq.h
@@ -0,0 +1,70 @@
+#ifdef CONFIG_PREEMPTIRQ_EVENTS
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM preemptirq
+
+#if !defined(_TRACE_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PREEMPTIRQ_H
+
+#include <linux/ktime.h>
+#include <linux/tracepoint.h>
+#include <linux/string.h>
+#include <asm/sections.h>
+
+DECLARE_EVENT_CLASS(preemptirq_template,
+
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+
+ TP_ARGS(ip, parent_ip),
+
+ TP_STRUCT__entry(
+ __field(u32, caller_offs)
+ __field(u32, parent_offs)
+ ),
+
+ TP_fast_assign(
+ __entry->caller_offs = (u32)(ip - (unsigned long)_stext);
+ __entry->parent_offs = (u32)(parent_ip - (unsigned long)_stext);
+ ),
+
+ TP_printk("caller=%pF parent=%pF",
+ (void *)((unsigned long)(_stext) + __entry->caller_offs),
+ (void *)((unsigned long)(_stext) + __entry->parent_offs))
+);
+
+#ifndef CONFIG_PROVE_LOCKING
+DEFINE_EVENT(preemptirq_template, irq_disable,
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+ TP_ARGS(ip, parent_ip));
+
+DEFINE_EVENT(preemptirq_template, irq_enable,
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+ TP_ARGS(ip, parent_ip));
+#endif
+
+#ifdef CONFIG_DEBUG_PREEMPT
+DEFINE_EVENT(preemptirq_template, preempt_disable,
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+ TP_ARGS(ip, parent_ip));
+
+DEFINE_EVENT(preemptirq_template, preempt_enable,
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+ TP_ARGS(ip, parent_ip));
+#endif
+
+#endif /* _TRACE_PREEMPTIRQ_H */
+
+#include <trace/define_trace.h>
+
+#else /* !CONFIG_PREEMPTIRQ_EVENTS */
+
+#define trace_irq_enable(...)
+#define trace_irq_disable(...)
+#define trace_preempt_enable(...)
+#define trace_preempt_disable(...)
+#define trace_irq_enable_rcuidle(...)
+#define trace_irq_disable_rcuidle(...)
+#define trace_preempt_enable_rcuidle(...)
+#define trace_preempt_disable_rcuidle(...)
+
+#endif
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5f5b66a2f156..006eefb6ede0 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -165,6 +165,17 @@ config FUNCTION_GRAPH_TRACER
 address on the current task structure into a stack of calls.

+config PREEMPTIRQ_EVENTS
+ bool "Enable trace events for preempt and irq disable/enable"
+ select TRACE_IRQFLAGS
+ depends on DEBUG_PREEMPT || !PROVE_LOCKING
+ default n
+ help
+ Enable tracing of disable and enable events for preemption and irqs.
+ For tracing preempt disable/enable events, DEBUG_PREEMPT must be
+ enabled. For tracing irq disable/enable events, PROVE_LOCKING must
+ be disabled.
+
config IRQSOFF_TRACER
 bool "Interrupts-off Latency Tracer"
 default n
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index a9bba37fab5a..4b35fb97ae44 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
+obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 0e56eace0dde..21b162c07e83 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -16,6 +16,9 @@
#include "trace.h"

+#define CREATE_TRACE_POINTS
+#include <trace/events/preemptirq.h>
+
#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
static struct trace_array *irqsoff_trace __read_mostly;
static int tracer_enabled __read_mostly;
@@ -765,27 +768,54 @@ static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
#endif

+/* Per-cpu variable to prevent redundant calls when IRQs already off */
+static DEFINE_PER_CPU(int, tracing_irq_cpu);
+
#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
void trace_hardirqs_on(void)
{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
 tracer_hardirqs_on();
+
+ this_cpu_write(tracing_irq_cpu, 0);
}
EXPORT_SYMBOL(trace_hardirqs_on);

void trace_hardirqs_off(void)
{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
 tracer_hardirqs_off();
}
EXPORT_SYMBOL(trace_hardirqs_off);

__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
 tracer_hardirqs_on_caller(caller_addr);
+
+ this_cpu_write(tracing_irq_cpu, 0);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);

__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
 tracer_hardirqs_off_caller(caller_addr);
}
EXPORT_SYMBOL(trace_hardirqs_off_caller);
@@ -807,14 +837,17 @@ inline void print_irqtrace_events(struct task_struct *curr)
}
#endif

-#ifdef CONFIG_PREEMPT_TRACER
+#if defined(CONFIG_PREEMPT_TRACER) || \
+ (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
+ trace_preempt_enable_rcuidle(a0, a1);
 tracer_preempt_on(a0, a1);
}

void trace_preempt_off(unsigned long a0, unsigned long a1)
{
+ trace_preempt_disable_rcuidle(a0, a1);
 tracer_preempt_off(a0, a1);
}
#endif

From 3ef0a0c8590b07fa8dfe0adb2bef6cc3baec5261 Mon Sep 17 00:00:00 2001
From: Behan Webster
Date: Fri, 21 Apr 2017 11:20:01 -0700
Subject: [PATCH 1158/3220] UPSTREAM: kbuild: Add better clang cross build support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add cross target to CC if using clang. Also add custom gcc toolchain
path for fallback gcc tools.

Clang will fall back to using things like ld, as, and libgcc if
(respectively) one of the llvm linkers isn't available, the integrated
assembler is turned off, or an appropriately cross-compiled version of
compiler-rt isn't available. To this end, you can specify the path to
this fallback gcc toolchain with GCC_TOOLCHAIN.

Signed-off-by: Behan Webster
Reviewed-by: Jan-Simon Möller
Reviewed-by: Mark Charlebois
Signed-off-by: Greg Hackmann
Signed-off-by: Matthias Kaehlcke
Signed-off-by: Masahiro Yamada
(cherry picked from commit 785f11aa595bc3d4e74096cbd598ada54ecc0d81)
Signed-off-by: Greg Hackmann
Change-Id: I9e4ca1a149bc793b749952f1b5734bbc11777e65
---
 Makefile | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Makefile b/Makefile
index 0b037152f590..e87105a95b4e 100644
--- a/Makefile
+++ b/Makefile
@@ -694,6 +694,15 @@ endif
KBUILD_CFLAGS += $(stackp-flag)

ifeq ($(cc-name),clang)
+ifneq ($(CROSS_COMPILE),)
+CLANG_TARGET := -target $(notdir $(CROSS_COMPILE:%-=%))
+GCC_TOOLCHAIN := $(realpath $(dir $(shell which $(LD)))/..)
+endif
+ifneq ($(GCC_TOOLCHAIN),)
+CLANG_GCC_TC := -gcc-toolchain $(GCC_TOOLCHAIN)
+endif
+KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
+KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
KBUILD_CPPFLAGS += $(call cc-option,-Wno-unknown-warning-option,)
KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable)

From 8a89d5fc7cce80477a8e8405d8f9c811758a16ed Mon Sep 17 00:00:00 2001
From: Michael Davidson
Date: Tue, 25 Apr 2017 15:47:35 -0700
Subject: [PATCH 1159/3220] UPSTREAM: kbuild: clang: add -no-integrated-as to KBUILD_[AC]FLAGS

The Linux Kernel relies on GCC's acceptance of inline assembly as an
opaque object which will not have any validation performed on the
content. The current behaviour in LLVM is to perform validation of the
contents by means of parsing the input if the MC layer can handle it.

Disable clang's integrated assembler and use the GNU assembler instead.

Wording-mostly-from: Saleem Abdulrasool
Signed-off-by: Michael Davidson
Signed-off-by: Matthias Kaehlcke
Signed-off-by: Masahiro Yamada
(cherry picked from commit a37c45cd82e62a361706b9688a984a3a63957321)
Signed-off-by: Greg Hackmann
Change-Id: Iae412ad5294f8f5e4d66f0085a5dd70f5464ac91
---
 Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Makefile b/Makefile
index e87105a95b4e..75895a4b74b8 100644
--- a/Makefile
+++ b/Makefile
@@ -715,6 +715,8 @@ KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
# See modpost pattern 2
KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,)
KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior)
+KBUILD_CFLAGS += $(call cc-option, -no-integrated-as)
+KBUILD_AFLAGS += $(call cc-option, -no-integrated-as)
else

# These warnings generated too much noise in a regular build.

From a3337e2374f1bddb7fcbc4d14a76b95989f949bb Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke
Date: Wed, 12 Apr 2017 12:43:52 -0700
Subject: [PATCH 1160/3220] UPSTREAM: kbuild: Consolidate header generation from ASM offset information

Largely redundant code is used in different places to generate C
headers from offset information extracted from assembly language
output. Consolidate the code in Makefile.lib and use this instead.
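For reference, a minimal shell sketch of the transformation being
consolidated (the input line is a made-up example of what the compiler
emits for a DEFINE() marker in an asm-offsets.s file; the sed expression
is the generic pattern from the consolidated helper):

  $ printf -- '->TI_flags $8 offsetof(struct thread_info, flags)\n' | \
    sed -n '/^->/{s:^->\([^ ]*\) [$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}'
  #define TI_flags 8 /* offsetof(struct thread_info, flags) */

filechk_offsets then wraps this output in the include guard passed as its
argument, e.g. __DEVICETABLE_OFFSETS_H__.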
Signed-off-by: Matthias Kaehlcke Signed-off-by: Masahiro Yamada (cherry picked from commit ebf003f0cfb3705e60d40dedc3ec949176c741af) Signed-off-by: Greg Hackmann Change-Id: I0acd54dd27c0cf0868f221bd63728a9b67320b25 --- Kbuild | 25 ------------------------- arch/ia64/kernel/Makefile | 26 ++------------------------ scripts/Makefile.lib | 28 ++++++++++++++++++++++++++++ scripts/mod/Makefile | 28 ++-------------------------- 4 files changed, 32 insertions(+), 75 deletions(-) diff --git a/Kbuild b/Kbuild index f55cefd9bf29..f56ed561a284 100644 --- a/Kbuild +++ b/Kbuild @@ -6,31 +6,6 @@ # 3) Generate asm-offsets.h (may need bounds.h and timeconst.h) # 4) Check for missing system calls -# Default sed regexp - multiline due to syntax constraints -define sed-y - "/^->/{s:->#\(.*\):/* \1 */:; \ - s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:->::; p;}" -endef - -# Use filechk to avoid rebuilds when a header changes, but the resulting file -# does not -define filechk_offsets - (set -e; \ - echo "#ifndef $2"; \ - echo "#define $2"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y); \ - echo ""; \ - echo "#endif" ) -endef - ##### # 1) Generate bounds.h diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 3686d6abafde..9edda5466020 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -50,32 +50,10 @@ CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 # The gate DSO image is built using a special linker script. include $(src)/Makefile.gate -# Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) based on config -define sed-y - "/^->/{s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}" -endef -quiet_cmd_nr_irqs = GEN $@ -define cmd_nr_irqs - (set -e; \ - echo "#ifndef __ASM_NR_IRQS_H__"; \ - echo "#define __ASM_NR_IRQS_H__"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " *"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y) $<; \ - echo ""; \ - echo "#endif" ) > $@ -endef - # We use internal kbuild rules to avoid the "is up to date" message from make arch/$(SRCARCH)/kernel/nr-irqs.s: arch/$(SRCARCH)/kernel/nr-irqs.c $(Q)mkdir -p $(dir $@) $(call if_changed_dep,cc_s_c) -include/generated/nr-irqs.h: arch/$(SRCARCH)/kernel/nr-irqs.s - $(Q)mkdir -p $(dir $@) - $(call cmd,nr_irqs) +include/generated/nr-irqs.h: arch/$(SRCARCH)/kernel/nr-irqs.s FORCE + $(call filechk,offsets,__ASM_NR_IRQS_H__) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index c84080885ad4..a6a330671eba 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -394,3 +394,31 @@ quiet_cmd_xzmisc = XZMISC $@ cmd_xzmisc = (cat $(filter-out FORCE,$^) | \ xz --check=crc32 --lzma2=dict=1MiB) > $@ || \ (rm -f $@ ; false) + +# ASM offsets +# --------------------------------------------------------------------------- + +# Default sed regexp - multiline due to syntax constraints +define sed-offsets + "/^->/{s:->#\(.*\):/* \1 */:; \ + s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ + s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ + s:->::; p;}" +endef + +# Use filechk to avoid rebuilds when a header changes, but the resulting file +# does not +define filechk_offsets + (set -e; \ + echo "#ifndef $2"; \ + echo "#define $2"; \ + echo "/*"; \ + echo " * DO NOT 
MODIFY."; \ + echo " *"; \ + echo " * This file was generated by Kbuild"; \ + echo " */"; \ + echo ""; \ + sed -ne $(sed-offsets); \ + echo ""; \ + echo "#endif" ) +endef diff --git a/scripts/mod/Makefile b/scripts/mod/Makefile index c11212ff3510..e0cb2e4a3b15 100644 --- a/scripts/mod/Makefile +++ b/scripts/mod/Makefile @@ -5,32 +5,8 @@ modpost-objs := modpost.o file2alias.o sumversion.o devicetable-offsets-file := devicetable-offsets.h -define sed-y - "/^->/{s:->#\(.*\):/* \1 */:; \ - s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:->::; p;}" -endef - -quiet_cmd_offsets = GEN $@ -define cmd_offsets - (set -e; \ - echo "#ifndef __DEVICETABLE_OFFSETS_H__"; \ - echo "#define __DEVICETABLE_OFFSETS_H__"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " *"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y) $<; \ - echo ""; \ - echo "#endif" ) > $@ -endef - -$(obj)/$(devicetable-offsets-file): $(obj)/devicetable-offsets.s - $(call if_changed,offsets) +$(obj)/$(devicetable-offsets-file): $(obj)/devicetable-offsets.s FORCE + $(call filechk,offsets,__DEVICETABLE_OFFSETS_H__) targets += $(devicetable-offsets-file) devicetable-offsets.s From 54d8c15081750c6a44369bc9372e1277361fd480 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 21 Apr 2017 15:21:10 +0900 Subject: [PATCH 1161/3220] UPSTREAM: kbuild: consolidate redundant sed script ASM offset generation This part ended up in redundant code after touched by multiple people. [1] Commit 3234282f33b2 ("x86, asm: Fix CFI macro invocations to deal with shortcomings in gas") added parentheses for defined expressions to support old gas for x86. [2] Commit a22dcdb0032c ("x86, asm: Fix ancient-GAS workaround") split the pattern into two to avoid parentheses for non-numeric expressions. [3] Commit 95a2f6f72d37 ("Partially revert patch that encloses asm-offset.h numbers in brackets") removed parentheses from numeric expressions as well because parentheses in MN10300 assembly have a special meaning (pointer access). Apparently, there is a conflict between [1] and [3]. After all, [3] took precedence, and a long time has passed since then. Now, merge the two patterns again because the first one is covered by the other. Signed-off-by: Masahiro Yamada Reviewed-by: Matthias Kaehlcke (cherry picked from commit 7dd47b95b0f54f2057d40af6e66d477e3fe95d13) Signed-off-by: Greg Hackmann Change-Id: Idf9e632df984fbc9cb834e7f7b5d33f21da87dbc --- scripts/Makefile.lib | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index a6a330671eba..6219c5fcb801 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -401,7 +401,6 @@ cmd_xzmisc = (cat $(filter-out FORCE,$^) | \ # Default sed regexp - multiline due to syntax constraints define sed-offsets "/^->/{s:->#\(.*\):/* \1 */:; \ - s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ s:->::; p;}" endef From 03e66b365eda022da0e82df4685e1aefcbdc656e Mon Sep 17 00:00:00 2001 From: Jeroen Hofstee Date: Fri, 21 Apr 2017 15:21:11 +0900 Subject: [PATCH 1162/3220] UPSTREAM: kbuild: fix asm-offset generation to work with clang KBuild abuses the asm statement to write to a file and clang chokes about these invalid asm statements. Hack it even more by fooling this is actual valid asm code. 
[masahiro: Import Jeroen's work for U-Boot: http://patchwork.ozlabs.org/patch/375026/ Tweak sed script a little to avoid garbage '#' for GCC case, like #define NR_PAGEFLAGS 23 /* __NR_PAGEFLAGS # */ ] Signed-off-by: Jeroen Hofstee Signed-off-by: Masahiro Yamada Reviewed-by: Matthias Kaehlcke Tested-by: Matthias Kaehlcke (cherry picked from commit cf0c3e68aa81f992b0301f62e341b710d385bf68) Signed-off-by: Greg Hackmann Change-Id: Ifbfd4eff59a7f4304f0d8fdcba4075100244562f --- include/linux/kbuild.h | 6 +++--- scripts/Makefile.lib | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/linux/kbuild.h b/include/linux/kbuild.h index 22a72198c14b..4e80f3a9ad58 100644 --- a/include/linux/kbuild.h +++ b/include/linux/kbuild.h @@ -2,14 +2,14 @@ #define __LINUX_KBUILD_H #define DEFINE(sym, val) \ - asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val)) -#define BLANK() asm volatile("\n->" : : ) +#define BLANK() asm volatile("\n.ascii \"->\"" : : ) #define OFFSET(sym, str, mem) \ DEFINE(sym, offsetof(struct str, mem)) #define COMMENT(x) \ - asm volatile("\n->#" x) + asm volatile("\n.ascii \"->#" x "\"") #endif diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 6219c5fcb801..c3b3c94e2446 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -399,10 +399,14 @@ cmd_xzmisc = (cat $(filter-out FORCE,$^) | \ # --------------------------------------------------------------------------- # Default sed regexp - multiline due to syntax constraints +# +# Use [:space:] because LLVM's integrated assembler inserts <tab> around +# the .ascii directive whereas GCC keeps the <space> as-is. define sed-offsets - "/^->/{s:->#\(.*\):/* \1 */:; \ + 's:^[[:space:]]*\.ascii[[:space:]]*"\(.*\)".*:\1:; \ + /^->/{s:->#\(.*\):/* \1 */:; \ s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:->::; p;}" + s:->::; p;}' endef # Use filechk to avoid rebuilds when a header changes, but the resulting file From 4b44c97fed0555b2406188ad49ac96d6be0fc329 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 13 Apr 2017 07:25:21 +0900 Subject: [PATCH 1163/3220] UPSTREAM: kbuild: drop -Wno-unknown-warning-option from clang options Since commit c3f0d0bc5b01 ("kbuild, LLVMLinux: Add -Werror to cc-option to support clang"), cc-option and friends work nicely for clang. However, -Wno-unknown-warning-option makes clang happy with any unknown warning options even if -Werror is specified. Once -Wno-unknown-warning-option is added, any succeeding call of cc-disable-warning is evaluated as positive, and then unknown warning options are accepted. This should be dropped.
Signed-off-by: Masahiro Yamada (cherry picked from commit a0ae981eba8f07dbc74bce38fd3a462b69a5bc8e) Signed-off-by: Greg Hackmann Change-Id: I0535e20fbcecc2d431e9f08b1f274c5d96626af1 --- Makefile | 1 - scripts/Makefile.extrawarn | 1 - 2 files changed, 2 deletions(-) diff --git a/Makefile b/Makefile index 75895a4b74b8..ecc86bec38e5 100644 --- a/Makefile +++ b/Makefile @@ -704,7 +704,6 @@ endif KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) -KBUILD_CPPFLAGS += $(call cc-option,-Wno-unknown-warning-option,) KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable) KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index da3386a9d244..abe5f47b1ab0 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -61,7 +61,6 @@ ifeq ($(cc-name),clang) KBUILD_CFLAGS += $(call cc-disable-warning, initializer-overrides) KBUILD_CFLAGS += $(call cc-disable-warning, unused-value) KBUILD_CFLAGS += $(call cc-disable-warning, format) -KBUILD_CFLAGS += $(call cc-disable-warning, unknown-warning-option) KBUILD_CFLAGS += $(call cc-disable-warning, sign-compare) KBUILD_CFLAGS += $(call cc-disable-warning, format-zero-length) KBUILD_CFLAGS += $(call cc-disable-warning, uninitialized) From cafc5bc76374ba807d570055be4e217572d33a49 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 5 Apr 2017 14:29:32 -0700 Subject: [PATCH 1164/3220] BACKPORT: kbuild, LLVMLinux: Add -Werror to cc-option to support clang Clang will warn about unknown warnings but will not return false unless -Werror is set. GCC will return false if an unknown warning is passed. Adding -Werror makes both compilers behave the same. [arnd: it turns out we need the same patch for testing whether -ffunction-sections works right with gcc. I've build tested extensively with this patch applied, so let's just merge this one now.]
Upstream commit: c3f0d0bc5b01 Change-Id: I72c97bab5deaa47adef1bc535dcf19b7d2e0dbdf Signed-off-by: Mark Charlebois Signed-off-by: Behan Webster Reviewed-by: Jan-Simon Möller Signed-off-by: Arnd Bergmann Acked-by: Kees Cook Signed-off-by: Masahiro Yamada Signed-off-by: Matthias Kaehlcke Signed-off-by: Greg Hackmann --- scripts/Kbuild.include | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 1db6d73c8dd2..30d9343f0c4b 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -111,12 +111,12 @@ as-instr = $(call try-run,\ # Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586) cc-option = $(call try-run,\ - $(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) + $(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) # cc-option-yn # Usage: flag := $(call cc-option-yn,-march=winchip-c6) cc-option-yn = $(call try-run,\ - $(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",y,n) + $(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",y,n) # cc-option-align # Prefix align with either -falign or -malign @@ -126,7 +126,7 @@ cc-option-align = $(subst -functions=0,,\ # cc-disable-warning # Usage: cflags-y += $(call cc-disable-warning,unused-but-set-variable) cc-disable-warning = $(call try-run,\ - $(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) + $(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) # cc-name # Expands to either gcc or clang From 6602531121707294674793ed27100118be4b87bc Mon Sep 17 00:00:00 2001 From: Behan Webster Date: Mon, 27 Mar 2017 18:19:09 -0700 Subject: [PATCH 1165/3220] UPSTREAM: kbuild: use -Oz instead of -Os when using clang This generates smaller resulting object code when compiled with clang. Signed-off-by: Behan Webster Signed-off-by: Matthias Kaehlcke Signed-off-by: Masahiro Yamada (cherry picked from commit 6748cb3c299de1ffbe56733647b01dbcc398c419) Signed-off-by: Greg Hackmann Change-Id: I5336ef3af6c7c638d9a68661c3c0e3f22693fdc8 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ecc86bec38e5..ce950588a177 100644 --- a/Makefile +++ b/Makefile @@ -624,7 +624,7 @@ KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow) KBUILD_CFLAGS += $(call cc-disable-warning, int-in-bool-context) ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE -KBUILD_CFLAGS += -Os +KBUILD_CFLAGS += $(call cc-option,-Oz,-Os) else ifdef CONFIG_PROFILE_ALL_BRANCHES KBUILD_CFLAGS += -O2 From 588ae6ad5f5f31ea2c065553252a681a015a4c79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vin=C3=ADcius=20Tinti?= Date: Mon, 24 Apr 2017 13:04:58 -0700 Subject: [PATCH 1166/3220] BACKPORT: kbuild: Add support to generate LLVM assembly files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add rules to kbuild in order to generate LLVM assembly files with the .ll extension when using clang.
# from c code make CC=clang kernel/pid.ll Signed-off-by: Vinícius Tinti Signed-off-by: Behan Webster Signed-off-by: Matthias Kaehlcke Signed-off-by: Masahiro Yamada (cherry picked from commit 433db3e260bc8134d4a46ddf20b3668937e12556) Signed-off-by: Greg Hackmann Change-Id: I1fcc7ec14357e19e46cc2dd1772c5c258aec91d1 --- .gitignore | 1 + Makefile | 5 +++++ scripts/Makefile.build | 8 ++++++++ 3 files changed, 14 insertions(+) diff --git a/.gitignore b/.gitignore index fa3e5f1d0808..b17dcd217d83 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ *.lzo *.patch *.gcno +*.ll modules.builtin Module.symvers *.dwo diff --git a/Makefile b/Makefile index ce950588a177..482f92e952d4 100644 --- a/Makefile +++ b/Makefile @@ -1291,6 +1291,8 @@ help: @echo ' (default: $$(INSTALL_MOD_PATH)/lib/firmware)' @echo ' dir/ - Build all files in dir and below' @echo ' dir/file.[ois] - Build specified target only' + @echo ' dir/file.ll - Build the LLVM assembly file' + @echo ' (requires compiler support for LLVM assembly generation)' @echo ' dir/file.lst - Build specified mixed source/assembly target only' @echo ' (requires a recent binutils and recent build (System.map))' @echo ' dir/file.ko - Build module including final link' @@ -1466,6 +1468,7 @@ clean: $(clean-dirs) -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \ -o -name '*.symtypes' -o -name 'modules.order' \ -o -name modules.builtin -o -name '.tmp_*.o.*' \ + -o -name '*.ll' \ -o -name '*.gcno' \) -type f -print | xargs rm -f # Generate tags for editors @@ -1569,6 +1572,8 @@ endif $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@) %.symtypes: %.c prepare scripts FORCE $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@) +%.ll: %.c prepare scripts FORCE + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@) # Modules /: prepare scripts FORCE diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 01df30af4d4a..411281c36898 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -174,6 +174,14 @@ cmd_cc_symtypes_c = \ $(obj)/%.symtypes : $(src)/%.c FORCE $(call cmd,cc_symtypes_c) +# LLVM assembly +# Generate .ll files from .c +quiet_cmd_cc_ll_c = CC $(quiet_modtag) $@ + cmd_cc_ll_c = $(CC) $(c_flags) -emit-llvm -S -o $@ $< + +$(obj)/%.ll: $(src)/%.c FORCE + $(call if_changed_dep,cc_ll_c) + # C (.c) files # The C file is compiled and updated dependency information is generated. # (See cmd_cc_o_c + relevant part of rule_cc_o_c) From b8a6c2329c3e11c6a671ef0349f709ec2a91384a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 1 Feb 2017 18:00:14 +0100 Subject: [PATCH 1167/3220] UPSTREAM: modules: mark __inittest/__exittest as __maybe_unused clang warns about unused inline functions by default: arch/arm/crypto/aes-cipher-glue.c:68:1: warning: unused function '__inittest' [-Wunused-function] arch/arm/crypto/aes-cipher-glue.c:69:1: warning: unused function '__exittest' [-Wunused-function] As these appear in every single module, let's just disable the warnings by marking the two functions as __maybe_unused. 
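A minimal sketch of what the macro expands to, and why clang complained (hypothetical driver name, types simplified from include/linux/module.h):

  typedef int (*initcall_t)(void);

  static int mydrv_init(void)     /* hypothetical module init function */
  {
          return 0;
  }

  /* What module_init(mydrv_init) boils down to: __inittest() exists only
   * so the compiler type-checks that mydrv_init matches initcall_t.  It
   * is never called, so clang's -Wunused-function fires in every module
   * unless the function carries __maybe_unused, i.e.
   * __attribute__((unused)). */
  static inline initcall_t __attribute__((unused)) __inittest(void)
  {
          return mydrv_init;
  }
  int init_module(void) __attribute__((alias("mydrv_init")));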
Signed-off-by: Arnd Bergmann Reviewed-by: Miroslav Benes Acked-by: Rusty Russell Signed-off-by: Jessica Yu (cherry picked from commit 1f318a8bafcfba9f0d623f4870c4e890fd22e659) Signed-off-by: Greg Hackmann Change-Id: I39c75bdb61834020320d41a678dfcc9442f07e4b --- include/linux/module.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/module.h b/include/linux/module.h index b229a9961d02..e2324653c8ef 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -125,13 +125,13 @@ extern void cleanup_module(void); /* Each module must use one module_init(). */ #define module_init(initfn) \ - static inline initcall_t __inittest(void) \ + static inline initcall_t __maybe_unused __inittest(void) \ { return initfn; } \ int init_module(void) __attribute__((alias(#initfn))); /* This is only required if you want to be unloadable. */ #define module_exit(exitfn) \ - static inline exitcall_t __exittest(void) \ + static inline exitcall_t __maybe_unused __exittest(void) \ { return exitfn; } \ void cleanup_module(void) __attribute__((alias(#exitfn))); From f01b0c528e75ee1e8ec13da27bd987f34b8fbe10 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Feb 2016 15:38:32 +0100 Subject: [PATCH 1168/3220] UPSTREAM: Kbuild: provide a __UNIQUE_ID for clang The default __UNIQUE_ID macro in compiler.h fails to work for some drivers: drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c:615:1: error: redefinition of '__UNIQUE_ID_firmware615' BRCMF_FW_NVRAM_DEF(4354, "brcmfmac4354-sdio.bin", "brcmfmac4354-sdio.txt"); This adds a copy of the version we use for gcc-4.3 and higher, as the same one works with all versions of clang that I could find in svn (2.6 and higher). Signed-off-by: Arnd Bergmann Signed-off-by: Michal Marek (cherry picked from commit b41c29b0527c7fd6a95d0f71274abb79933bf960) Signed-off-by: Greg Hackmann Change-Id: I161dfa3ccb6b226966c3c87bba6b2fff1561bc61 --- include/linux/compiler-clang.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index d1e49d52b640..de179993e039 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -10,3 +10,8 @@ #undef uninitialized_var #define uninitialized_var(x) x = *(&(x)) #endif + +/* same as gcc, this was present in clang-2.6 so we can assume it works + * with any version that can compile the kernel + */ +#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) From 79ec10f9925af458c1cb5dfe31cbb12a7f841d1c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 6 Jun 2017 13:36:24 -0700 Subject: [PATCH 1169/3220] UPSTREAM: compiler, clang: suppress warning for unused static inline functions GCC explicitly does not warn for unused static inline functions for -Wunused-function. The manual states: Warn whenever a static function is declared but not defined or a non-inline static function is unused. Clang does warn for static inline functions that are unused. It turns out that suppressing the warnings avoids potentially complex #ifdef directives, which also reduces LOC. Suppress the warning for clang.
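A sketch of the pattern involved (function name invented; imagine it living in a widely included header):

  /* Typically defined in a shared header, used by only some includers: */
  static inline int widget_is_ready(int state)
  {
          return state > 0;
  }

  /* In a translation unit that includes the header but never calls
   * widget_is_ready(), gcc stays silent (static inline is exempt from
   * -Wunused-function) while clang warns.  With this patch, clang
   * effectively sees
   *   static inline __attribute__((unused)) int widget_is_ready(...)
   * and stays quiet as well. */
  int consumer_does_other_things(void)
  {
          return 0;
  }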
Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds (cherry picked from commit abb2ea7dfd82451d85ce669b811310c05ab5ca46) Signed-off-by: Greg Hackmann Change-Id: I68e6246b03c962cc87b9d0bf4b7fefeda27068c0 --- include/linux/compiler-clang.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index de179993e039..ea9126006a69 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -15,3 +15,10 @@ * with any version that can compile the kernel */ #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) + +/* + * GCC does not warn about unused static inline functions for + * -Wunused-function. This turns out to avoid the need for complex #ifdef + * directives. Suppress the warning in clang as well. + */ +#define inline inline __attribute__((unused)) From f2ea3999ecf1108b3ae0dcc14eab420701e4cf0a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 11 Jun 2017 15:51:56 -0700 Subject: [PATCH 1170/3220] UPSTREAM: compiler, clang: properly override 'inline' for clang Commit abb2ea7dfd82 ("compiler, clang: suppress warning for unused static inline functions") just caused more warnings due to re-defining the 'inline' macro. So undef it before re-defining it, and also add the 'notrace' attribute like the gcc version that this is overriding does. Maybe this makes clang happier. Signed-off-by: Linus Torvalds (cherry picked from commit 6d53cefb18e4646fb4bf62ccb6098fb3808486df) Signed-off-by: Greg Hackmann Change-Id: Ie01b45583954c6104c854a3810e35c1171764e78 --- include/linux/compiler-clang.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index ea9126006a69..d614c5ea1b5e 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -21,4 +21,5 @@ * -Wunused-function. This turns out to avoid the need for complex #ifdef * directives. Suppress the warning in clang as well. */ -#define inline inline __attribute__((unused)) +#undef inline +#define inline inline __attribute__((unused)) notrace From 3717411e828d8af7428987070f14819d9d809636 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Thu, 13 Apr 2017 10:26:09 -0700 Subject: [PATCH 1171/3220] UPSTREAM: x86/kbuild: Use cc-option to enable -falign-{jumps/loops} clang currently does not support these optimizations, so only enable them when they are available.
Signed-off-by: Matthias Kaehlcke Cc: Greg Hackmann Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Michael Davidson Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: grundler@chromium.org Link: http://lkml.kernel.org/r/20170413172609.118122-1-mka@chromium.org Signed-off-by: Ingo Molnar (cherry picked from commit 2c4fd1ac3ff167c91272dc43c7bfd2269ef61557) Signed-off-by: Greg Hackmann Change-Id: Id040421dcf782c9a5b20a72cf68360b36da8f824 --- arch/x86/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 53949c886341..360713b8e258 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -88,10 +88,10 @@ else KBUILD_CFLAGS += -m64 # Align jump targets to 1 byte, not the default 16 bytes: - KBUILD_CFLAGS += -falign-jumps=1 + KBUILD_CFLAGS += $(call cc-option,-falign-jumps=1) # Pack loops tightly as well: - KBUILD_CFLAGS += -falign-loops=1 + KBUILD_CFLAGS += $(call cc-option,-falign-loops=1) # Don't autogenerate traditional x87 instructions KBUILD_CFLAGS += $(call cc-option,-mno-80387) From 2a3a3862527dbdba157c5973cbd231a20b20d253 Mon Sep 17 00:00:00 2001 From: Michael Davidson Date: Wed, 15 Mar 2017 15:36:00 -0700 Subject: [PATCH 1172/3220] UPSTREAM: crypto, x86: aesni - fix token pasting for clang aes_ctrby8_avx-x86_64.S uses the C preprocessor for token pasting of character sequences that are not valid preprocessor tokens. While this is allowed when preprocessing assembler files, it exposes an incompatibility between the clang and gcc preprocessors where clang does not strip leading white space from macro parameters, leading to the CONCAT(%xmm, i) macro expansion on line 96 resulting in a token with a space character embedded in it. While this could be resolved by deleting the offending space character, the assembler is perfectly capable of doing the token pasting correctly for itself so we can just get rid of the preprocessor macros. Signed-off-by: Michael Davidson Signed-off-by: Herbert Xu (cherry picked from commit fdb2726f4e61c5e3abc052f547d5a5f6c0dc5504) Signed-off-by: Greg Hackmann Change-Id: I087414d3575ea7b8703f39d429ccbf0361b314ae --- arch/x86/crypto/aes_ctrby8_avx-x86_64.S | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S index a916c4a61165..5f6a5af9c489 100644 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -65,7 +65,6 @@ #include #include -#define CONCAT(a,b) a##b #define VMOVDQ vmovdqu #define xdata0 %xmm0 @@ -92,8 +91,6 @@ #define num_bytes %r8 #define tmp %r10 -#define DDQ(i) CONCAT(ddq_add_,i) -#define XMM(i) CONCAT(%xmm, i) #define DDQ_DATA 0 #define XDATA 1 #define KEY_128 1 @@ -131,12 +128,12 @@ ddq_add_8: /* generate a unique variable for ddq_add_x */ .macro setddq n - var_ddq_add = DDQ(\n) + var_ddq_add = ddq_add_\n .endm /* generate a unique variable for xmm register */ .macro setxdata n - var_xdata = XMM(\n) + var_xdata = %xmm\n .endm /* club the numeric 'id' to the symbol 'name' */ From cafc5bc76374ba807d570055be4e217572d33a49 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Fri, 5 May 2017 09:50:49 -0700 Subject: [PATCH 1173/3220] BACKPORT: x86/mm/kaslr: Use the _ASM_MUL macro for multiplication to work around Clang incompatibility The constraint "rm" allows the compiler to put mix_const into memory. When the input operand is a memory location then MUL needs an operand size suffix, since Clang can't infer the multiplication width from the operand.
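Reduced to a standalone sketch (this mirrors the get_random_long() pattern rather than quoting it verbatim):

  unsigned long diffuse(unsigned long random, unsigned long mix_const)
  {
          unsigned long raw;

          /* "rm" lets the compiler keep mix_const in memory.  When it
           * does, bare "mul %3" has no register operand from which the
           * assembler could infer an operand size, so the instruction
           * cannot be sized and clang's integrated assembler rejects
           * it; "mulq %3" (or _ASM_MUL) is unambiguous. */
          asm("mul %3"
              : "=a" (random), "=d" (raw)
              : "a" (random), "rm" (mix_const));
          return random + raw;
  }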
Add and use the _ASM_MUL macro which determines the operand size and resolves to the MUL instruction with the corresponding suffix. This fixes the following error when building with clang: CC arch/x86/lib/kaslr.o /tmp/kaslr-dfe1ad.s: Assembler messages: /tmp/kaslr-dfe1ad.s:182: Error: no instruction mnemonic suffix given and no register operands; can't size instruction Upstream commit: 121843eb02a6 Change-Id: I53f51839705dabeb6c950d1def3a45881294129c Signed-off-by: Matthias Kaehlcke Cc: Grant Grundler Cc: Greg Hackmann Cc: Kees Cook Cc: Linus Torvalds Cc: Michael Davidson Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170501224741.133938-1-mka@chromium.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Hackmann --- arch/x86/boot/compressed/aslr.c | 2 +- arch/x86/include/asm/asm.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 6a9b96b4624d..4a5fbd2da658 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -94,7 +94,7 @@ static unsigned long get_random_long(void) } /* Circular multiply for better bit diffusion */ - asm("mul %3" + asm(_ASM_MUL "%3" : "=a" (random), "=d" (raw) : "a" (random), "rm" (mix_const)); random += raw; diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 189679aba703..1f16ec50abeb 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -32,6 +32,7 @@ #define _ASM_ADD __ASM_SIZE(add) #define _ASM_SUB __ASM_SIZE(sub) #define _ASM_XADD __ASM_SIZE(xadd) +#define _ASM_MUL __ASM_SIZE(mul) #define _ASM_AX __ASM_REG(ax) #define _ASM_BX __ASM_REG(bx) From fd5df2a435a5f2015b3b11504574e500ca3ab452 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 30 May 2016 12:56:27 +0200 Subject: [PATCH 1174/3220] BACKPORT: x86/hweight: Get rid of the special calling convention People complained about ARCH_HWEIGHT_CFLAGS and how it throws a wrench into experimentation with kcov, lto, etc. Add asm versions for __sw_hweight{32,64}() and do explicit saving and restoring of clobbered registers. This gets rid of the special calling convention. We get to call those functions on !X86_FEATURE_POPCNT CPUs. We still need to hardcode POPCNT and register operands as some old gas versions which we support, do not know about POPCNT. Btw, remove redundant REX prefix from 32-bit POPCNT because alternatives can do padding now. Suggested-by: H.
Peter Anvin Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1464605787-20603-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar (cherry picked from commit f5967101e9de12addcda4510dfbac66d7c5779c3) Signed-off-by: Matthias Kaehlcke Signed-off-by: Greg Hackmann Conflicts: lib/Makefile Change-Id: Ie7e6dce51c7093b1162337ec8bfc5abde0d79688 --- arch/x86/Kconfig | 5 -- arch/x86/include/asm/arch_hweight.h | 24 ++++----- arch/x86/kernel/i386_ksyms_32.c | 2 + arch/x86/kernel/x8664_ksyms_64.c | 3 ++ arch/x86/lib/Makefile | 2 +- arch/x86/lib/hweight.S | 77 +++++++++++++++++++++++++++++ lib/Makefile | 2 - lib/hweight.c | 4 ++ 8 files changed, 97 insertions(+), 22 deletions(-) create mode 100644 arch/x86/lib/hweight.S diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc9dbc5aa80f..412166ac999f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -296,11 +296,6 @@ config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR -config ARCH_HWEIGHT_CFLAGS - string - default "-fcall-saved-ecx -fcall-saved-edx" if X86_32 - default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64 - config ARCH_SUPPORTS_UPROBES def_bool y diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index 259a7c1ef709..44f825c80ed5 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -2,8 +2,8 @@ #define _ASM_X86_HWEIGHT_H #ifdef CONFIG_64BIT -/* popcnt %edi, %eax -- redundant REX prefix for alignment */ -#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7" +/* popcnt %edi, %eax */ +#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc7" /* popcnt %rdi, %rax */ #define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7" #define REG_IN "D" @@ -15,19 +15,15 @@ #define REG_OUT "a" #endif -/* - * __sw_hweightXX are called from within the alternatives below - * and callee-clobbered registers need to be taken care of. See - * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective - * compiler switches.
- */ +#define __HAVE_ARCH_SW_HWEIGHT + static __always_inline unsigned int __arch_hweight32(unsigned int w) { - unsigned int res = 0; + unsigned int res; asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT) - : "="REG_OUT (res) - : REG_IN (w)); + : "="REG_OUT (res) + : REG_IN (w)); return res; } @@ -51,11 +47,11 @@ static inline unsigned long __arch_hweight64(__u64 w) #else static __always_inline unsigned long __arch_hweight64(__u64 w) { - unsigned long res = 0; + unsigned long res; asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT) - : "="REG_OUT (res) - : REG_IN (w)); + : "="REG_OUT (res) + : REG_IN (w)); return res; } diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 64341aa485ae..d40ee8a38fed 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -42,3 +42,5 @@ EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(___preempt_schedule); EXPORT_SYMBOL(___preempt_schedule_notrace); #endif + +EXPORT_SYMBOL(__sw_hweight32); diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a0695be19864..c7efd394c42b 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -42,6 +42,9 @@ EXPORT_SYMBOL(clear_page); EXPORT_SYMBOL(csum_partial); +EXPORT_SYMBOL(__sw_hweight32); +EXPORT_SYMBOL(__sw_hweight64); + /* * Export string functions. We normally rely on gcc builtin for most of these, * but gcc sometimes decides not to inline them. diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f2587888d987..db4a1c6ea785 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -22,7 +22,7 @@ lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o -obj-y += msr.o msr-reg.o msr-reg-export.o +obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S new file mode 100644 index 000000000000..02de3d74d2c5 --- /dev/null +++ b/arch/x86/lib/hweight.S @@ -0,0 +1,77 @@ +#include + +#include + +/* + * unsigned int __sw_hweight32(unsigned int w) + * %rdi: w + */ +ENTRY(__sw_hweight32) + +#ifdef CONFIG_X86_64 + movl %edi, %eax # w +#endif + __ASM_SIZE(push,) %__ASM_REG(dx) + movl %eax, %edx # w -> t + shrl %edx # t >>= 1 + andl $0x55555555, %edx # t &= 0x55555555 + subl %edx, %eax # w -= t + + movl %eax, %edx # w -> t + shrl $2, %eax # w_tmp >>= 2 + andl $0x33333333, %edx # t &= 0x33333333 + andl $0x33333333, %eax # w_tmp &= 0x33333333 + addl %edx, %eax # w = w_tmp + t + + movl %eax, %edx # w -> t + shrl $4, %edx # t >>= 4 + addl %edx, %eax # w_tmp += t + andl $0x0f0f0f0f, %eax # w_tmp &= 0x0f0f0f0f + imull $0x01010101, %eax, %eax # w_tmp *= 0x01010101 + shrl $24, %eax # w = w_tmp >> 24 + __ASM_SIZE(pop,) %__ASM_REG(dx) + ret +ENDPROC(__sw_hweight32) + +ENTRY(__sw_hweight64) +#ifdef CONFIG_X86_64 + pushq %rdx + + movq %rdi, %rdx # w -> t + movabsq $0x5555555555555555, %rax + shrq %rdx # t >>= 1 + andq %rdx, %rax # t &= 0x5555555555555555 + movabsq $0x3333333333333333, %rdx + subq %rax, %rdi # w -= t + + movq %rdi, %rax # w -> t + shrq $2, %rdi # w_tmp >>= 2 + andq %rdx, %rax # t &= 0x3333333333333333 + andq %rdi, %rdx # w_tmp &= 0x3333333333333333 + addq %rdx, %rax # w = w_tmp + t + + movq %rax, %rdx # w -> t + shrq $4, %rdx # t >>= 4 + addq %rdx, %rax # w_tmp += t + movabsq $0x0f0f0f0f0f0f0f0f, %rdx + andq %rdx, %rax # w_tmp &= 0x0f0f0f0f0f0f0f0f + movabsq $0x0101010101010101, 
%rdx + imulq %rdx, %rax # w_tmp *= 0x0101010101010101 + shrq $56, %rax # w = w_tmp >> 56 + + popq %rdx + ret +#else /* CONFIG_X86_32 */ + /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */ + pushl %ecx + + call __sw_hweight32 + movl %eax, %ecx # stash away result + movl %edx, %eax # second part of input + call __sw_hweight32 + addl %ecx, %eax # result + + popl %ecx + ret +#endif +ENDPROC(__sw_hweight64) diff --git a/lib/Makefile b/lib/Makefile index 7f1de26613d2..cb4f6aa95013 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -58,8 +58,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o -GCOV_PROFILE_hweight.o := n -CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o obj-$(CONFIG_BTREE) += btree.o diff --git a/lib/hweight.c b/lib/hweight.c index 9a5c1f221558..43273a7d83cf 100644 --- a/lib/hweight.c +++ b/lib/hweight.c @@ -9,6 +9,7 @@ * The Hamming Weight of a number is the total number of bits set in it. */ +#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned int __sw_hweight32(unsigned int w) { #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER @@ -25,6 +26,7 @@ unsigned int __sw_hweight32(unsigned int w) #endif } EXPORT_SYMBOL(__sw_hweight32); +#endif unsigned int __sw_hweight16(unsigned int w) { @@ -43,6 +45,7 @@ unsigned int __sw_hweight8(unsigned int w) } EXPORT_SYMBOL(__sw_hweight8); +#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned long __sw_hweight64(__u64 w) { #if BITS_PER_LONG == 32 @@ -65,3 +68,4 @@ unsigned long __sw_hweight64(__u64 w) #endif } EXPORT_SYMBOL(__sw_hweight64); +#endif From 3afdf761c528f1407cb1e5facb258619f55f3b72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 8 Aug 2016 20:35:29 +0300 Subject: [PATCH 1175/3220] UPSTREAM: x86/hweight: Don't clobber %rdi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The caller expects %rdi to remain intact, push+pop it to make that happen.
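For context, the call site looks roughly like this (a simplified sketch of __arch_hweight64(), with the ALTERNATIVE() machinery stripped out):

  extern unsigned long __sw_hweight64(unsigned long w);

  static inline unsigned long arch_hweight64(unsigned long w)
  {
          unsigned long res;

          /* The call is hidden inside the asm body; %rdi appears only
           * as an input ("D") with no clobber, so the compiler assumes
           * %rdi still holds w after the asm.  If __sw_hweight64
           * trashes it, the caller silently computes with garbage. */
          asm("call __sw_hweight64"
              : "=a" (res)
              : "D" (w));
          return res;
  }

Since %rdi is never declared as an output or clobber, the callee has to preserve it.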
Fixes the following kind of explosions on my core2duo machine when trying to reboot or shut down: general protection fault: 0000 [#1] PREEMPT SMP Modules linked in: i915 i2c_algo_bit drm_kms_helper cfbfillrect syscopyarea cfbimgblt sysfillrect sysimgblt fb_sys_fops cfbcopyarea drm netconsole configfs binfmt_misc iTCO_wdt psmouse pcspkr snd_hda_codec_idt e100 coretemp hwmon snd_hda_codec_generic i2c_i801 mii i2c_smbus lpc_ich mfd_core snd_hda_intel uhci_hcd snd_hda_codec snd_hwdep snd_hda_core ehci_pci 8250 ehci_hcd snd_pcm 8250_base usbcore evdev serial_core usb_common parport_pc parport snd_timer snd soundcore CPU: 0 PID: 3070 Comm: reboot Not tainted 4.8.0-rc1-perf-dirty #69 Hardware name: /D946GZIS, BIOS TS94610J.86A.0087.2007.1107.1049 11/07/2007 task: ffff88012a0b4080 task.stack: ffff880123850000 RIP: 0010:[] [] x86_perf_event_update+0x52/0xc0 RSP: 0018:ffff880123853b60 EFLAGS: 00010087 RAX: 0000000000000001 RBX: ffff88012fc0a3c0 RCX: 000000000000001e RDX: 0000000000000000 RSI: 0000000040000000 RDI: ffff88012b014800 RBP: ffff880123853b88 R08: ffffffffffffffff R09: 0000000000000000 R10: ffffea0004a012c0 R11: ffffea0004acedc0 R12: ffffffff80000001 R13: ffff88012b0149c0 R14: ffff88012b014800 R15: 0000000000000018 FS: 00007f8b155cd700(0000) GS:ffff88012fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8b155f5000 CR3: 000000012a2d7000 CR4: 00000000000006f0 Stack: ffff88012fc0a3c0 ffff88012b014800 0000000000000004 0000000000000001 ffff88012fc1b750 ffff880123853bb0 ffffffff81003d59 ffff88012b014800 ffff88012fc0a3c0 ffff88012b014800 ffff880123853bd8 ffffffff81003e13 Call Trace: [] x86_pmu_stop+0x59/0xd0 [] x86_pmu_del+0x43/0x140 [] event_sched_out.isra.105+0xbd/0x260 [] __perf_remove_from_context+0x2d/0xb0 [] __perf_event_exit_context+0x4d/0x70 [] generic_exec_single+0xb6/0x140 [] ? __perf_remove_from_context+0xb0/0xb0 [] ? __perf_remove_from_context+0xb0/0xb0 [] smp_call_function_single+0xdf/0x140 [] perf_event_exit_cpu_context+0x87/0xc0 [] perf_reboot+0x13/0x40 [] notifier_call_chain+0x4a/0x70 [] __blocking_notifier_call_chain+0x47/0x60 [] blocking_notifier_call_chain+0x16/0x20 [] kernel_restart_prepare+0x1d/0x40 [] kernel_restart+0x12/0x60 [] SYSC_reboot+0xf6/0x1b0 [] ? mntput_no_expire+0x2c/0x1b0 [] ? mntput+0x24/0x40 [] ? __fput+0x16c/0x1e0 [] ? ____fput+0xe/0x10 [] ? task_work_run+0x83/0xa0 [] ? exit_to_usermode_loop+0x53/0xc0 [] ? trace_hardirqs_on_thunk+0x1a/0x1c [] SyS_reboot+0xe/0x10 [] entry_SYSCALL_64_fastpath+0x18/0xa3 Code: 7c 4c 8d af c0 01 00 00 49 89 fe eb 10 48 09 c2 4c 89 e0 49 0f b1 55 00 4c 39 e0 74 35 4d 8b a6 c0 01 00 00 41 8b 8e 60 01 00 00 <0f> 33 8b 35 6e 02 8c 00 48 c1 e2 20 85 f6 7e d2 48 89 d3 89 cf RIP [] x86_perf_event_update+0x52/0xc0 RSP ---[ end trace 7ec95181faf211be ]--- note: reboot[3070] exited with preempt_count 2 Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Fixes: f5967101e9de ("x86/hweight: Get rid of the special calling convention") Signed-off-by: Ville Syrjälä Signed-off-by: Linus Torvalds (cherry picked from commit 65ea11ec6a82b1d44aba62b59e9eb20247e57c6e) Signed-off-by: Greg Hackmann Change-Id: Ib004aa044ba9fc73cfff97fe78c8607008ca3846 --- arch/x86/lib/hweight.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S index 02de3d74d2c5..8a602a1e404a 100644 --- a/arch/x86/lib/hweight.S +++ b/arch/x86/lib/hweight.S @@ -35,6 +35,7 @@ ENDPROC(__sw_hweight32) ENTRY(__sw_hweight64) #ifdef CONFIG_X86_64 + pushq %rdi pushq %rdx movq %rdi, %rdx # w -> t @@ -60,6 +61,7 @@ ENTRY(__sw_hweight64) shrq $56, %rax # w = w_tmp >> 56 popq %rdx + popq %rdi ret #else /* CONFIG_X86_32 */ /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */ From 423ba0fede66c9e367d2bab64339f85d6ce1d6f8 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 21 Jun 2017 16:28:03 -0700 Subject: [PATCH 1176/3220] BACKPORT: kbuild: Add __cc-option macro cc-option uses KBUILD_CFLAGS and KBUILD_CPPFLAGS when it determines whether an option is supported or not. This is fine for options used to build the kernel itself, however some components like the x86 boot code use a different set of flags. Add the new macro __cc-option which is a more generic version of cc-option with additional parameters. One parameter is the compiler with which the check should be performed, the other the compiler options to be used instead of KBUILD_C*FLAGS. Refactor cc-option and hostcc-option to use __cc-option and move hostcc-option to scripts/Kbuild.include. Suggested-by: Arnd Bergmann Suggested-by: Masahiro Yamada Signed-off-by: Matthias Kaehlcke Acked-by: Arnd Bergmann Acked-by: Michal Marek Signed-off-by: Masahiro Yamada (cherry picked from commit 9f3f1fd299768782465cb32cdf0dd4528d11f26b) Signed-off-by: Greg Hackmann Conflicts: scripts/Kbuild.include Change-Id: I4c8288b9c74bd6b9199307a0e04b78a27e28361d --- Makefile | 2 +- scripts/Kbuild.include | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 482f92e952d4..9f32fb972fb6 100644 --- a/Makefile +++ b/Makefile @@ -301,7 +301,7 @@ CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ HOSTCC = gcc HOSTCXX = g++ -HOSTCFLAGS = -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 +HOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 HOSTCXXFLAGS = -O2 ifeq ($(shell $(HOSTCC) -v 2>&1 | grep -c "clang version"), 1) diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 30d9343f0c4b..8a1bb64f1dcd 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -107,11 +107,25 @@ as-option = $(call try-run,\ as-instr = $(call try-run,\ printf "%b\n" "$(1)" | $(CC) $(KBUILD_AFLAGS) -c -x assembler -o "$$TMP" -,$(2),$(3)) +# __cc-option +# Usage: MY_CFLAGS += $(call __cc-option,$(CC),$(MY_CFLAGS),-march=winchip-c6,-march=i586) +__cc-option = $(call try-run,\ + $(1) -Werror $(2) $(3) -c -x c /dev/null -o "$$TMP",$(3),$(4)) + +# Do not attempt to build with gcc plugins during cc-option tests. +# (And this uses delayed resolution so the flags will be up to date.)
+CC_OPTION_CFLAGS = $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS)) + # cc-option # Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586) -cc-option = $(call try-run,\ - $(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) +cc-option = $(call __cc-option, $(CC),\ + $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS),$(1),$(2)) + +# hostcc-option +# Usage: cflags-y += $(call hostcc-option,-march=winchip-c6,-march=i586) +hostcc-option = $(call __cc-option, $(HOSTCC),\ + $(HOSTCFLAGS) $(HOST_EXTRACFLAGS),$(1),$(2)) # cc-option-yn # Usage: flag := $(call cc-option-yn,-march=winchip-c6) From 203bd5f980384f6c5449191a42865950e5d86b3a Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 21 Jun 2017 16:28:04 -0700 Subject: [PATCH 1177/3220] UPSTREAM: x86/build: Use __cc-option for boot code compiler options cc-option is used to enable compiler options for the boot code if they are available. The macro uses KBUILD_CFLAGS and KBUILD_CPPFLAGS for the check, however these flags aren't used to build the boot code, in consequence cc-option can yield wrong results. For example -mpreferred-stack-boundary=2 is never set with a 64-bit compiler, since the setting is only valid for 16 and 32-bit binaries. This is also the case for 32-bit kernel builds, because the option -m32 is added to KBUILD_CFLAGS after the assignment of REALMODE_CFLAGS. Use __cc-option instead of cc-option for the boot mode options. The macro receives the compiler options as parameter instead of using KBUILD_C*FLAGS, for the boot code we pass REALMODE_CFLAGS. Also use separate statements for the __cc-option checks instead of performing them in the initial assignment of REALMODE_CFLAGS since the variable is an input of the macro. Signed-off-by: Matthias Kaehlcke Acked-by: Ingo Molnar Signed-off-by: Masahiro Yamada (cherry picked commit 032a2c4f65a2f81c93e161a11197ba19bc14a909) Signed-off-by: Greg Hackmann Change-Id: I7756f875771edb00238eb770be912f713407681a --- arch/x86/Makefile | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 360713b8e258..e3b8c237b828 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -24,10 +24,11 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ - -mno-mmx -mno-sse \ - $(call cc-option, -ffreestanding) \ - $(call cc-option, -fno-stack-protector) \ - $(call cc-option, -mpreferred-stack-boundary=2) + -mno-mmx -mno-sse + +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -mpreferred-stack-boundary=2) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit From 6613aeae39d692d5a1f0432b76184b7ef84b4072 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 21 Jun 2017 16:28:05 -0700 Subject: [PATCH 1178/3220] UPSTREAM: x86/build: Specify stack alignment for clang For gcc stack alignment is configured with -mpreferred-stack-boundary=N, clang has the option -mstack-alignment=N for that purpose. Use the same alignment as with gcc. If the alignment is not specified clang assumes an alignment of 16 bytes, as required by the standard ABI. 
However as mentioned in d9b0cde91c60 ("x86-64, gcc: Use -mpreferred-stack-boundary=3 if supported") the standard kernel entry on x86-64 leaves the stack on an 8-byte boundary, as a consequence clang will keep the stack misaligned. Signed-off-by: Matthias Kaehlcke Acked-by: Ingo Molnar Signed-off-by: Masahiro Yamada (cherry picked commit d77698df39a512911586834d303275ea5fda74d0) Signed-off-by: Greg Hackmann Change-Id: I4283d10c6fe31cf194b35adc5371732b89eb3ae3 --- arch/x86/Makefile | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index e3b8c237b828..cb6af304d456 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -11,6 +11,14 @@ else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif +# For gcc stack alignment is specified with -mpreferred-stack-boundary, +# clang has the option -mstack-alignment for that purpose. +ifneq ($(call cc-option, -mpreferred-stack-boundary=4),) + cc_stack_align_opt := -mpreferred-stack-boundary +else ifneq ($(call cc-option, -mstack-alignment=4),) + cc_stack_align_opt := -mstack-alignment +endif + # How to compile the 16-bit code. Note we always compile for -march=i386; # that way we can complain to the user if the CPU is insufficient. # @@ -28,7 +36,7 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) -REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -mpreferred-stack-boundary=2) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align_opt)=2) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit @@ -65,8 +73,10 @@ ifeq ($(CONFIG_X86_32),y) # with nonstandard options KBUILD_CFLAGS += -fno-pic - # prevent gcc from keeping the stack 16 byte aligned - KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) + # Align the stack to the register width instead of using the default + # alignment of 16 bytes. This reduces stack usage and the number of + # alignment instructions. + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=2) # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots: @@ -100,8 +110,14 @@ else KBUILD_CFLAGS += -fno-pic - # Use -mpreferred-stack-boundary=3 if supported. - KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3) + # By default gcc and clang use a stack alignment of 16 bytes for x86. + # However the standard kernel entry on x86-64 leaves the stack on an + # 8-byte boundary. If the compiler isn't informed about the actual + # alignment it will generate extra alignment instructions for the + # default alignment which keep the stack *mis*aligned. + # Furthermore an alignment to the register width reduces stack usage + # and the number of alignment instructions. + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=3) # Use -mskip-rax-setup if supported. KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) From ea2f9b02ed2fb2a0ab3d0f3d427db27ef550dab2 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Fri, 21 Apr 2017 14:39:30 -0700 Subject: [PATCH 1179/3220] UPSTREAM: kbuild: clang: Disable 'address-of-packed-member' warning clang generates plenty of these warnings in different parts of the code, to an extent that the warnings are little more than noise. Disable the 'address-of-packed-member' warning. 
Signed-off-by: Matthias Kaehlcke Reviewed-by: Douglas Anderson Signed-off-by: Masahiro Yamada (cherry picked from commit bfb38988c51e440fd7062ddf3157f7d8b1dd5d70) Signed-off-by: Greg Hackmann Change-Id: I35ecf1b35a908d41ee791a8a651e3cfb4edd081b --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 9f32fb972fb6..c8d44abe2df2 100644 --- a/Makefile +++ b/Makefile @@ -707,6 +707,7 @@ KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable) KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) +KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) # Quiet clang warning: comparison of unsigned expression < 0 is always false KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare) # CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the From b9c115c89fcc8c3d8038cd7d14f8e754cebbcb93 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 26 Apr 2017 17:11:32 +0100 Subject: [PATCH 1180/3220] UPSTREAM: crypto: arm64/sha - avoid non-standard inline asm tricks Replace the inline asm which exports struct offsets as ELF symbols with proper const variables exposing the same values. This works around an issue with Clang which does not interpret the "i" (or "I") constraints in the same way as GCC. Signed-off-by: Ard Biesheuvel Tested-by: Matthias Kaehlcke Signed-off-by: Herbert Xu (cherry picked from commit f4857f4c2ee9aa4e2aacac1a845352b00197fb57) Signed-off-by: Greg Hackmann Change-Id: I1f882de15bd447d6fc41858dfc0cbfd3f6e2466c --- arch/arm64/crypto/sha1-ce-core.S | 6 ++++-- arch/arm64/crypto/sha1-ce-glue.c | 11 +++-------- arch/arm64/crypto/sha2-ce-core.S | 6 ++++-- arch/arm64/crypto/sha2-ce-glue.c | 13 +++++-------- 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S index c98e7e849f06..8550408735a0 100644 --- a/arch/arm64/crypto/sha1-ce-core.S +++ b/arch/arm64/crypto/sha1-ce-core.S @@ -82,7 +82,8 @@ ENTRY(sha1_ce_transform) ldr dgb, [x0, #16] /* load sha1_ce_state::finalize */ - ldr w4, [x0, #:lo12:sha1_ce_offsetof_finalize] + ldr_l w4, sha1_ce_offsetof_finalize, x4 + ldr w4, [x0, x4] /* load input */ 0: ld1 {v8.4s-v11.4s}, [x1], #64 @@ -132,7 +133,8 @@ CPU_LE( rev32 v11.16b, v11.16b ) * the padding is handled by the C code in that case. 
*/ cbz x4, 3f - ldr x4, [x0, #:lo12:sha1_ce_offsetof_count] + ldr_l w4, sha1_ce_offsetof_count, x4 + ldr x4, [x0, x4] movi v9.2d, #0 mov x8, #0x80000000 movi v10.2d, #0 diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index aefda9868627..ea319c055f5d 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -17,9 +17,6 @@ #include #include -#define ASM_EXPORT(sym, val) \ - asm(".globl " #sym "; .set " #sym ", %0" :: "I"(val)); - MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); @@ -32,6 +29,9 @@ struct sha1_ce_state { asmlinkage void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, int blocks); +const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count); +const u32 sha1_ce_offsetof_finalize = offsetof(struct sha1_ce_state, finalize); + static int sha1_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { @@ -52,11 +52,6 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, struct sha1_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE); - ASM_EXPORT(sha1_ce_offsetof_count, - offsetof(struct sha1_ce_state, sst.count)); - ASM_EXPORT(sha1_ce_offsetof_finalize, - offsetof(struct sha1_ce_state, finalize)); - /* * Allow the asm code to perform the finalization if there is no * partial data and the input is a round multiple of the block size. diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S index 01cfee066837..679c6c002f4f 100644 --- a/arch/arm64/crypto/sha2-ce-core.S +++ b/arch/arm64/crypto/sha2-ce-core.S @@ -88,7 +88,8 @@ ENTRY(sha2_ce_transform) ld1 {dgav.4s, dgbv.4s}, [x0] /* load sha256_ce_state::finalize */ - ldr w4, [x0, #:lo12:sha256_ce_offsetof_finalize] + ldr_l w4, sha256_ce_offsetof_finalize, x4 + ldr w4, [x0, x4] /* load input */ 0: ld1 {v16.4s-v19.4s}, [x1], #64 @@ -136,7 +137,8 @@ CPU_LE( rev32 v19.16b, v19.16b ) * the padding is handled by the C code in that case. 
*/ cbz x4, 3f - ldr x4, [x0, #:lo12:sha256_ce_offsetof_count] + ldr_l w4, sha256_ce_offsetof_count, x4 + ldr x4, [x0, x4] movi v17.2d, #0 mov x8, #0x80000000 movi v18.2d, #0 diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index 7cd587564a41..0ed9486f75dd 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -17,9 +17,6 @@ #include #include -#define ASM_EXPORT(sym, val) \ - asm(".globl " #sym "; .set " #sym ", %0" :: "I"(val)); - MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); @@ -32,6 +29,11 @@ struct sha256_ce_state { asmlinkage void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src, int blocks); +const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state, + sst.count); +const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state, + finalize); + static int sha256_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { @@ -52,11 +54,6 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data, struct sha256_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE); - ASM_EXPORT(sha256_ce_offsetof_count, - offsetof(struct sha256_ce_state, sst.count)); - ASM_EXPORT(sha256_ce_offsetof_finalize, - offsetof(struct sha256_ce_state, finalize)); - /* * Allow the asm code to perform the finalization if there is no * partial data and the input is a round multiple of the block size. From e221e075ddd44d40fc4a51613c667168752e0051 Mon Sep 17 00:00:00 2001 From: Michael Davidson Date: Mon, 24 Jul 2017 16:51:55 -0700 Subject: [PATCH 1181/3220] UPSTREAM: x86/boot: #undef memcpy() et al in string.c undef memcpy() and friends in boot/string.c so that the functions defined here will have the correct names, otherwise we end up up trying to redefine __builtin_memcpy() etc. Surprisingly, GCC allows this (and, helpfully, discards the __builtin_ prefix from the function name when compiling it), but clang does not. Adding these #undef's appears to preserve what I assume was the original intent of the code. (cherry picked from commit 18d5e6c34a8eda438d5ad8b3b15f42dab01bf05d) Change-Id: I616a6a8ece533166367d987597e8c405c96441a2 Signed-off-by: Michael Davidson Signed-off-by: Matthias Kaehlcke Acked-by: H. Peter Anvin Cc: Arnd Bergmann Cc: Bernhard.Rosenkranzer@linaro.org Cc: Greg Hackmann Cc: Kees Cook Cc: Linus Torvalds Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170724235155.79255-1-mka@chromium.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Hackmann --- arch/x86/boot/string.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 06ceddb3a22e..1d56adea8a7c 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -16,6 +16,15 @@ #include "ctype.h" #include "string.h" +/* + * Undef these macros so that the functions that we provide + * here will have the correct names regardless of how string.h + * may have chosen to #define them. 
+ */ +#undef memcpy +#undef memset +#undef memcmp + int memcmp(const void *s1, const void *s2, size_t len) { u8 diff; From 75eb3438b28512e33877354349d01f2dd659f0e6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 6 Jul 2017 15:35:24 -0700 Subject: [PATCH 1182/3220] UPSTREAM: compiler, clang: always inline when CONFIG_OPTIMIZE_INLINING is disabled The motivation for commit abb2ea7dfd82 ("compiler, clang: suppress warning for unused static inline functions") was to suppress clang's warnings about unused static inline functions. For configs without CONFIG_OPTIMIZE_INLINING enabled, such as any non-x86 architecture, `inline' in the kernel implies that __attribute__((always_inline)) is used. Some code depends on that behavior, see https://lkml.org/lkml/2017/6/13/918: net/built-in.o: In function `__xchg_mb': arch/arm64/include/asm/cmpxchg.h:99: undefined reference to `__compiletime_assert_99' arch/arm64/include/asm/cmpxchg.h:99: undefined reference to `__compiletime_assert_99 The full fix would be to identify these breakages and annotate the functions with __always_inline instead of `inline'. But since we are late in the 4.12-rc cycle, simply carry forward the forced inlining behavior and work toward moving arm64, and other architectures, toward CONFIG_OPTIMIZE_INLINING behavior. (cherry picked from commit 9a04dbcfb33b4012d0ce8c0282f1e3ca694675b1) Change-Id: I13891c2f1e588d8c7febe5d2d57134abb31d6ecd Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1706261552200.1075@chino.kir.corp.google.com Signed-off-by: David Rientjes Reported-by: Sodagudi Prasad Tested-by: Sodagudi Prasad Tested-by: Matthias Kaehlcke Cc: Mark Rutland Cc: Will Deacon Cc: Catalin Marinas Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Hackmann --- include/linux/compiler-clang.h | 8 -------- include/linux/compiler-gcc.h | 18 +++++++++++------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index d614c5ea1b5e..de179993e039 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -15,11 +15,3 @@ * with any version that can compile the kernel */ #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) - -/* - * GCC does not warn about unused static inline functions for - * -Wunused-function. This turns out to avoid the need for complex #ifdef - * directives. Suppress the warning in clang as well. - */ -#undef inline -#define inline inline __attribute__((unused)) notrace diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 287e698c28de..557dae96ce74 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -66,18 +66,22 @@ /* * Force always-inline if the user requests it so via the .config, - * or if gcc is too old: + * or if gcc is too old. + * GCC does not warn about unused static inline functions for + * -Wunused-function. This turns out to avoid the need for complex #ifdef + * directives. Suppress the warning in clang as well by using "unused" + * function attribute, which is redundant but not harmful for gcc. 
*/ #if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ !defined(CONFIG_OPTIMIZE_INLINING) || (__GNUC__ < 4) -#define inline inline __attribute__((always_inline)) notrace -#define __inline__ __inline__ __attribute__((always_inline)) notrace -#define __inline __inline __attribute__((always_inline)) notrace +#define inline inline __attribute__((always_inline,unused)) notrace +#define __inline__ __inline__ __attribute__((always_inline,unused)) notrace +#define __inline __inline __attribute__((always_inline,unused)) notrace #else /* A lot of inline functions can cause havoc with function tracing */ -#define inline inline notrace -#define __inline__ __inline__ notrace -#define __inline __inline notrace +#define inline inline __attribute__((unused)) notrace +#define __inline__ __inline__ __attribute__((unused)) notrace +#define __inline __inline __attribute__((unused)) notrace #endif #define __always_inline inline __attribute__((always_inline)) From 00b6078ea64bfbbc539a8924a328053cc6d16114 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Aug 2017 20:49:36 +0100 Subject: [PATCH 1183/3220] BACKPORT: efi/libstub/arm64: Force 'hidden' visibility for section markers To prevent the compiler from emitting absolute references to the section markers when running in PIC mode, override the visibility to 'hidden' for all contents of asm/sections.h Tested-by: Matthias Kaehlcke Signed-off-by: Ard Biesheuvel Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20170818194947.19347-4-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit 0426a4e68f18d75515414361de9e3e1445d2644e) Signed-off-by: Greg Hackmann Change-Id: Ia438c3f0aa6abdbd9057dfe1db732a25aa98ef40 --- drivers/firmware/efi/libstub/arm64-stub.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index e0e6b74fef8f..1b78215d6cd2 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -9,9 +9,17 @@ * published by the Free Software Foundation. * */ + +/* + * To prevent the compiler from emitting GOT-indirected (and thus absolute) + * references to the section markers, override their visibility as 'hidden' + */ +#pragma GCC visibility push(hidden) +#include +#pragma GCC visibility pop + #include #include -#include #include "efistub.h" From 2f2860a504a30a7645c6a0ec06767c5c7677a4ea Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Aug 2017 20:49:37 +0100 Subject: [PATCH 1184/3220] UPSTREAM: efi/libstub/arm64: Set -fpie when building the EFI stub Clang may emit absolute symbol references when building in non-PIC mode, even when using the default 'small' code model, which is already mostly position independent to begin with, due to its use of adrp/add pairs that have a relative range of +/- 4 GB. The remedy is to pass the -fpie flag, which can be done safely now that the code has been updated to avoid GOT indirections (which may be emitted due to the compiler assuming that the PIC/PIE code may end up in a shared library that is subject to ELF symbol preemption) Passing -fpie when building code that needs to execute at an a priori unknown offset is arguably an improvement in any case, and given that the recent visibility changes allow the PIC build to pass with GCC as well, let's add -fpie for all arm64 builds rather than only for Clang. 
Tested-by: Matthias Kaehlcke
Signed-off-by: Ard Biesheuvel
Cc: Linus Torvalds
Cc: Matt Fleming
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/20170818194947.19347-5-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar
(cherry picked from commit 91ee5b21ee026c49e4e7483de69b55b8b47042be)
Signed-off-by: Greg Hackmann
Change-Id: I0a011945239d39a2d1eb04c20bf1b9ceb7d2b91d
---
 drivers/firmware/efi/libstub/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 68bdd9f23e13..d1f1bde85d3f 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -10,7 +10,7 @@ cflags-$(CONFIG_X86)		+= -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 \
 				   -fPIC -fno-strict-aliasing -mno-red-zone \
 				   -mno-mmx -mno-sse

-cflags-$(CONFIG_ARM64)		:= $(subst -pg,,$(KBUILD_CFLAGS))
+cflags-$(CONFIG_ARM64)		:= $(subst -pg,,$(KBUILD_CFLAGS)) -fpie
 cflags-$(CONFIG_ARM)		:= $(subst -pg,,$(KBUILD_CFLAGS)) \
 				   -fno-builtin -fpic -mno-single-pic-base

From a61090a6d556ce3a8024b4b084bd7b41ffd2908e Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke
Date: Wed, 16 Aug 2017 17:47:40 -0700
Subject: [PATCH 1185/3220] UPSTREAM: x86/build: Fix stack alignment for CLang

Commit:

  d77698df39a5 ("x86/build: Specify stack alignment for clang")

intended to use the same stack alignment for clang as with gcc.

The two compilers use different options to configure the stack
alignment (gcc: -mpreferred-stack-boundary=n, clang:
-mstack-alignment=n).

The above commit assumes that the clang option uses the same parameter
type as gcc, i.e. that the alignment is specified as 2^n. However,
clang interprets the value of this option literally to use an alignment
of n; as a consequence, the stack remains misaligned.

Change the values used with -mstack-alignment to be the actual
alignment instead of a power of two.

cc-option isn't used here with the typical pattern of KBUILD_CFLAGS +=
$(call cc-option ...). The reason is that older gcc versions don't
support the -mpreferred-stack-boundary option; since cc-option doesn't
verify whether the alternative option is valid, it would incorrectly
select the clang option -mstack-alignment.

Signed-off-by: Matthias Kaehlcke
Cc: Arnd Bergmann
Cc: Bernhard.Rosenkranzer@linaro.org
Cc: Greg Hackmann
Cc: Kees Cook
Cc: Linus Torvalds
Cc: Masahiro Yamada
Cc: Michael Davidson
Cc: Nick Desaulniers
Cc: Peter Zijlstra
Cc: Stephen Hines
Cc: Thomas Gleixner
Cc: dianders@chromium.org
Link: http://lkml.kernel.org/r/20170817004740.170588-1-mka@chromium.org
Signed-off-by: Ingo Molnar
(cherry picked from commit 8f91869766c00622b2eaa8ee567db4f333b78c1a)
Signed-off-by: Greg Hackmann
Change-Id: I7991bfed754f5ac10ac8b383c20ec89d56b2afc0
---
 arch/x86/Makefile | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index cb6af304d456..d2ec032a649a 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -14,9 +14,11 @@ endif

 # For gcc stack alignment is specified with -mpreferred-stack-boundary,
 # clang has the option -mstack-alignment for that purpose.
 ifneq ($(call cc-option, -mpreferred-stack-boundary=4),)
-      cc_stack_align_opt := -mpreferred-stack-boundary
-else ifneq ($(call cc-option, -mstack-alignment=4),)
-      cc_stack_align_opt := -mstack-alignment
+      cc_stack_align4 := -mpreferred-stack-boundary=2
+      cc_stack_align8 := -mpreferred-stack-boundary=3
+else ifneq ($(call cc-option, -mstack-alignment=16),)
+      cc_stack_align4 := -mstack-alignment=4
+      cc_stack_align8 := -mstack-alignment=8
 endif

 # How to compile the 16-bit code. Note we always compile for -march=i386;
@@ -36,7 +38,7 @@ REALMODE_CFLAGS	:= $(M16_CFLAGS) -g -Os -D__KERNEL__ \

 REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding)
 REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector)
-REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align_opt)=2)
+REALMODE_CFLAGS += $(cc_stack_align4)
 export REALMODE_CFLAGS

 # BITS is used as extension for files which are available in a 32 bit
@@ -76,7 +78,7 @@ ifeq ($(CONFIG_X86_32),y)
         # Align the stack to the register width instead of using the default
         # alignment of 16 bytes. This reduces stack usage and the number of
         # alignment instructions.
-        KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=2)
+        KBUILD_CFLAGS += $(cc_stack_align4)

         # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
         # a lot more stack due to the lack of sharing of stacklots:
@@ -117,7 +119,7 @@ else
         # default alignment which keep the stack *mis*aligned.
         # Furthermore an alignment to the register width reduces stack usage
         # and the number of alignment instructions.
-        KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=3)
+        KBUILD_CFLAGS += $(cc_stack_align8)

 	# Use -mskip-rax-setup if supported.
 	KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)

From 8e248849793b9b1172ee95aa4000377eeae1a062 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke
Date: Thu, 17 Aug 2017 11:20:47 -0700
Subject: [PATCH 1186/3220] UPSTREAM: x86/build: Use cc-option to validate
 stack alignment parameter

With the following commit:

  8f91869766c0 ("x86/build: Fix stack alignment for CLang")

cc-option is only used to determine the name of the stack alignment
option supported by the compiler, but not to verify that the actual
parameter